Compare commits


3 Commits

21aa086ecc  [Dynamo][Hierarchical Compile] Flatten tuple inputs for regions  (2025-08-15 23:45:18 -07:00)
ghstack-source-id: e99eea21f6c2e02a15b0027ae1cedffbf4003231
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158812

c5f23c5cbf  [Dynamo][Hierarchical Compile] Flatten tuple outputs in graph dedupe pass  (2025-08-15 18:45:06 -07:00)
ghstack-source-id: 9b509d723379eee9e38c7ad61ea0c5620ef0d844
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158811

4b146389a4  [Dynamo][Hierarchical Compile] Refactor for tuple flattening  (2025-08-14 16:14:27 -07:00)
ghstack-source-id: f168b556bb440ea93f5ed3001baa9b36acf929ff
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158810
396 changed files with 2308 additions and 13741 deletions
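
Note on the commits above: all three concern flattening tuple inputs and outputs when Dynamo's hierarchical compile deduplicates repeated graph regions. As a rough illustration of what that flattening means, here is a minimal Python sketch using torch.utils._pytree (a real PyTorch utility; the wrapper itself is hypothetical, not the PRs' actual API):

import torch.utils._pytree as pytree

def call_with_flat_io(fn, args):
    # Flatten nested tuple arguments, e.g. (a, (b, c)) -> [a, b, c] plus a spec.
    flat_args, in_spec = pytree.tree_flatten(args)
    # Rebuild the original structure before calling the wrapped function.
    out = fn(*pytree.tree_unflatten(flat_args, in_spec))
    # Flatten tuple outputs too, so callers see a flat list of leaves.
    flat_out, out_spec = pytree.tree_flatten(out)
    return flat_out, out_spec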

View File

@@ -92,7 +92,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
@@ -210,6 +209,8 @@ if __name__ == "__main__":
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# nvshmem is broken for aarch64 see https://github.com/pytorch/pytorch/issues/160425
build_vars += "USE_NVSHMEM=OFF "
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
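
Aside: the hunk above accumulates KEY=VALUE pairs into a single build_vars string. A small Python sketch of that pattern, with an assumed way of applying the pairs to a build subprocess (the real script's consumer may differ):

import os
import subprocess

build_vars = ""
enable_cuda = os.getenv("DESIRED_CUDA", "").startswith("cu")  # assumption for this sketch
if enable_cuda:
    build_vars += "MAX_JOBS=5 "
    build_vars += "USE_NVSHMEM=OFF "  # aarch64 workaround noted in the diff above
# Turn "K1=V1 K2=V2 " into environment overrides and run the build with them.
overrides = dict(kv.split("=", 1) for kv in build_vars.split())
subprocess.run(["python", "setup.py", "bdist_wheel"], env={**os.environ, **overrides}, check=True)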

View File

@@ -64,10 +64,6 @@ FROM cuda as cuda12.9
RUN bash ./install_cuda.sh 12.9
ENV DESIRED_CUDA=12.9
FROM cuda as cuda13.0
RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
@@ -80,10 +76,10 @@ ADD ./common/install_mnist.sh install_mnist.sh
RUN bash ./install_mnist.sh
FROM base as all_cuda
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
# Final step
FROM ${BASE_TARGET} as final

View File

@@ -168,7 +168,7 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
ONNX=yes
@@ -288,6 +288,7 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
@@ -298,6 +299,7 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific

View File

@@ -1,2 +0,0 @@
transformers==4.54.0
soxr==0.5.0

View File

@@ -0,0 +1 @@
v4.54.0

View File

@@ -1 +0,0 @@
v2.27.7-1

View File

@@ -1 +1 @@
0958dc9b2bb815e428f721f9da599dab0dc1c5d7
ae324eeac8e102a2b40370e341460f3791353398

View File

@@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.20
NVSHMEM_VERSION=3.3.9
function install_cuda {
version=$1
@@ -62,16 +62,14 @@ function install_nvshmem {
mkdir -p "${tmpdir}" && cd "${tmpdir}"
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
# This pattern is a lie as it is not consistent across versions; for 3.3.9 it was cuda_ver-arch-nvshmem-ver
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
suffix=".tar.xz"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# download, unpack, install
wget -q "${url}"
tar xf "${filename}${suffix}"
cp -a "${filename}/include/"* /usr/local/cuda/include/
cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
# cleanup
cd ..
@@ -128,6 +126,74 @@ function install_129 {
ldconfig
}
function prune_124 {
echo "Pruning CUDA 12.4"
#####################################################################################
# CUDA 12.4 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}
function prune_126 {
echo "Pruning CUDA 12.6"
#####################################################################################
# CUDA 12.6 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.6 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.6/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}
function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
@@ -146,39 +212,18 @@ function install_128 {
ldconfig
}
function install_130 {
CUDNN_VERSION=9.12.0.46
NVSHMEM_VERSION=3.3.20
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 13 $CUDNN_VERSION
install_nvshmem 13 $NVSHMEM_VERSION
CUDA_VERSION=13.0 bash install_nccl.sh
CUDA_VERSION=13.0 bash install_cusparselt.sh
ldconfig
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.4) install_124;
12.4) install_124; prune_124
;;
12.6|12.6.*) install_126;
12.6|12.6.*) install_126; prune_126
;;
12.8|12.8.*) install_128;
;;
12.9|12.9.*) install_129;
;;
13.0|13.0.*) install_130;
;;
*) echo "bad argument $1"; exit 1
;;
esac
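
Side note on install_nvshmem above: the "pattern is a lie" comment is the crux of the change, since 3.3.20 and 3.3.9 publish differently shaped archives. The two naming schemes side by side, with values taken from this diff (Python used only to render the strings):

arch_path, cuda_major = "sbsa", "12"
# 3.3.20 scheme (removed here): arch first, "-archive" suffix, .tar.xz,
# and the tarball unpacks into a directory named after the file.
name_3320 = f"libnvshmem-linux-{arch_path}-3.3.20_cuda{cuda_major}-archive.tar.xz"
# 3.3.9 scheme (restored here): cuda version first, .tar.gz,
# and the tarball unpacks into a plain libnvshmem/ directory.
name_339 = f"libnvshmem_cuda{cuda_major}-linux-{arch_path}-3.3.9.tar.gz"
print(name_3320)
print(name_339)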

View File

@@ -5,15 +5,7 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then

View File

@@ -5,7 +5,9 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
pip_install -r huggingface-requirements.txt
local version
commit=$(get_pinned_commit huggingface)
pip_install "git+https://github.com/huggingface/transformers@${commit}"
}
function install_timm() {
@@ -24,6 +26,9 @@ function install_torchbench() {
python install.py --continue_on_fail
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
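
The restored install_huggingface resolves its pin via get_pinned_commit, and the restored huggingface.txt earlier in this diff contains v4.54.0. A rough Python equivalent of what that helper is assumed to do (the real implementation lives in common_utils.sh and may differ):

from pathlib import Path

def get_pinned_commit(name: str) -> str:
    # Assumed behavior: read the pin file for this dependency and strip whitespace.
    return Path(f".ci/docker/ci_commit_pins/{name}.txt").read_text().strip()

# e.g. get_pinned_commit("huggingface") -> "v4.54.0", which then feeds:
#   pip install "git+https://github.com/huggingface/transformers@v4.54.0"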

View File

@@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
else
echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
exit 1

View File

@@ -19,7 +19,7 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
pip_install onnxruntime==1.22.1
pip_install onnxruntime==1.18.1
pip_install onnxscript==0.3.1
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers

View File

@@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@@ -56,10 +56,10 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@@ -96,11 +96,11 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

View File

@@ -62,7 +62,7 @@ class VllmBuildParameters:
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
output_dir: Path = env_path_field("OUTPUT_DIR", "shared")
# --- Build args ----------------------------------------------------------
target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")

View File

@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma/build_magma.sh
.PHONY: all
all: magma-cuda130
all: magma-cuda129
all: magma-cuda128
all: magma-cuda126
@@ -26,12 +25,6 @@ clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-cuda130
magma-cuda130: DESIRED_CUDA := 13.0
magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
magma-cuda130:
$(DOCKER_RUN)
.PHONY: magma-cuda129
magma-cuda129: DESIRED_CUDA := 12.9
magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120

View File

@@ -28,7 +28,6 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
patch < ${PACKAGE_FILES}/CMake.patch
patch < ${PACKAGE_FILES}/cmakelists.patch
patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
patch -p1 < ${PACKAGE_FILES}/cuda13.patch
patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
# The build.sh script expects to be executed from the sources root folder
@@ -38,7 +37,6 @@ popd
# Package recipe, license and tarball
# Folder and package name are backward compatible for the build workflow
cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch
cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch

View File

@@ -1,26 +0,0 @@
diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp
index 73fed1b20..e77519bfe 100644
--- a/interface_cuda/interface.cpp
+++ b/interface_cuda/interface.cpp
@@ -438,14 +438,20 @@ magma_print_environment()
cudaDeviceProp prop;
err = cudaGetDeviceProperties( &prop, dev );
check_error( err );
+ #ifdef MAGMA_HAVE_CUDA
+#if CUDA_VERSION < 13000
printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n",
dev,
prop.name,
prop.clockRate / 1000.,
+#else
+ printf( "%% device %d: %s, ??? MHz clock, %.1f MiB memory, capability %d.%d\n",
+ dev,
+ prop.name,
+#endif
prop.totalGlobalMem / (1024.*1024.),
prop.major,
prop.minor );
- #ifdef MAGMA_HAVE_CUDA
int arch = prop.major*100 + prop.minor*10;
if ( arch < MAGMA_CUDA_ARCH_MIN ) {
printf("\n"

View File

@@ -134,7 +134,7 @@ if [[ $CUDA_VERSION == 12* ]]; then
"/usr/local/cuda/lib64/libnvrtc-builtins.so"
"/usr/local/cuda/lib64/libcufile.so.0"
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
"/usr/local/cuda/lib64/libnvshmem_host.so.3"
"/usr/local/cuda/lib64/libnvshem_host.so.3"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
)

View File

@@ -174,15 +174,13 @@ checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
popd
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {

View File

@@ -1701,7 +1701,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench:$PYTHONPATH test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@@ -133,25 +133,6 @@ EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"

View File

@@ -1,80 +0,0 @@
# .github/workflows/build-external.yml
name: Build External packages
description: build external packages for PyTorch
inputs:
cuda-arch-list:
description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
type: string
required: true
default: ""
docker-image:
description: Base image to use
type: string
required: true
build-targets:
description: Build targets
type: string
required: true
torch-wheel-dir:
description: Directory of the built torch wheel
type: string
required: false
default: dist
output-dir:
description: Directory to store build artifact
default: external
type: string
required: false
outputs:
build_time:
description: "Total build time in seconds"
value: ${{ steps.build-external.outputs.build_time }}
output_dir:
description: "Directory where build artifact is stored"
value: ${{ steps.build-external.outputs.output_dir }}
runs:
using: composite
steps:
- name: Build external packages in sequence
id: build-external
env:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
shell: bash
run: |
set -euo pipefail
python3 --version
docker images
START_TIME=$(date +%s)
(
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do
OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
export OUTPUT_DIR
echo "Building external package: $target in directory $OUTPUT_DIR"
python3 -m cli.run build external "$target"
done
END_TIME=$(date +%s)
{
echo "build_time=$((END_TIME - START_TIME))"
if [ -d "$PARENT_OUTPUT_DIR" ]; then
echo "output_dir=$PARENT_OUTPUT_DIR"
fi
} >> "$GITHUB_OUTPUT"

View File

@@ -1 +1 @@
02351a683668dd65bc82343e55245e308eb97b4e
bdb88e1d66f272cad72156c90ac8428ca61a601c

View File

@@ -1 +1 @@
0fc8fa751a4321d6531467537ff77cf3c1c70260
0ca2393b47e72c4424a49aa3b32c7c5d0e378a72

View File

@@ -1 +1 @@
a1c6ee92c85e8b0955c20892ed68f032a6015c09
095faec1e7b6cc47220181e74ae9cde2605f9b00

View File

@@ -1,20 +0,0 @@
version: 2
updates:
# Update to the latest transformers version with dependabot
- package-ecosystem: "pip"
directory: "/.ci/docker/ci_commit_pins"
schedule:
interval: "daily"
target-branch: "main"
allow:
- dependency-name: "transformers"
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
labels:
- "dependencies"
- "open source"
- "python"
- "topic: not user facing"
- "module: ci"
- "module: inductor"

View File

@@ -27,7 +27,6 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull

View File

@@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -88,7 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -315,7 +315,7 @@ def generate_wheels_matrix(
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os not in ["linux", "linux-aarch64", "macos-arm64"] and (
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
):
continue
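
Note on PYTORCH_EXTRA_INSTALL_REQUIREMENTS above: each value packs many PEP 508 requirement strings into one, joined with " | ". A quick Python sketch of splitting such a value back into individual requirements (the separator is read off the diff; the consumer code is assumed):

extra = (
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
# Each element is a standalone requirement with its environment marker intact.
requirements = [r.strip() for r in extra.split("|")]
for req in requirements:
    print(req)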

View File

@@ -110,33 +110,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

View File

@@ -96,13 +96,6 @@ on:
required: false
type: string
default: ""
build-external-packages:
description: |
If set, builds the listed external packages and saves their wheels as artifacts;
use a comma-separated list of packages to build, e.g. 'vllm,transformers'.
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
@@ -363,26 +356,6 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Build external packages
id: build-external-packages
if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped'
uses: ./.github/actions/build-external-packages
with:
build-targets: ${{ inputs.build-external-packages }}
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
output-dir: external
- name: Move external packages to dist
if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
shell: bash
run: |
src="${{ steps.build-external-packages.outputs.output_dir }}"
if [ -d "$src" ]; then
mkdir -p "dist/$(dirname "$src")"
mv "$src" "dist/$(dirname "$src")/"
fi
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash

View File

@@ -136,7 +136,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
"$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
cuda_version: ["130", "129", "128", "126"]
cuda_version: ["129", "128", "126"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@@ -57,11 +57,6 @@ jobs:
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Copy docs requirements for inclusion
run: |
# Replace symlink with actual file
rm docs/requirements.txt || true
cp .ci/docker/requirements-docs.txt docs/requirements.txt
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that

View File

@@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -354,7 +354,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -465,7 +465,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -576,7 +576,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -687,7 +687,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -712,225 +712,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
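The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value packs all of these pins into a single YAML scalar, with " | " separating individual PEP 508 requirements; downstream tooling presumably splits the string back apart when wheel metadata is generated. A rough sketch of that split (parse_extra_requirements is a hypothetical helper, not PyTorch's actual API):

    # Split the '|'-separated requirements value into individual PEP 508 strings.
    import os

    def parse_extra_requirements(raw: str) -> list[str]:
        # Hypothetical helper: trim whitespace and drop empty fragments.
        return [part.strip() for part in raw.split("|") if part.strip()]

    raw = os.environ.get("PYTORCH_EXTRA_INSTALL_REQUIREMENTS", "")
    for spec in parse_extra_requirements(raw):
        print(spec)  # e.g. "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' ..."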

View File

@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -1508,7 +1508,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@@ -1968,7 +1968,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@@ -2034,7 +2034,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@@ -2100,7 +2100,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@@ -2560,7 +2560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@@ -2626,7 +2626,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@@ -2692,7 +2692,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@@ -3152,7 +3152,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@@ -3218,7 +3218,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@@ -3284,7 +3284,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@ -3744,7 +3744,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@ -3810,7 +3810,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@ -3876,7 +3876,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@ -4336,7 +4336,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@ -4402,7 +4402,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@ -4468,7 +4468,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
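The `|`-separated entries above are PEP 508 requirement specifiers with environment markers. A rough sketch of how a consumer turns them into per-wheel dependencies (the consumer and its mechanics are assumptions, not shown in this diff):
import os
# Assumed consumer: the build splits the "|"-separated PEP 508 specifiers
# into the wheel's dependency list. Each entry keeps its environment marker,
# so the nvidia-* packages are only pulled in on x86_64 Linux.
reqs = os.environ.get("PYTORCH_EXTRA_INSTALL_REQUIREMENTS", "")
install_requires = [spec.strip() for spec in reqs.split("|") if spec.strip()]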

View File

@ -115,33 +115,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -260,33 +239,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -405,33 +363,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -550,33 +487,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -695,33 +611,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -840,33 +735,12 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -900,293 +774,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14"
build_name: wheel-py3_14-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14t"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14t-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14t-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14t-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14t"
build_name: wheel-py3_14t-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -4,12 +4,9 @@ on:
pull_request:
paths:
- .github/workflows/h100-cutlass-backend.yml
- torch/_inductor/codegen/cuda/**
- test/inductor/test_cutlass_backend.py
- test/inductor/test_cutlass_evt.py
workflow_dispatch:
schedule:
- cron: 22 9,21 * * * # every 12 hours
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
push:
tags:
- ciflow/h100-cutlass-backend/*

View File

@ -93,7 +93,7 @@ jobs:
script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -111,9 +111,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:
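To approximate the new job split locally (flags copied from the hunks above; assumes a lintrunner setup matching the repo):
# type-checking job: MYPYSTRICT is retired, only MYPY remains
lintrunner --take MYPY --all-files
# "everything else" job: clang linters and mypy excluded
lintrunner --skip CLANGTIDY,CLANGFORMAT,MYPY --all-files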

View File

@ -156,13 +156,13 @@ jobs:
sync-tag: asan-test
secrets: inherit
linux-jammy-py3_10-clang12-onnx-build:
name: linux-jammy-py3.10-clang12-onnx
linux-jammy-py3_9-clang12-onnx-build:
name: linux-jammy-py3.9-clang12-onnx
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12-onnx
build-environment: linux-jammy-py3.9-clang12-onnx
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
test-matrix: |
{ include: [
@ -171,16 +171,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-onnx-test:
name: linux-jammy-py3.10-clang12-onnx
linux-jammy-py3_9-clang12-onnx-test:
name: linux-jammy-py3.9-clang12-onnx
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-onnx-build
- linux-jammy-py3_9-clang12-onnx-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12-onnx
docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12-onnx
docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_9-clang12-build:

View File

@ -1,45 +0,0 @@
name: vllm-test
on:
push:
tags:
- ciflow/vllm/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
torch-build-sm89:
name: sm89-vllm-test
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-additional-packages: "vision audio torchao"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.9'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit


.gitignore vendored
View File

@ -32,7 +32,6 @@ coverage.xml
aten/build/
aten/src/ATen/Config.h
aten/src/ATen/cuda/CUDAConfig.h
aten/src/ATen/hip/HIPConfig.h
benchmarks/.data
caffe2/cpp_test/
dist/

View File

@ -121,7 +121,7 @@ inline int64_t legacy_cat_wrap_dim_symint(
const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
for (auto& sizes : tensor_sizes) {
if (sizes.size() == 1) {
if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) {
continue;
}
}
@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim(
const MaterializedITensorListRef& tensors) {
for (const Tensor& tensor : tensors) {
if (tensor.dim() == 1) {
if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) {
continue;
}
}
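The helper touched here implements cat's legacy special case: a 1-D tensor with zero elements is skipped when resolving the concatenation dim; the two guard macros differ only in how that size-0 test is decided under symbolic shapes. The eager behavior being preserved, as a sketch:
import torch
a = torch.randn(2, 3)
e = torch.empty(0)              # 1-D, zero elements: skipped by legacy_cat_wrap_dim
out = torch.cat([a, e], dim=1)  # works despite the otherwise-incompatible shapes
assert out.shape == (2, 3)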

View File

@ -1847,12 +1847,8 @@ int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fa
switch (scaling_type) {
case ScalingType::BlockWise1x32:
TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
#ifdef USE_ROCM
return HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#else
#if CUDA_VERSION >= 12080
return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#endif // USE_ROCM
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
@ -1950,26 +1946,12 @@ void scaled_gemm(
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
}
else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
if (result_scale_ptr != nullptr) {
@ -2008,16 +1990,15 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
// For other data types, use the get_scale_mode function based on scaling type
// The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
// Note that AMD/ROCm follows OCP Spec 1.0, which is different from NVIDIA's implementation. See get_scale_mode() for details.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
// The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif
CuBlasLtMatmulPreference preference;
auto ltworkspace = CublasLtWorkspace();

View File

@ -90,7 +90,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
case c10::ScalarType::Float8_e5m2fnuz:
return HIP_R_8F_E5M2_FNUZ;
#endif
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
case c10::ScalarType::Float4_e2m1fn_x2:
return CUDA_R_4F_E2M1;
#endif

View File

@ -4,9 +4,6 @@
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#endif
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
@ -15,7 +12,6 @@
namespace at::cuda {
static std::vector<int8_t> p2pAccessEnabled_;
static std::vector<int8_t> fabricAccessEnabled_;
static int64_t num_devices_ = -1;
namespace detail {
@ -33,23 +29,20 @@ void init_p2p_access_cache(int64_t num_devices) {
for (const auto i : c10::irange(num_devices)) {
p2pAccessEnabled_[i * num_devices + i] = 1;
}
fabricAccessEnabled_.clear();
fabricAccessEnabled_.resize(num_devices, -1);
}
} // namespace detail
} // namespace detail
bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device");
TORCH_CHECK(
dev_to_access >= 0 || dev_to_access < num_devices_,
dev_to_access,
" is not a device");
TORCH_CHECK(dev >= 0 || dev < num_devices_,
dev, " is not a device");
TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_,
dev_to_access, " is not a device");
TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized");
auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];
auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];
if (cache != -1) {
return cache;
@ -65,118 +58,4 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
return cache;
}
namespace {
#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED
nvmlDevice_t get_nvml_device(c10::DeviceIndex dev) {
static bool nvml_init [[maybe_unused]] = []() {
TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_());
return true;
}();
auto prop = at::cuda::getDeviceProperties(dev);
char pci_id // NOLINT(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
snprintf(
pci_id,
sizeof(pci_id),
NVML_DEVICE_PCI_BUS_ID_FMT,
prop->pciDomainID,
prop->pciBusID,
prop->pciDeviceID);
nvmlDevice_t nvml_device = nullptr;
TORCH_INTERNAL_ASSERT(
NVML_SUCCESS ==
DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_(
pci_id, &nvml_device));
return nvml_device;
}
bool isFabricSupported() {
// 1. try allocating memory
CUmemGenericAllocationHandle handle = 0;
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
size_t granularity{};
const auto driver_api = c10::cuda::DriverAPI::get();
C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_(
&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
auto status = driver_api->cuMemCreate_(&handle, granularity, &prop, 0);
if (status != CUDA_SUCCESS) {
LOG(INFO)
<< "status " << status
<< " Could not allocate memory with FABRIC handle, falling back to fd handle exchange\n";
return false;
}
// 2. check export
CUmemFabricHandle sharedHandle;
status = driver_api->cuMemExportToShareableHandle_(
&sharedHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
if (status != CUDA_SUCCESS) {
LOG(INFO)
<< "status " << status
<< " Could not export FABRIC handle, falling back to fd handle exchange\n";
driver_api->cuMemRelease_(handle);
return false;
}
// 3. check import
CUmemGenericAllocationHandle import_handle = 0;
status = driver_api->cuMemImportFromShareableHandle_(
&import_handle, &sharedHandle, CU_MEM_HANDLE_TYPE_FABRIC);
if (status != CUDA_SUCCESS) {
LOG(INFO)
<< "status " << status
<< " Could not import FABRIC handle, falling back to fd handle exchange\n";
driver_api->cuMemRelease_(handle);
return false;
}
driver_api->cuMemRelease_(import_handle);
driver_api->cuMemRelease_(handle);
LOG(INFO) << "using fabric to exchange memory handles\n";
return true;
}
#endif
} // namespace
bool get_fabric_access(c10::DeviceIndex dev) {
#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED
at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device");
auto& cache = fabricAccessEnabled_[dev];
if (cache != -1) {
return cache;
}
auto nvml_device = get_nvml_device(dev);
if (nvml_device != nullptr) {
nvmlGpuFabricInfoV_t fabricInfo;
fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
fabricInfo.version = nvmlGpuFabricInfo_v2;
if (DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_ == nullptr) {
return false;
}
TORCH_CHECK(
NVML_SUCCESS ==
DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_(
nvml_device, &fabricInfo));
auto state = fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
if (state) {
// now perform the full cycle of allocating - exporting - importing memory
state = isFabricSupported();
}
cache = state ? 1 : 0;
return cache;
} else {
return false;
}
#else
return false;
#endif
}
} // namespace at::cuda
} // namespace at::cuda::detail

View File

@ -8,6 +8,5 @@ void init_p2p_access_cache(int64_t num_devices);
}
TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev);
TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device);
} // namespace at::cuda
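A hedged usage sketch for the declaration above (the header path and a two-GPU setup are assumptions, not taken from this diff):
#include <ATen/cuda/PeerToPeerAccess.h>  // path assumed
bool can_copy_directly() {
  // Whether device 0 may map device 1's memory peer-to-peer; the result is
  // cached per (src, dst) pair after the first query, per the .cpp diff above.
  return at::cuda::get_p2p_access(/*source_dev=*/0, /*dest_dev=*/1);
}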

View File

@ -85,15 +85,6 @@ constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
return static_cast<hipDataType>(500);
}
template <>
constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
#if ROCM_VERSION >= 70000
return HIP_R_4F_E2M1;
#else
return static_cast<hipDataType>(33);
#endif
}
template <typename T>
int GetBatchFromParams(const GemmParams<T>* params) {
return 1;

View File

@ -411,8 +411,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias,
at::Tensor& output) {
const std::optional<Tensor>& bias) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
@ -437,11 +436,9 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
const int64_t N = packed_weight_fp16.numCols();
std::vector<int64_t> output_size = input.sizes().vec();
output_size.back() = N;
// Resize output Tensor
output.resize_(output_size);
Tensor output = at::empty(output_size, input.options().dtype(at::kFloat));
// Call the fp16 gemm interface
fbgemm::cblas_gemm_compute(
@ -463,14 +460,6 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
return output;
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias) {
at::Tensor output = at::empty({0}, input.options().dtype(at::kFloat));
return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output);
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
@ -479,15 +468,6 @@ Tensor fbgemm_linear_fp16_weight(
input, packed_weight, bias);
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
const Tensor& bias,
at::Tensor& output) {
return at::native::fbgemm_linear_fp16_weight_fp32_activation(
input, packed_weight, bias, output);
}
#else // USE_FBGEMM
Tensor fbgemm_linear_int8_weight_fp32_activation(
@ -574,21 +554,6 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
// We make a strong guarantee that models using these operators will have the
// same numerics across different machines. Therefore, we do not provide a
// fallback path and rather fail loudly if we cannot run FBGEMM.
TORCH_CHECK(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
@ -603,21 +568,6 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
const Tensor& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated "
"and will be removed in a future PyTorch release.")
// We make a strong guarantee that models using these operators will have the
// same numerics across different machines. Therefore, we do not provide a
// fallback path and rather fail loudly if we cannot run FBGEMM.
TORCH_CHECK(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,

View File

@ -1283,35 +1283,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
if (use_fast_accum) {
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype.");
}
#ifdef USE_ROCM
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
}
if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) {
TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e5m2 is only supported for ROCm 6.5 and above");
}
if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) {
TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e4m3fn is only supported for ROCm 6.5 and above");
}
#endif
if (bias) {
TORCH_CHECK(out.scalar_type() != kFloat,
"Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 ||
bias->scalar_type() == ScalarType::Half,
"Bias must be BFloat16 or Half, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat &&
out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half ||
bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
"Bias must be either Half or BFloat16, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
@ -1373,22 +1353,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
}
else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
#if ROCM_VERSION >= 70000
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
"Block-wise scaling only supports BFloat16 or Half output types");
#else
TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
#endif
}
#endif
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b);
@ -1466,14 +1430,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
params.k = args.k;
params.a = args.mata->data_ptr();
params.a_scale_ptr = args.scale_mata_ptr;
params.a_scale_dtype = args.scale_mata_dtype.value();
params.lda = args.lda;
params.a_dtype = args.mata->scalar_type();
params.a_scale_dtype = args.scale_mata_dtype.value();
params.a_scaling_type = args.scaling_mata_type.value();
params.b = args.matb->data_ptr();
params.b_scale_ptr = args.scale_matb_ptr;
params.b_scale_dtype = args.scale_matb_dtype.value();
params.ldb = args.ldb;
params.b_dtype = args.matb->scalar_type();
params.b_scale_dtype = args.scale_matb_dtype.value();

View File

@ -148,56 +148,6 @@ namespace fe = cudnn_frontend;
#define MAX_MHA_DIM 4
// Whether we will use ragged offsets in the dense (non-nested) path
// to avoid recompilation
bool use_ragged_in_dense(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const Tensor& o,
bool has_bias) {
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_AVOID_RECOMPILE") == true;
if (!flag) {
return flag;
}
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 is currently experimental. "
"Please report any issues to https://github.com/pytorch/pytorch/issues.");
if (has_bias) {
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works without bias."
"Consider using the is_causal hint instead of bias for causal masking."
"Falling back to regular dense case, which may trigger excessive recompilation.");
return !has_bias;
}
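// BSHD means the underlying memory layout is (batch, seq, heads, dim) while
// the logical shape stays (B, H, S, D); e.g., a tensor created as
// torch.randn((B, S, H, D)).transpose(1, 2) satisfies this check.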
bool all_bshd = q.dim() == 4 && q.transpose(1, 2).is_contiguous() &&
k.dim() == 4 && k.transpose(1, 2).is_contiguous() && v.dim() == 4 &&
v.transpose(1, 2).is_contiguous() && o.dim() == 4 &&
o.transpose(1, 2).is_contiguous();
if (!all_bshd) {
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
"Falling back to regualr dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}
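// Round `dim` up to the next power of two (e.g., 5 -> 8, 8 -> 8) by smearing
// the highest set bit into every lower bit and adding one. This lets many
// distinct sequence lengths share a single cached cuDNN graph.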
int roundup_power2(int dim) {
if (!dim) {
return 1;
}
dim--;
dim |= dim >> 1;
dim |= dim >> 2;
dim |= dim >> 4;
dim |= dim >> 8;
dim |= dim >> 16;
dim++;
return dim;
}
struct MHAParams {
c10::DeviceIndex device_id;
fe::DataType_t dataType;
@ -221,7 +171,6 @@ struct MHAParams {
// might be redundant if we take 0 dim/stride
// as signaling no-bias
bool has_attn_bias;
bool use_ragged;
};
void setMHAParams(
@ -279,20 +228,6 @@ void setMHAParams(
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
bool use_ragged = use_ragged_in_dense(q, k, v, q, params.has_attn_bias);
params.use_ragged = use_ragged;
if (use_ragged) {
// ignore B - stride in BSHD (THD) avoid-recompile
params.q_stride[0] = INT_MAX;
params.k_stride[0] = INT_MAX;
params.v_stride[0] = INT_MAX;
// fix seqlen to rounded value
params.s_q = roundup_power2(params.s_q);
params.s_kv = roundup_power2(params.s_kv);
params.q_dim[2] = roundup_power2(params.q_dim[2]);
params.k_dim[2] = roundup_power2(params.k_dim[2]);
params.v_dim[2] = roundup_power2(params.v_dim[2]);
}
// uninit is OK as the struct is memset 0'd
if (params.has_attn_bias) {
std::copy(
@ -342,29 +277,15 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
template <typename T, typename KeyType>
struct MHAGraphCache {
std::unordered_map<KeyType, T, ParamsWrapperHash<KeyType>> engine_cache;
int count = 0;
int hits = 0;
// no mutexes here as caches are now thread local for v8, can also return a
// pointer to the Execution Plan if we know it will not be invalidated by
// another thread
T* find(const KeyType& key) {
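// With TORCH_CUDNN_SDPA_CACHE_DEBUG=1, report the running cache hit rate on
// each lookup; a low hit rate suggests shapes/strides keep changing and
// graphs are being rebuilt.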
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_CACHE_DEBUG") == true;
if (flag && count) {
TORCH_WARN(
"SDPA Cache Called ",
count,
" times. Hit rate: ",
100 * hits / count,
"%");
}
count++;
auto it = engine_cache.find(key);
if (it == engine_cache.end()) {
return nullptr;
}
hits++;
return &(it->second);
}
@ -481,25 +402,6 @@ auto build_graph(
.set_is_inference(return_softmaxstats == false)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto SEQ_LEN_Q_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_Q)
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_KV)
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
scaled_dot_product_flash_attention_options.set_seq_len_q(SEQ_LEN_Q_)
.set_seq_len_kv(SEQ_LEN_KV_)
.set_padding_mask(true);
}
if (dropout_probability != 0.0f) {
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEED)
@ -523,11 +425,23 @@ auto build_graph(
dropout_probability, seed, offset);
}
auto Q_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(Q).set_name("Q"));
fe::graph::Tensor_attributes()
.set_uid(Q)
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
auto K_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
fe::graph::Tensor_attributes()
.set_uid(K)
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
fe::graph::Tensor_attributes()
.set_uid(V)
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
@ -541,90 +455,12 @@ auto build_graph(
auto [O_, Stats] =
mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options);
O_->set_uid(O).set_output(true);
O_->set_uid(O);
O_->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
if (Stats) {
Stats->set_uid(LSE)
.set_output(true)
.set_data_type(fe::DataType_t::FLOAT)
.set_stride(softmaxstats.strides().vec());
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_Q_OFF)
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_K_OFF)
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_V_OFF)
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_O_OFF)
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_LSE_OFF)
.set_name("cum_seq_stats")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
O_->set_ragged_offset(RAG_O_OFF_);
Q_->set_ragged_offset(RAG_Q_OFF_);
K_->set_ragged_offset(RAG_K_OFF_);
V_->set_ragged_offset(RAG_V_OFF_);
auto qsizevec = q.sizes().vec();
auto ksizevec = k.sizes().vec();
auto vsizevec = v.sizes().vec();
auto osizevec = o.sizes().vec();
qsizevec[2] = roundup_power2(qsizevec[2]);
ksizevec[2] = roundup_power2(ksizevec[2]);
vsizevec[2] = roundup_power2(vsizevec[2]);
osizevec[2] = roundup_power2(osizevec[2]);
// we checked for BSHD contig., set fake strides as cuDNN will complain
// if e.g., a ragged dim is smaller than a non-ragged one:
// consider HBSD tensor where H is 1
Q_->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
K_->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
V_->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
O_->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
if (Stats) {
Stats->set_ragged_offset(RAG_STATS_OFF_);
auto statssizevec = softmaxstats.sizes().vec();
statssizevec[2] = roundup_power2(statssizevec[2]);
Stats->set_dim(statssizevec);
}
} else {
Q_->set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec()));
K_->set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec()));
V_->set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec()));
O_->set_dim(o.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(o.sizes(), o.strides().vec()));
if (Stats) {
Stats->set_dim(softmaxstats.sizes().vec());
}
Stats->set_uid(LSE);
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
}
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
@ -730,7 +566,7 @@ auto build_graph_nestedtensor(
auto q_strides = q.strides();
auto k_strides = k.strides();
auto v_strides = v.strides();
// NB: cuDNN API shape is transposed: we pass it nominally as HTD
// NB: cuDNN API shape is transposed
constexpr int strideidx0 = 1;
constexpr int strideidx1 = 0;
constexpr int strideidx2 = 2;
@ -888,32 +724,21 @@ auto build_graph_backward(
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto SEQ_LEN_Q_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_Q)
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_KV)
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
sdpa_backward_options.set_seq_len_q(SEQ_LEN_Q_)
.set_seq_len_kv(SEQ_LEN_KV_)
.set_padding_mask(true);
}
auto Q_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(Q).set_name("Q"));
auto K_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(Q)
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(q.strides().vec()));
auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(K)
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(k.strides().vec()));
auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(V)
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(v.strides().vec()));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
@ -945,108 +770,31 @@ auto build_graph_backward(
: fe::DataType_t::INT64));
sdpa_backward_options.set_dropout(dropout_probability, seed, offset);
}
auto O_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(O).set_name("O"));
auto O_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(O)
.set_name("O")
.set_dim(o.sizes().vec())
.set_stride(o.strides().vec()));
auto Stats = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(LSE)
.set_name("Stats")
.set_dim(softmaxstats.sizes().vec())
.set_stride(softmaxstats.strides().vec())
.set_data_type(fe::DataType_t::FLOAT));
auto Do = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(DO).set_name("DO"));
auto Do = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(DO)
.set_name("DO")
.set_dim(dO.sizes().vec())
.set_stride(dO.strides().vec()));
auto [Dq, Dk, Dv] = mha_graph->sdpa_backward(
Q_, K_, V_, O_, Do, Stats, sdpa_backward_options);
Dq->set_uid(DQ).set_output(true);
Dk->set_uid(DK).set_output(true);
Dv->set_uid(DV).set_output(true);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_Q_OFF)
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_K_OFF)
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_V_OFF)
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_O_OFF)
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_LSE_OFF)
.set_name("cum_seq_stats")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
O_->set_ragged_offset(RAG_O_OFF_);
Q_->set_ragged_offset(RAG_Q_OFF_);
K_->set_ragged_offset(RAG_K_OFF_);
V_->set_ragged_offset(RAG_V_OFF_);
Dq->set_ragged_offset(RAG_Q_OFF_);
Dk->set_ragged_offset(RAG_K_OFF_);
Dv->set_ragged_offset(RAG_V_OFF_);
Do->set_ragged_offset(RAG_O_OFF_);
auto qsizevec = q.sizes().vec();
auto ksizevec = k.sizes().vec();
auto vsizevec = v.sizes().vec();
auto osizevec = o.sizes().vec();
qsizevec[2] = roundup_power2(qsizevec[2]);
ksizevec[2] = roundup_power2(ksizevec[2]);
vsizevec[2] = roundup_power2(vsizevec[2]);
osizevec[2] = roundup_power2(osizevec[2]);
// see corresponding section in the forward about the hardcoding
// of strides here
Q_->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
K_->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
V_->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
O_->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
// dims/strides should be identical to their non-gradient (non-"D") counterparts
Dq->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
Dk->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
Dv->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
Do->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
Stats->set_ragged_offset(RAG_STATS_OFF_);
auto statssizevec = softmaxstats.sizes().vec();
statssizevec[2] = roundup_power2(statssizevec[2]);
Stats->set_dim(statssizevec);
} else {
O_->set_dim(o.sizes().vec()).set_stride(o.strides().vec());
Q_->set_dim(q.sizes().vec()).set_stride(q.strides().vec());
K_->set_dim(k.sizes().vec()).set_stride(k.strides().vec());
V_->set_dim(v.sizes().vec()).set_stride(v.strides().vec());
Dq->set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
Dk->set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
Dv->set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
Do->set_dim(dO.sizes().vec()).set_stride(dO.strides().vec());
Stats->set_dim(softmaxstats.sizes().vec());
}
Dq->set_uid(DQ);
Dq->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
Dk->set_uid(DK);
Dk->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
Dv->set_uid(DV);
Dv->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
@ -1318,47 +1066,6 @@ void run_cudnn_SDP_fprop(
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset) {
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
Tensor seqlen_q, seqlen_kv;
Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse;
if (!o.defined()) {
// q is passed to us in BHSD dim order
alloc_with_matching_layout(q, o, {b, h, s_q, d_v});
}
bool use_ragged = use_ragged_in_dense(q, k, v, o, attn_bias.has_value());
if (return_softmaxstats && !softmaxstats.defined()) {
// TODO(eqy): investigate why cuDNN doesn't like BSH layout softmaxstats
if (!use_ragged) {
softmaxstats = at::empty({b, h, s_q, 1}, q.options().dtype(kFloat));
} else {
softmaxstats =
at::empty({b, s_q, h, 1}, q.options().dtype(kFloat)).transpose(1, 2);
}
}
if (use_ragged) {
seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt));
seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt));
auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_q);
auto cum_seqlen_kv =
at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_kv);
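// e.g., for b = 2, s_q = 4: cum_seqlen_q = [0, 4, 8], the cumulative
// sequence lengths in the varlen convention; the mul() below scales these
// row counts by the sequence stride to get per-batch element offsets.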
rag_off_q = cum_seqlen_q.mul(q.stride(-2));
rag_off_k = cum_seqlen_kv.mul(k.stride(-2));
rag_off_v = cum_seqlen_kv.mul(v.stride(-2));
rag_off_o = cum_seqlen_q.mul(o.stride(-2));
if (return_softmaxstats) {
rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2));
}
}
const auto dprops = at::cuda::getCurrentDeviceProperties();
auto _dropoutseed = dropoutseed;
auto _dropoutoffset = dropoutoffset;
@ -1369,10 +1076,21 @@ void run_cudnn_SDP_fprop(
}
cudnnHandle_t handle = getCudnnHandle();
if (!o.defined()) {
// q is passed to us in BHSD dim order
alloc_with_matching_layout(q, o, {b, h, s_q, d_v});
}
if (return_softmaxstats && !softmaxstats.defined()) {
// TODO(eqy): verify that this is correct
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
}
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
// NB: The key initialization will round up sequence length, stride data etc.
// if use_ragged_in_dense is enabled (to allow multiple sequence lengths to
// reuse the same cached value/graph)
auto key = MHACacheKeyWrapper(
b,
h,
@ -1429,17 +1147,6 @@ void run_cudnn_SDP_fprop(
variant_pack[SEED] = _dropoutseed.data_ptr();
variant_pack[OFFSET] = _dropoutoffset.data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
if (return_softmaxstats) {
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
}
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
@ -1571,9 +1278,6 @@ void run_cudnn_SDP_bprop(
!softmaxstats.numel()) {
return;
}
Tensor seqlen_q, seqlen_kv;
Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse;
auto dprops = at::cuda::getCurrentDeviceProperties();
auto _dropoutseed = dropoutseed;
auto _dropoutoffset = dropoutoffset;
@ -1600,28 +1304,10 @@ void run_cudnn_SDP_bprop(
"with matching strides...");
#else
const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1];
if (innermost_dO_stride != 1 ||
use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
if (innermost_dO_stride != 1) {
permute_to_matching_layout(o, dO_);
}
#endif
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt));
seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt));
auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_q);
auto cum_seqlen_kv =
at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_kv);
rag_off_q = cum_seqlen_q.mul(q.stride(-2));
rag_off_k = cum_seqlen_kv.mul(k.stride(-2));
rag_off_v = cum_seqlen_kv.mul(v.stride(-2));
rag_off_o = cum_seqlen_q.mul(o.stride(-2));
rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2));
}
cudnnHandle_t handle = getCudnnHandle();
auto key = MHACacheKeyWrapper(
b,
@ -1686,16 +1372,6 @@ void run_cudnn_SDP_bprop(
if (attn_bias.has_value()) {
variant_pack[BIAS] = attn_bias.value().data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);

View File

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class GridSamplerInterpolation { Bilinear, Nearest, Bicubic };
enum class GridSamplerPadding { Zeros, Border, Reflection };
#else
#include <ATen/native/GridSamplerUtils.h>
using at::native::GridSamplerInterpolation;
using at::native::GridSamplerPadding;
#endif
template <unsigned N = 5, typename idx_type_t = int32_t>
struct GridSamplerParams {
int32_t sampler_dims;
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
::c10::metal::array<idx_type_t, N> grid_sizes;
::c10::metal::array<idx_type_t, N> grid_strides;
GridSamplerInterpolation interpolation_mode;
GridSamplerPadding padding_mode;
bool align_corners;
};

View File

@ -1,324 +0,0 @@
#include <ATen/native/mps/kernels/GridSampler.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
struct GridSamplerOffsets {
int32_t output;
int32_t input;
int32_t grid;
GridSamplerOffsets() : output(0), input(0), grid(0) {}
};
// Find offsets into the tensors that this thread will operate on,
// based on the thread ID.
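// e.g., with output sizes (N, C, Hout, Wout) = (2, 3, 4, 5), tid = 37
// decomposes innermost-first into w = 37 % 5 = 2, h = 7 % 4 = 3,
// c = 1 % 3 = 1, n = 0 -- a mixed-radix unflatten of the linear index.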
static GridSamplerOffsets find_grid_sampler_offsets(
constant int32_t* output_sizes,
constant int32_t* output_strides,
constant int32_t* input_strides,
constant int32_t* grid_strides,
int32_t sampler_dims,
uint tid) {
auto dims = sampler_dims + 2;
auto output_idx = static_cast<int32_t>(tid);
GridSamplerOffsets offsets;
for (auto dim = dims - 1; dim >= 0; dim--) {
auto dim_idx = output_idx % output_sizes[dim];
output_idx = output_idx / output_sizes[dim];
// Select the output element that this thread will calculate.
// output shape:
// 2 sampler dims: (N, C, Hout, Wout)
// 3 sampler dims: (N, C, Dout, Hout, Wout)
offsets.output += output_strides[dim] * dim_idx;
// Select the batch and channel for the input.
// input shape:
// 2 sampler dims: (N, C, Hin, Win)
// 3 sampler dims: (N, C, Din, Hin, Win)
if (dim < 2) {
offsets.input += input_strides[dim] * dim_idx;
}
// Select the grid coordinates for the output element.
// grid shape:
// 2 sampler dims: (N, Hout, Wout, 2)
// 3 sampler dims: (N, Dout, Hout, Wout, 3)
if (dim == 0) {
offsets.grid += grid_strides[dim] * dim_idx;
} else if (dim >= 2) {
offsets.grid += grid_strides[dim - 1] * dim_idx;
}
}
return offsets;
}
// Mod function which gives positive output when `a` is negative
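// e.g., mod(-1, 5) == 4, whereas the builtin remainder (-1 % 5) yields -1.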
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
}
// Sentinel index value to indicate zero padding
constant int32_t IDX_ZERO = -1;
// Apply padding to an index into the input
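// e.g., with input_size = 4: Zeros maps -1 and 4 to the IDX_ZERO sentinel,
// Border clamps -1 -> 0 and 4 -> 3, and Reflection (align_corners = false)
// folds them back into range, -1 -> 0 and 4 -> 3.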
static int32_t pad_input_index(
int32_t idx,
int32_t input_size,
GridSamplerPadding padding_mode,
bool align_corners) {
int32_t idx_padded = idx;
if (padding_mode == GridSamplerPadding::Zeros) {
idx_padded = (idx < 0) ? IDX_ZERO : idx_padded;
idx_padded = (idx >= input_size) ? IDX_ZERO : idx_padded;
} else if (padding_mode == GridSamplerPadding::Border) {
idx_padded = (idx < 0) ? 0 : idx_padded;
idx_padded = (idx >= input_size) ? input_size - 1 : idx_padded;
} else if (padding_mode == GridSamplerPadding::Reflection) {
auto scale_length = align_corners ? (input_size - 1) : input_size;
auto idx_mod = mod(idx, scale_length);
auto idx_mod_reverse = (input_size - 1) - idx_mod;
bool is_reverse = (abs(idx - idx_mod) / scale_length) % 2 == 1;
idx_padded = is_reverse ? idx_mod_reverse : idx_mod;
}
return idx_padded;
}
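// Read one input element given per-dimension indices; if any index is the
// IDX_ZERO sentinel (zero padding), return 0 without dereferencing memory.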
template <int32_t dims, typename T>
T get_tensor_val(
constant T* input,
constant int32_t* input_strides,
int32_t indices[dims]) {
bool found_idx_zero = false;
int32_t offset = 0;
for (auto dim = 0; dim < dims; dim++) {
auto idx = indices[dim];
found_idx_zero = found_idx_zero || (idx == IDX_ZERO);
offset += (found_idx_zero ? 0 : idx) * input_strides[dim];
}
return found_idx_zero ? 0 : input[offset];
}
// This function performs 3D linear interpolation for one value. One way to
// think of how this works is to imagine a unit cube where each corner of the
// cube has one scalar value associated with it. Inside the cube, the values
// change linearly along each axis, so each axial gradient is constant. The values associated with each
// corner are given by the `input`, indexed at all eight different combinations
// of the `left_indices` and `right_indices`. Given a 3D coordinate anywhere
// within the cube, specified by the `scales` argument, we must calculate the
// value associated with that position.
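// In 1D this reduces to the familiar lerp: value = (1 - s) * left + s * right;
// here that weighting is applied independently along all three axes, producing
// the eight corner terms summed below.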
template <typename T>
T interpolate_linear_3d(
constant T* input,
constant int32_t* input_strides,
int32_t left_indices[3],
int32_t right_indices[3],
opmath_t<T> scales[3]) {
int32_t a_idx[3] = {left_indices[0], left_indices[1], left_indices[2]};
int32_t b_idx[3] = {left_indices[0], left_indices[1], right_indices[2]};
int32_t c_idx[3] = {left_indices[0], right_indices[1], left_indices[2]};
int32_t d_idx[3] = {left_indices[0], right_indices[1], right_indices[2]};
int32_t e_idx[3] = {right_indices[0], left_indices[1], left_indices[2]};
int32_t f_idx[3] = {right_indices[0], left_indices[1], right_indices[2]};
int32_t g_idx[3] = {right_indices[0], right_indices[1], left_indices[2]};
int32_t h_idx[3] = {right_indices[0], right_indices[1], right_indices[2]};
auto a =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, a_idx));
auto b =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, b_idx));
auto c =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, c_idx));
auto d =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, d_idx));
auto e =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, e_idx));
auto f =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, f_idx));
auto g =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, g_idx));
auto h =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, h_idx));
auto scale0_right = scales[0];
auto scale1_right = scales[1];
auto scale2_right = scales[2];
auto scale0_left = 1 - scale0_right;
auto scale1_left = 1 - scale1_right;
auto scale2_left = 1 - scale2_right;
return static_cast<T>(
scale0_left * scale1_left * scale2_left * a +
scale0_left * scale1_left * scale2_right * b +
scale0_left * scale1_right * scale2_left * c +
scale0_left * scale1_right * scale2_right * d +
scale0_right * scale1_left * scale2_left * e +
scale0_right * scale1_left * scale2_right * f +
scale0_right * scale1_right * scale2_left * g +
scale0_right * scale1_right * scale2_right * h);
}
// Calculates a single output element.
// `input` shape:
// 2 sampler dims: (Hin, Win)
// 3 sampler dims: (Din, Hin, Win)
// `coords` values:
// 2 sampler dims: (Wcoord, Hcoord)
// 3 sampler dims: (Wcoord, Hcoord, Dcoord)
template <typename T>
void grid_sampler_single_element(
device T* output,
constant T* input,
constant T* coords,
int32_t dims,
constant int32_t* input_sizes,
constant int32_t* input_strides,
GridSamplerInterpolation interpolation_mode,
GridSamplerPadding padding_mode,
bool align_corners) {
int32_t left_indices[3];
int32_t right_indices[3];
opmath_t<T> scales[3];
// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space
// - `-1` refers to the leftmost input value.
// - `1` refers to the rightmost input value.
//
// * unaligned grid space
// - `-1` refers to the midpoint between the leftmost input value and
// a padding value to the left of that.
// - `1` refers to the midpoint between the rightmost input value and
// a padding value to the right of that.
//
// * input index space
// - `n` refers to the n-th value of the input.
// - `0` refers to the leftmost input value.
// - `N-1` refers to the rightmost input value.
//
// If `align_corners == False`, then the coordinates are in unaligned grid
// space, and we will map them onto aligned grid space. If `align_corners ==
// True`, then coordinates are already in aligned grid space.
//
// Then we will map unaligned grid space onto input index space, making it
// relatively simple to find the two input indices that surround the
// coordinate.
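// Worked example: input_size = 4, align_corners = true. coord = -1 maps to
// input index 0, coord = 1 maps to index 3, and coord = 0 maps to 1.5, which
// interpolates between indices 1 and 2 with scale 0.5.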
for (auto coord_dim = 0; coord_dim < dims; coord_dim++) {
auto input_dim = dims - coord_dim - 1;
auto input_size = input_sizes[input_dim];
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
// Interpret nan as -1
coord = isnan(coord) ? -1 : coord;
if (!align_corners) {
// Map unaligned grid space to aligned grid space
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /
static_cast<opmath_t<T>>(input_size - 1);
coord = coord * corner_alignment_factor;
}
// Map aligned grid space to input index space
coord = (coord + 1) * (static_cast<opmath_t<T>>(input_size - 1) / 2);
// Get the input indices surrounding the coordinate, apply padding to them,
// and obtain the scaling factor between the two for interpolation.
auto left_idx = static_cast<int32_t>(floor(coord));
auto right_idx = static_cast<int32_t>(ceil(coord));
left_indices[input_dim] =
pad_input_index(left_idx, input_size, padding_mode, align_corners);
right_indices[input_dim] =
pad_input_index(right_idx, input_size, padding_mode, align_corners);
auto scale = coord - left_idx;
if (interpolation_mode == GridSamplerInterpolation::Nearest) {
// TODO: For some reason, rounding the scale to 0 or 1 and then using
// linear interpolation seems to work perfectly with zero padding mode,
// but we get flaky failures with border and reflection padding modes.
// Need to investigate and fix it.
scale = (scale <= 0.5) ? 0 : 1;
}
scales[input_dim] = scale;
}
// Now that we have the bounding indices and scale factor for each dimension
// of the input, we can interpolate.
if (dims == 3) {
*output = interpolate_linear_3d(
input, input_strides, left_indices, right_indices, scales);
}
}
template <typename T>
kernel void grid_sampler(
device T* output [[buffer(0)]],
constant T* input [[buffer(1)]],
constant T* grid [[buffer(2)]],
constant GridSamplerParams<5>& params [[buffer(3)]],
uint tid [[thread_position_in_grid]]) {
auto output_sizes = params.output_sizes.data();
auto output_strides = params.output_strides.data();
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto grid_strides = params.grid_strides.data();
auto sampler_dims = params.sampler_dims;
auto offsets = find_grid_sampler_offsets(
output_sizes,
output_strides,
input_strides,
grid_strides,
sampler_dims,
tid);
output += offsets.output;
input += offsets.input;
auto coords = grid + offsets.grid;
input_sizes += 2;
input_strides += 2;
auto interpolation_mode = params.interpolation_mode;
auto padding_mode = params.padding_mode;
auto align_corners = params.align_corners;
grid_sampler_single_element(
output,
input,
coords,
sampler_dims,
input_sizes,
input_strides,
interpolation_mode,
padding_mode,
align_corners);
}
#define REGISTER_GRID_SAMPLER_OP(DTYPE) \
template [[host_name("grid_sampler_" #DTYPE)]] \
kernel void grid_sampler<DTYPE>( \
device DTYPE * output [[buffer(0)]], \
constant DTYPE * input [[buffer(1)]], \
constant DTYPE * grid [[buffer(2)]], \
constant GridSamplerParams<5> & params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_GRID_SAMPLER_OP(float);
REGISTER_GRID_SAMPLER_OP(half);
REGISTER_GRID_SAMPLER_OP(bfloat);

View File

@ -1,10 +1,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -12,17 +9,9 @@
#else
#include <ATen/ops/grid_sampler_2d.h>
#include <ATen/ops/grid_sampler_2d_native.h>
#include <ATen/ops/grid_sampler_3d_native.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/GridSampler_metallib.h>
#endif
namespace mps {
static void grid_sampler_2d_mps_impl(Tensor& output,
const Tensor& input,
@ -131,96 +120,6 @@ static void grid_sampler_2d_mps_impl(Tensor& output,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
}
static void grid_sampler_template(Tensor& output,
const Tensor& input,
const Tensor& grid,
int64_t _interpolation_mode,
int64_t _padding_mode,
bool align_corners,
int32_t sampler_dims,
const std::string& op_name) {
check_grid_sampler_common(input, grid);
switch (sampler_dims) {
case 2:
check_grid_sampler_2d(input, grid);
break;
case 3:
check_grid_sampler_3d(input, grid, _interpolation_mode);
break;
default:
TORCH_INTERNAL_ASSERT(false, "Only 2D and 3D sampling are supported, but got: ", sampler_dims);
}
TORCH_CHECK(input.scalar_type() == grid.scalar_type(),
"expected input and grid to have the same type, but got ",
input.scalar_type(),
" and ",
grid.scalar_type());
auto interpolation_mode = static_cast<GridSamplerInterpolation>(_interpolation_mode);
auto padding_mode = static_cast<GridSamplerPadding>(_padding_mode);
switch (interpolation_mode) {
case GridSamplerInterpolation::Bilinear:
break;
case GridSamplerInterpolation::Nearest:
TORCH_CHECK(false, op_name, ": Unsupported Nearest interpolation");
break;
case GridSamplerInterpolation::Bicubic:
TORCH_CHECK(false, op_name, ": Unsupported Bicubic interpolation");
break;
default:
TORCH_CHECK(false, op_name, ": Unrecognised interpolation mode: ", _interpolation_mode);
}
switch (padding_mode) {
case GridSamplerPadding::Zeros:
case GridSamplerPadding::Border:
case GridSamplerPadding::Reflection:
break;
default:
TORCH_CHECK(false, op_name, ": Unrecognised Padding Mode: ", _padding_mode);
}
auto input_size = input.sizes();
auto grid_size = grid.sizes();
output.resize_({input_size[0], input_size[1], grid_size[1], grid_size[2], grid_size[3]}, MemoryFormat::Contiguous);
auto dims = input.dim();
GridSamplerParams<5> params;
params.sampler_dims = sampler_dims;
params.padding_mode = padding_mode;
params.interpolation_mode = interpolation_mode;
params.align_corners = align_corners;
for (const auto dim : c10::irange(dims)) {
params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim));
params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim));
params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim));
params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim));
params.grid_sizes[dim] = safe_downcast<int32_t, int64_t>(grid.size(dim));
params.grid_strides[dim] = safe_downcast<int32_t, int64_t>(grid.stride(dim));
}
auto num_threads = output.numel();
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
auto pso = lib.getPipelineStateForFunc("grid_sampler_" + scalarToMetalTypeString(input));
getMPSProfiler().beginProfileKernel(pso, op_name, {input, grid});
[computeEncoder setComputePipelineState:pso];
mtl_setArgs(computeEncoder, output, input, grid, params);
mtl_dispatch1DJob(computeEncoder, pso, num_threads);
getMPSProfiler().endProfileKernel(pso);
}
});
}
} // namespace mps
Tensor grid_sampler_2d_mps(const Tensor& input,
@ -236,21 +135,4 @@ Tensor grid_sampler_2d_mps(const Tensor& input,
return output;
}
Tensor grid_sampler_3d_mps(const Tensor& input,
const Tensor& grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners) {
auto output = at::empty({0}, input.options(), MemoryFormat::Contiguous);
mps::grid_sampler_template(output,
input,
grid,
interpolation_mode,
padding_mode,
align_corners,
/*sampler_dims=*/3,
/*op_name=*/"grid_sampler_3d");
return output;
}
} // namespace at::native

View File

@ -456,7 +456,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t,
errMessage += ": reduction dim must be in the range of input shape";
for (const auto dim : dim_value) {
auto wrap_dim = maybe_wrap_dim(dim, num_input_dims);
TORCH_CHECK(wrap_dim < (num_input_dims ? num_input_dims : 1), errMessage.c_str())
TORCH_CHECK(wrap_dim < static_cast<decltype(wrap_dim)>(input_shape.size()), errMessage.c_str())
}
}

View File

@ -2931,7 +2931,6 @@
dispatch:
CPU: grid_sampler_3d_cpu
CUDA: grid_sampler_3d_cuda
MPS: grid_sampler_3d_mps
autogen: grid_sampler_3d.out
# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for
@ -3448,12 +3447,8 @@
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor
- func: fbgemm_linear_fp16_weight_fp32_activation.out(Tensor input, Tensor packed_weight, Tensor? bias, Tensor(a!) output) -> Tensor
- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
- func: fbgemm_linear_fp16_weight.out(Tensor input, Tensor packed_weight, Tensor bias, Tensor(a!) output) -> Tensor
- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor

View File

@ -260,7 +260,7 @@ std::tuple<Tensor, Tensor, Tensor> _cudnn_attention_backward(
attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
out /*const Tensor& o*/,
grad_out/*const Tensor& dO*/,
logsumexp/*const Tensor& softmaxstats*/,
logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
dq/*Tensor& dQ*/,
dk/*Tensor& dK*/,
dv/*Tensor& dV*/,

View File

@ -243,6 +243,12 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x
} else {
softmax_fa_t = at::empty({ 0, 0, 0, 0 }, opts);
}
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
}
auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left,
window_size_right,
seqlen_q,
@ -256,14 +262,6 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x
constexpr bool uses_swa = false;
#endif
// SWA in AOTriton Kernels is treated as "Generalized Causal masks"
is_causal = is_causal || uses_swa;
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
}
hipError_t err; // TODO: Error handling
using aotriton::v2::flash::attn_fwd;
using sdp::aotriton_adapter::mk_aotensor;
@ -457,9 +455,6 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
constexpr bool uses_swa = false;
#endif
// SWA in AOTriton Kernels is treated as "Generalized Causal masks"
is_causal = is_causal || needs_swa;
auto [seed_t, offset_t, philox_state, use_philox_state] =
prepare_philox_arguments(p_dropout, batch_size * num_heads * 32);

View File

@ -4190,7 +4190,7 @@ def run(runner, args, original_dir=None):
nonlocal marked
for i, s in enumerate(t.size()):
if s == batch_size:
torch._dynamo.maybe_mark_dynamic(t, i)
torch._dynamo.mark_dynamic(t, i)
marked = True
break

View File

@ -370,7 +370,6 @@ class HuggingfaceRunner(BenchmarkRunner):
return name in [
"ElectraForQuestionAnswering",
"MegatronBertForQuestionAnswering",
"GPT2ForSequenceClassification",
]
def _get_model_cls_and_config(self, model_name):

View File

@ -631,9 +631,6 @@ libtorch_nativert_sources = [
"torch/nativert/kernels/NativeKernels.cpp",
"torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp",
"torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp",
"torch/nativert/graph/passes/SubgraphRewriter.cpp",
"torch/nativert/graph/passes/pass_manager/GraphPasses.cpp",
"torch/nativert/graph/passes/pass_manager/PassManager.cpp",
]
torch_mobile_tracer_sources = [

View File

@ -38,13 +38,6 @@ DriverAPI create_driver_api() {
C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY)
#undef LOOKUP_NVML_ENTRY
}
if (handle_1) {
#define LOOKUP_NVML_ENTRY_OPTIONAL(name) \
r.name##_ = ((decltype(&name))dlsym(handle_1, #name));
C10_NVML_DRIVER_API_OPTIONAL(LOOKUP_NVML_ENTRY_OPTIONAL)
#undef LOOKUP_NVML_ENTRY_OPTIONAL
}
return r;
}

View File

@ -67,8 +67,6 @@
_(nvmlDeviceGetComputeRunningProcesses) \
_(nvmlSystemGetCudaDriverVersion_v2)
#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
namespace c10::cuda {
struct DriverAPI {
@ -77,7 +75,6 @@ struct DriverAPI {
C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED)
C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED)
C10_NVML_DRIVER_API(CREATE_MEMBER)
C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER

View File

@ -1122,11 +1122,6 @@ elseif(USE_CUDA)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
# Set driver api defined for PeerToPeerAccess
if(NOT WIN32)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/PeerToPeerAccess.cpp PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1")
endif()
endif()
if(USE_XPU)

Binary file not shown.


View File

@ -202,7 +202,6 @@ Below are some useful tools for debugging AOT Inductor.
logging
torch.compiler_aot_inductor_minifier
torch.compiler_aot_inductor_debugging_guide
```
To enable runtime checks on inputs, set the environment variable `AOTI_RUNTIME_CHECK_INPUTS` to 1. This will raise a `RuntimeError` if the inputs to the compiled model differ in size, data type, or strides from those used during export.

View File

@ -1,73 +0,0 @@
# AOTInductor Debugging Guide
If you encounter CUDA illegal memory access (IMA) errors while using [AOT Inductor](./torch.compiler_aot_inductor.md), this guide provides a systematic approach to debugging them. AOT Inductor is part of the PT2 stack, similar to torch.compile, but it produces a compilation artifact that can work in a C++ environment. CUDA illegal memory access errors can happen non-deterministically and may even appear transient.
On a high-level, there are three main steps in debugging CUDA IMA errors:
- **Sanity checks**: Use basic debugging flags to catch common issues before diving deeper.
- **Pinpoint the CUDA IMA**: Make the error deterministic and identify the problematic kernel.
- **Identify problematic kernels**: Use intermediate value debugging to inspect kernel inputs and outputs.
## Step 1: Sanity Checks
Before diving deep into reliably reproducing the error, try out some existing debugging flags:
```bash
AOTI_RUNTIME_CHECK_INPUTS=1
TORCHINDUCTOR_NAN_ASSERTS=1
```
These flags take effect at compilation time (more precisely, at codegen time):
- `AOTI_RUNTIME_CHECK_INPUTS=1` checks if the inputs satisfy the same set of guards used during compilation. See {ref}`torch.compiler_troubleshooting` for more details.
- `TORCHINDUCTOR_NAN_ASSERTS=1` adds codegen before and after each Inductor's kernel to check for NaN.
## Step 2: Pinpoint the CUDA IMA
One hard part is that CUDA IMA errors can be non-deterministic. They can happen at different locations, and sometimes not happen at all (though that just means the numerics are silently incorrect). With the following two flags, we can trigger the error deterministically:
```bash
PYTORCH_NO_CUDA_MEMORY_CACHING=1
CUDA_LAUNCH_BLOCKING=1
```
These flags take effect at runtime:
- `PYTORCH_NO_CUDA_MEMORY_CACHING=1` disables PyTorch's caching allocator, which normally allocates a bigger buffer than is immediately needed to reduce the number of buffer allocations. This over-allocation is usually the reason why CUDA illegal memory access errors are non-deterministic.
![How PyTorch's caching allocator can mask CUDA illegal memory access errors](./_static/img/aoti_debugging_guide/cuda_ima_cca.png)
*Figure: How PyTorch's caching allocator can mask CUDA illegal memory access errors*
- `CUDA_LAUNCH_BLOCKING=1` forces the kernels to launch one at a time. Without this, we would get the famous "CUDA kernel errors might be asynchronously reported at some other API call" warning since kernels are launched asynchronously.
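As a rough illustration of this masking effect, consider a hypothetical host-side allocator (a minimal sketch, not PyTorch's actual caching allocator) that rounds requests up and hands out oversized blocks; a small out-of-bounds access then lands in slack space instead of faulting:

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical caching allocator: every request is rounded up to a
// 512-byte multiple, so a small overrun past the requested size still
// lands inside the allocated block.
static void* cached_alloc(std::size_t n) {
  std::size_t rounded = (n + 511) / 512 * 512;
  return std::malloc(rounded);
}

int main() {
  float* buf = static_cast<float*>(cached_alloc(100 * sizeof(float)));
  buf[100] = 1.0f;  // past the 100 floats requested, but inside the slack:
                    // no fault, just silent corruption -- analogous to how
                    // the CUDA caching allocator can hide an illegal access
  std::printf("%f\n", buf[100]);
  std::free(buf);
  return 0;
}
```

Disabling the cache removes the slack, so the same bad access faults at its true location.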
## Step 3: Identify Problematic Kernels with Intermediate Value Debugger
The AOTI Intermediate Value Debugger can help pinpoint the problematic kernel and get information about the inputs and outputs of said kernel.
First, use:
```bash
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3
```
This flag takes effect at compilation time and prints the kernels one by one at runtime. Together with the previous flags, this would let us know which kernel was launched right before the error happened.
However, the kernel in which the error surfaces is not necessarily the faulty one. For example, an earlier kernel may be problematic and produce wrong outputs that only trigger the failure later. So the natural next step is to inspect the inputs to the suspect kernel:
```bash
AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT="triton_poi_fused_add_ge_logical_and_logical_or_lt_231,_add_position_embeddings_kernel_5" AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2
```
`AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT` takes a comma-separated list of the kernel names you want to inspect. If the inputs to a kernel are not as expected, move on and inspect the kernel that produced the bad input.
## Additional Debugging Tools
### Logging and Tracing
- **tlparse / TORCH_TRACE**: Provides complete output codes for inspection and records the set of guards used. See {ref}`tlparse / TORCH_TRACE <tlparse-torch-trace>` for more details.
- **TORCH_LOGS**: Use `TORCH_LOGS="+inductor,output_code"` to see more PT2 internal logs. See {ref}`TORCH_LOGS <torch-logs>` for more details.
- **TORCH_SHOW_CPP_STACKTRACES**: Set `TORCH_SHOW_CPP_STACKTRACES=1` to potentially see more stack traces.
### Common Sources of Issues
- [**Dynamic shapes**](./torch.compiler_dynamic_shapes.md): Historically a source of many IMAs. Pay special attention when debugging dynamic shape scenarios.
- **Custom ops**: Especially when implemented in C++ and used with dynamic shapes; the op's meta function must be SymInt-ified so shapes can be traced symbolically.

View File

@ -192,8 +192,6 @@ For more information on dynamic shapes, see [The dynamic shapes manual](https://
## Logging Tools
(tlparse-torch-trace)=
### tlparse / TORCH_TRACE
`tlparse` / `TORCH_TRACE` are a pair of tools that produce compilation reports that look like this:
@ -254,8 +252,6 @@ Here are some insights you can gain from a `tlparse`:
For example, you can look at the high-level generated FX graph or the generated Triton code.
- Is there relevant information for a particular frame? You can find these in `compilation_metrics`.
(torch-logs)=
### TORCH_LOGS
You can use the `TORCH_LOGS` environment variable to selectively enable parts of the `torch.compile` stack to log.

View File

@ -1 +0,0 @@
1171719005974771805808300960005001569062

View File

@ -1 +0,0 @@
6

View File

@ -1,33 +0,0 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(aoti_example LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(aoti_example main.cpp)
set_property(TARGET aoti_example PROPERTY CXX_STANDARD 17)
find_package(TorchStandalone REQUIRED)
# Set up include directories to find headers at the correct paths
target_include_directories(aoti_example PRIVATE ${TorchStandalone_INCLUDE_DIRS})
target_include_directories(aoti_example PRIVATE ${TorchStandalone_INCLUDE_DIRS}/standalone)
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)
target_compile_definitions(aoti_example PRIVATE NOMINMAX USE_CUDA)
# Add compile flags
target_compile_options(aoti_example PRIVATE /O2 /DLL /MD /std:c++20 /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc /Zc:__cplusplus /permissive- /openmp /openmp:experimental )
target_link_libraries(aoti_example PRIVATE
${TorchStandalone_LIBRARIES} # if you have this variable from find_package(TorchStandalone)
CUDA::cudart # CUDA runtime library
cuda # CUDA driver library (usually nvcuda.lib on Windows)
)
# cmake -DTorchStandalone_DIR="C:/Users/shangdiy/source/repos/torchnative/standalone/build/torchstandalone_install/lib/cmake/TorchStandalone" ..
# cmake --build . --config Release

View File

@ -1 +0,0 @@
pt2

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
little

View File

@ -1,69 +0,0 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
project(model LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
# Set a library target
add_library(model SHARED)
# TODO: change to TorchStandalone
find_package(TorchStandalone REQUIRED)
# Set up include directories to find headers at the correct paths
target_include_directories(model PRIVATE ${TorchStandalone_INCLUDE_DIRS})
target_include_directories(model PRIVATE ${TorchStandalone_INCLUDE_DIRS}/standalone)
# Add macro definitions
target_compile_definitions(model PRIVATE NOMINMAX TORCH_INDUCTOR_CPP_WRAPPER STANDALONE_TORCH_HEADER C10_USING_CUSTOM_GENERATED_MACROS USE_CUDA) # CPU_CAPABILITY_AVX512
# Add compile flags
target_compile_options(model PRIVATE /O2 /DLL /MD /std:c++20 /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc /Zc:__cplusplus /permissive- /openmp /openmp:experimental )
# Backend-specific flags
# target_compile_options(model PRIVATE -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16 -c) # TODO remove
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD 17)
find_package(CUDAToolkit REQUIRED)
# Make output use .pyd instead of .dll
set_target_properties(model PROPERTIES
SUFFIX ".pyd"
LINK_FLAGS "/DEF:${CMAKE_CURRENT_SOURCE_DIR}/model_exports.def"
)
set(KERNEL_TARGETS "")
set(KERNEL_OBJECT_FILES "")
# Function to compile ptx to cubin
function(embed_gpu_kernel KERNEL_NAME PTX_FILE)
set(CUBIN_BASENAME ${KERNEL_NAME}.cubin)
set(CUBIN_FILE ${CMAKE_CURRENT_BINARY_DIR}/${CUBIN_BASENAME})
# --- PTX to FATBIN Command & Target ---
add_custom_command(
OUTPUT ${CUBIN_FILE}
COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} --cubin ${PTX_FILE}
-o ${CUBIN_FILE} ${NVCC_GENCODE_FLAGS}
-gencode arch=compute_89,code=sm_89
DEPENDS ${PTX_FILE}
)
add_custom_target(build_kernel_object_${KERNEL_NAME} DEPENDS ${CUBIN_FILE})
set(KERNEL_TARGETS ${KERNEL_TARGETS} build_kernel_object_${KERNEL_NAME} PARENT_SCOPE)
endfunction()
target_sources(model PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/model.wrapper.cpp)
target_sources(model PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/model_consts.weights.cpp)
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_t_0 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_t_0.ptx)
embed_gpu_kernel(model_triton_tem_fused_addmm_relu_sigmoid_t_1 ${CMAKE_CURRENT_SOURCE_DIR}/model_triton_tem_fused_addmm_relu_sigmoid_t_1.ptx)
add_dependencies(model ${KERNEL_TARGETS})
target_link_libraries(model PRIVATE ${KERNEL_OBJECT_FILES})
# Add linker flags
target_link_options(model PRIVATE )
# Add libraries
# TODO: change to TorchStandalone
target_link_libraries(model PRIVATE ${TorchStandalone_LIBRARIES} cuda CUDA::cudart)

File diff suppressed because it is too large

View File

@ -1 +0,0 @@
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": [], "libraries": [], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": true}

View File

@ -1 +0,0 @@
{"compiler": "/home/shangdiy/miniconda3/envs/pytorch-3.10/bin/x86_64-conda-linux-gnu-c++", "definitions": ["NOMINMAX", "TORCH_INDUCTOR_CPP_WRAPPER", "STANDALONE_TORCH_HEADER", " C10_USING_CUSTOM_GENERATED_MACROS", "CPU_CAPABILITY_AVX512", " USE_CUDA"], "include_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/include/python3.10", "/home/shangdiy/miniconda3/envs/pytorch-3.10/Include", "/home/shangdiy/pytorch/torch/include", "/home/shangdiy/pytorch/torch/include/torch/csrc/api/include", "/usr/local/cuda-12/include"], "cflags": ["O2", "DLL", "MD", "std:c++20", "wd4819", "wd4251", "wd4244", "wd4267", "wd4275", "wd4018", "wd4190", "wd4624", "wd4067", "wd4068", "EHsc", "Zc:__cplusplus", "permissive-", "openmp", "openmp:experimental"], "ldflags": [], "libraries_dirs": ["/home/shangdiy/miniconda3/envs/pytorch-3.10/libs", "/home/shangdiy/pytorch/torch/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib", "/usr/local/cuda-12.8/targets/x86_64-linux/lib/stubs"], "libraries": ["torch", "torch_cpu", "sleef", "c10", "c10_cuda", "cuda", "torch_cuda"], "passthrough_args": ["", "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512bf16"], "aot_mode": true, "use_relative_path": false, "compile_only": false}

View File

@ -1 +0,0 @@
{"AOTI_DEVICE_KEY": "cuda"}

View File

@ -1,58 +0,0 @@
#if defined(__clang__) || defined (__GNUC__)
#define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#else
#define ATTRIBUTE_NO_SANITIZE_ADDRESS
#endif
ATTRIBUTE_NO_SANITIZE_ADDRESS
alignas(64) extern unsigned char _binary_constants_bin_start[768] = {
69, 165, 123, 190, 252, 181, 16, 62, 242, 69, 193, 59, 44, 80, 100, 62,
237, 163, 142, 188, 138, 139, 109, 190, 61, 248, 20, 190, 77, 84, 143, 60,
111, 90, 163, 60, 96, 140, 230, 189, 101, 69, 38, 190, 132, 190, 12, 188,
28, 113, 159, 190, 252, 128, 154, 62, 247, 234, 217, 61, 206, 79, 58, 61,
224, 209, 135, 61, 211, 238, 147, 62, 231, 229, 27, 190, 169, 208, 57, 62,
100, 20, 153, 190, 161, 160, 85, 190, 207, 10, 156, 62, 234, 107, 155, 190,
188, 85, 116, 62, 27, 211, 114, 60, 94, 21, 158, 189, 147, 210, 34, 62,
203, 109, 80, 62, 28, 242, 141, 189, 205, 27, 152, 190, 38, 104, 6, 189,
211, 16, 249, 189, 72, 103, 143, 190, 163, 44, 140, 189, 178, 223, 127, 189,
5, 112, 160, 189, 177, 55, 132, 190, 218, 22, 159, 62, 115, 30, 35, 190,
26, 247, 9, 62, 251, 219, 27, 62, 165, 86, 135, 62, 99, 168, 66, 190,
238, 64, 93, 62, 65, 147, 86, 62, 167, 108, 97, 62, 183, 219, 50, 190,
138, 83, 106, 62, 90, 122, 208, 189, 149, 140, 161, 188, 44, 145, 194, 189,
5, 142, 186, 61, 202, 230, 153, 190, 133, 72, 136, 62, 251, 1, 3, 62,
225, 146, 54, 190, 91, 176, 219, 189, 118, 244, 10, 189, 232, 107, 142, 62,
185, 6, 151, 62, 241, 137, 223, 61, 124, 100, 114, 190, 15, 240, 168, 189,
149, 252, 58, 190, 238, 93, 243, 188, 144, 218, 115, 61, 159, 91, 6, 62,
57, 139, 74, 190, 84, 200, 49, 61, 193, 78, 32, 190, 84, 121, 26, 190,
219, 39, 115, 190, 171, 127, 94, 62, 248, 253, 177, 61, 63, 18, 127, 62,
146, 18, 137, 189, 203, 90, 161, 190, 139, 194, 239, 58, 126, 54, 40, 190,
47, 247, 30, 190, 106, 93, 191, 61, 22, 48, 120, 61, 56, 123, 7, 62,
150, 229, 210, 189, 118, 231, 158, 188, 7, 98, 215, 60, 72, 251, 89, 190,
190, 160, 137, 190, 173, 194, 158, 62, 225, 26, 118, 190, 174, 199, 4, 189,
205, 148, 16, 62, 20, 225, 155, 61, 90, 124, 133, 190, 88, 196, 34, 61,
26, 104, 51, 190, 149, 106, 40, 62, 25, 136, 177, 60, 169, 111, 138, 190,
214, 181, 226, 189, 109, 17, 77, 62, 224, 166, 55, 62, 250, 128, 160, 61,
104, 223, 250, 61, 34, 182, 210, 187, 60, 87, 149, 190, 189, 55, 98, 188,
58, 86, 85, 190, 170, 43, 132, 190, 81, 220, 87, 190, 47, 226, 138, 62,
189, 162, 36, 190, 30, 232, 34, 60, 138, 147, 167, 61, 151, 129, 157, 61,
206, 33, 152, 62, 109, 227, 113, 190, 147, 255, 11, 190, 175, 56, 46, 189,
46, 238, 1, 189, 123, 159, 85, 188, 14, 126, 148, 189, 226, 226, 169, 189,
255, 106, 134, 61, 38, 140, 187, 60, 119, 73, 49, 62, 32, 236, 43, 62,
78, 33, 232, 189, 72, 188, 139, 62, 94, 206, 20, 62, 25, 230, 75, 189,
171, 239, 26, 190, 136, 218, 121, 62, 96, 115, 85, 62, 126, 92, 55, 190,
112, 108, 134, 61, 64, 212, 69, 190, 253, 118, 214, 188, 210, 116, 66, 62,
204, 131, 123, 190, 13, 151, 38, 190, 56, 17, 252, 189, 153, 151, 138, 62,
21, 30, 216, 61, 146, 103, 32, 62, 140, 60, 78, 62, 183, 149, 174, 61,
95, 153, 164, 61, 144, 167, 187, 189, 112, 53, 153, 190, 127, 195, 105, 61,
169, 167, 251, 189, 42, 204, 123, 62, 116, 193, 86, 62, 98, 147, 30, 61,
176, 138, 137, 62, 245, 244, 17, 62, 201, 90, 140, 62, 177, 110, 77, 62,
188, 31, 129, 190, 66, 203, 85, 62, 182, 209, 112, 188, 216, 91, 222, 59,
18, 208, 131, 189, 151, 142, 150, 190, 36, 252, 31, 62, 241, 2, 180, 61,
83, 240, 159, 62, 37, 152, 115, 190, 13, 52, 107, 62, 169, 178, 148, 62,
171, 54, 38, 62, 33, 4, 199, 189, 201, 247, 216, 189, 225, 89, 146, 190,
192, 118, 79, 189, 92, 171, 12, 62, 136, 235, 3, 62, 180, 202, 87, 62,
8, 129, 122, 61, 160, 75, 170, 188, 20, 84, 6, 62, 60, 194, 56, 190,
182, 99, 44, 190, 88, 96, 228, 189, 50, 106, 5, 190, 34, 133, 12, 190,
26, 50, 0, 190, 176, 25, 127, 61, 48, 69, 219, 61, 192, 237, 252, 187,
};
alignas(64) extern unsigned char * _binary_constants_bin_end;

View File

@ -1,6 +0,0 @@
LIBRARY model
EXPORTS
AOTInductorModelContainerCreate
AOTInductorModelContainerCreateWithDevice
AOTInductorModelContainerRun
AOTInductorModelContainerDelete

View File

@ -1,684 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_89
.address_size 64
// .globl model_triton_tem_fused_addmm_relu_sigmoid_t_1 // -- Begin function model_triton_tem_fused_addmm_relu_sigmoid_t_1
.extern .shared .align 16 .b8 global_smem[];
// @model_triton_tem_fused_addmm_relu_sigmoid_t_1
.visible .entry model_triton_tem_fused_addmm_relu_sigmoid_t_1(
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2,
.param .u32 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_4
)
.reqntid 32
{
.reg .pred %p<12>;
.reg .b32 %r<375>;
.reg .b64 %rd<27>;
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
$L__func_begin0:
.loc 1 18 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:18:0
// %bb.0:
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_3];
$L__tmp0:
.loc 1 34 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:34:16
setp.ne.s32 %p1, %r1, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
.loc 1 0 16 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0:16
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_2];
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_1];
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_sigmoid_t_1_param_0];
.loc 1 43 24 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:43:24
mov.u32 %r26, %ctaid.x;
.loc 1 44 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:28
add.s32 %r27, %r1, 15;
.loc 1 44 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:44:34
shr.s32 %r28, %r27, 31;
shr.u32 %r29, %r28, 28;
add.s32 %r30, %r27, %r29;
shr.s32 %r31, %r30, 4;
.loc 1 50 41 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:41
and.b32 %r32, %r26, 2147483640;
.loc 1 50 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:30
sub.s32 %r33, %r31, %r32;
.loc 1 50 50 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:50:50
min.s32 %r34, %r33, 8;
.loc 1 51 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:40
rem.s32 %r35, %r26, %r34;
.loc 1 51 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:51:34
add.s32 %r36, %r35, %r32;
.loc 1 52 19 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:19
and.b32 %r37, %r26, 7;
.loc 1 52 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:52:30
div.s32 %r38, %r37, %r34;
.loc 1 56 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:17
shl.b32 %r39, %r36, 4;
.loc 1 56 40 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:40
mov.u32 %r40, %tid.x;
shr.u32 %r41, %r40, 2;
and.b32 %r42, %r41, 2;
bfe.u32 %r43, %r40, 2, 2;
and.b32 %r44, %r40, 16;
shr.u32 %r45, %r44, 2;
or.b32 %r46, %r43, %r45;
and.b32 %r47, %r40, 15;
bfe.u32 %r48, %r40, 4, 1;
.loc 1 56 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:56:27
or.b32 %r49, %r39, %r46;
or.b32 %r50, %r49, 8;
or.b32 %r51, %r39, %r47;
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
rem.s32 %r52, %r50, %r1;
rem.s32 %r53, %r49, %r1;
.loc 1 71 30 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:71:30
shl.b32 %r54, %r40, 2;
and.b32 %r55, %r54, 12;
.loc 1 76 28 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:28
shl.b32 %r56, %r53, 4;
shl.b32 %r57, %r52, 4;
.loc 1 76 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:76:25
or.b32 %r58, %r56, %r55;
or.b32 %r59, %r57, %r55;
.loc 1 77 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:25
mul.wide.s32 %rd16, %r58, 4;
add.s64 %rd4, %rd1, %rd16;
mul.wide.s32 %rd17, %r59, 4;
add.s64 %rd5, %rd1, %rd17;
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
// begin inline asm
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd4 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r6, 0x0;
mov.u32 %r7, 0x0;
mov.u32 %r8, 0x0;
mov.u32 %r9, 0x0;
ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd5 + 0 ];
// end inline asm
shl.b32 %r60, %r46, 6;
shl.b32 %r61, %r55, 2;
or.b32 %r62, %r60, %r61;
mov.b32 %r63, global_smem;
add.s32 %r64, %r63, %r62;
st.shared.v4.b32 [%r64], {%r2, %r3, %r4, %r5};
st.shared.v4.b32 [%r64+512], {%r6, %r7, %r8, %r9};
.loc 1 82 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:25
mul.wide.u32 %rd18, %r55, 4;
add.s64 %rd6, %rd2, %rd18;
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
// begin inline asm
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
mov.u32 %r12, 0x0;
mov.u32 %r13, 0x0;
ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd6 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r14, 0x0;
mov.u32 %r15, 0x0;
mov.u32 %r16, 0x0;
mov.u32 %r17, 0x0;
ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ];
// end inline asm
add.s32 %r65, %r63, 1024;
add.s32 %r66, %r65, %r62;
st.shared.v4.b32 [%r66], {%r10, %r11, %r12, %r13};
st.shared.v4.b32 [%r66+512], {%r14, %r15, %r16, %r17};
.loc 1 90 17 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:17
shl.b32 %r67, %r38, 4;
.loc 1 90 27 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:90:27
or.b32 %r68, %r67, %r48;
.loc 1 93 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:20
setp.lt.s32 %p10, %r51, %r1;
.loc 1 93 34 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:34
setp.eq.s32 %p11, %r68, 0;
.loc 1 93 26 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:93:26
and.pred %p2, %p10, %p11;
.loc 1 96 21 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:96:21
add.s32 %r69, %r51, %r68;
add.s32 %r70, %r69, 2;
add.s32 %r71, %r69, 4;
add.s32 %r72, %r69, 6;
add.s32 %r73, %r69, 8;
add.s32 %r74, %r69, 10;
add.s32 %r75, %r69, 12;
add.s32 %r76, %r69, 14;
.loc 1 77 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:77:20
bar.sync 0;
or.b32 %r77, %r42, %r45;
shl.b32 %r78, %r77, 6;
add.s32 %r79, %r63, %r78;
ld.shared.v4.b32 {%r80, %r81, %r82, %r83}, [%r79+112];
ld.shared.v4.b32 {%r84, %r85, %r86, %r87}, [%r79+96];
ld.shared.v4.b32 {%r88, %r89, %r90, %r91}, [%r79+80];
ld.shared.v4.b32 {%r92, %r93, %r94, %r95}, [%r79+64];
ld.shared.v4.b32 {%r96, %r97, %r98, %r99}, [%r79+48];
ld.shared.v4.b32 {%r100, %r101, %r102, %r103}, [%r79+32];
ld.shared.v4.b32 {%r104, %r105, %r106, %r107}, [%r79+16];
ld.shared.v4.b32 {%r108, %r109, %r110, %r111}, [%r79];
ld.shared.v4.b32 {%r112, %r113, %r114, %r115}, [%r79+624];
ld.shared.v4.b32 {%r116, %r117, %r118, %r119}, [%r79+608];
ld.shared.v4.b32 {%r120, %r121, %r122, %r123}, [%r79+592];
ld.shared.v4.b32 {%r124, %r125, %r126, %r127}, [%r79+576];
ld.shared.v4.b32 {%r128, %r129, %r130, %r131}, [%r79+560];
ld.shared.v4.b32 {%r132, %r133, %r134, %r135}, [%r79+544];
ld.shared.v4.b32 {%r136, %r137, %r138, %r139}, [%r79+528];
ld.shared.v4.b32 {%r140, %r141, %r142, %r143}, [%r79+512];
.loc 1 82 20 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:82:20
shl.b32 %r144, %r40, 1;
and.b32 %r145, %r144, 14;
shl.b32 %r146, %r145, 6;
add.s32 %r147, %r65, %r146;
ld.shared.v4.b32 {%r148, %r149, %r150, %r151}, [%r147];
ld.shared.v4.b32 {%r152, %r153, %r154, %r155}, [%r147+64];
ld.shared.v4.b32 {%r156, %r157, %r158, %r159}, [%r147+16];
ld.shared.v4.b32 {%r160, %r161, %r162, %r163}, [%r147+80];
ld.shared.v4.b32 {%r164, %r165, %r166, %r167}, [%r147+32];
ld.shared.v4.b32 {%r168, %r169, %r170, %r171}, [%r147+96];
ld.shared.v4.b32 {%r172, %r173, %r174, %r175}, [%r147+48];
ld.shared.v4.b32 {%r176, %r177, %r178, %r179}, [%r147+112];
.loc 1 85 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:85:25
fma.rn.f32 %r180, %r108, %r148, 0fBE0AE428;
fma.rn.f32 %r181, %r109, %r149, %r180;
fma.rn.f32 %r182, %r110, %r150, %r181;
fma.rn.f32 %r183, %r111, %r151, %r182;
fma.rn.f32 %r184, %r104, %r156, %r183;
fma.rn.f32 %r185, %r105, %r157, %r184;
fma.rn.f32 %r186, %r106, %r158, %r185;
fma.rn.f32 %r187, %r107, %r159, %r186;
fma.rn.f32 %r188, %r100, %r164, %r187;
fma.rn.f32 %r189, %r101, %r165, %r188;
fma.rn.f32 %r190, %r102, %r166, %r189;
fma.rn.f32 %r191, %r103, %r167, %r190;
fma.rn.f32 %r192, %r96, %r172, %r191;
fma.rn.f32 %r193, %r97, %r173, %r192;
fma.rn.f32 %r194, %r98, %r174, %r193;
fma.rn.f32 %r195, %r99, %r175, %r194;
fma.rn.f32 %r196, %r108, %r152, 0fBE0AE428;
fma.rn.f32 %r197, %r109, %r153, %r196;
fma.rn.f32 %r198, %r110, %r154, %r197;
fma.rn.f32 %r199, %r111, %r155, %r198;
fma.rn.f32 %r200, %r104, %r160, %r199;
fma.rn.f32 %r201, %r105, %r161, %r200;
fma.rn.f32 %r202, %r106, %r162, %r201;
fma.rn.f32 %r203, %r107, %r163, %r202;
fma.rn.f32 %r204, %r100, %r168, %r203;
fma.rn.f32 %r205, %r101, %r169, %r204;
fma.rn.f32 %r206, %r102, %r170, %r205;
fma.rn.f32 %r207, %r103, %r171, %r206;
fma.rn.f32 %r208, %r96, %r176, %r207;
fma.rn.f32 %r209, %r97, %r177, %r208;
fma.rn.f32 %r210, %r98, %r178, %r209;
fma.rn.f32 %r211, %r99, %r179, %r210;
fma.rn.f32 %r212, %r92, %r148, 0fBE0AE428;
fma.rn.f32 %r213, %r93, %r149, %r212;
fma.rn.f32 %r214, %r94, %r150, %r213;
fma.rn.f32 %r215, %r95, %r151, %r214;
fma.rn.f32 %r216, %r88, %r156, %r215;
fma.rn.f32 %r217, %r89, %r157, %r216;
fma.rn.f32 %r218, %r90, %r158, %r217;
fma.rn.f32 %r219, %r91, %r159, %r218;
fma.rn.f32 %r220, %r84, %r164, %r219;
fma.rn.f32 %r221, %r85, %r165, %r220;
fma.rn.f32 %r222, %r86, %r166, %r221;
fma.rn.f32 %r223, %r87, %r167, %r222;
fma.rn.f32 %r224, %r80, %r172, %r223;
fma.rn.f32 %r225, %r81, %r173, %r224;
fma.rn.f32 %r226, %r82, %r174, %r225;
fma.rn.f32 %r227, %r83, %r175, %r226;
fma.rn.f32 %r228, %r92, %r152, 0fBE0AE428;
fma.rn.f32 %r229, %r93, %r153, %r228;
fma.rn.f32 %r230, %r94, %r154, %r229;
fma.rn.f32 %r231, %r95, %r155, %r230;
fma.rn.f32 %r232, %r88, %r160, %r231;
fma.rn.f32 %r233, %r89, %r161, %r232;
fma.rn.f32 %r234, %r90, %r162, %r233;
fma.rn.f32 %r235, %r91, %r163, %r234;
fma.rn.f32 %r236, %r84, %r168, %r235;
fma.rn.f32 %r237, %r85, %r169, %r236;
fma.rn.f32 %r238, %r86, %r170, %r237;
fma.rn.f32 %r239, %r87, %r171, %r238;
fma.rn.f32 %r240, %r80, %r176, %r239;
fma.rn.f32 %r241, %r81, %r177, %r240;
fma.rn.f32 %r242, %r82, %r178, %r241;
fma.rn.f32 %r243, %r83, %r179, %r242;
fma.rn.f32 %r244, %r140, %r148, 0fBE0AE428;
fma.rn.f32 %r245, %r141, %r149, %r244;
fma.rn.f32 %r246, %r142, %r150, %r245;
fma.rn.f32 %r247, %r143, %r151, %r246;
fma.rn.f32 %r248, %r136, %r156, %r247;
fma.rn.f32 %r249, %r137, %r157, %r248;
fma.rn.f32 %r250, %r138, %r158, %r249;
fma.rn.f32 %r251, %r139, %r159, %r250;
fma.rn.f32 %r252, %r132, %r164, %r251;
fma.rn.f32 %r253, %r133, %r165, %r252;
fma.rn.f32 %r254, %r134, %r166, %r253;
fma.rn.f32 %r255, %r135, %r167, %r254;
fma.rn.f32 %r256, %r128, %r172, %r255;
fma.rn.f32 %r257, %r129, %r173, %r256;
fma.rn.f32 %r258, %r130, %r174, %r257;
fma.rn.f32 %r259, %r131, %r175, %r258;
fma.rn.f32 %r260, %r140, %r152, 0fBE0AE428;
fma.rn.f32 %r261, %r141, %r153, %r260;
fma.rn.f32 %r262, %r142, %r154, %r261;
fma.rn.f32 %r263, %r143, %r155, %r262;
fma.rn.f32 %r264, %r136, %r160, %r263;
fma.rn.f32 %r265, %r137, %r161, %r264;
fma.rn.f32 %r266, %r138, %r162, %r265;
fma.rn.f32 %r267, %r139, %r163, %r266;
fma.rn.f32 %r268, %r132, %r168, %r267;
fma.rn.f32 %r269, %r133, %r169, %r268;
fma.rn.f32 %r270, %r134, %r170, %r269;
fma.rn.f32 %r271, %r135, %r171, %r270;
fma.rn.f32 %r272, %r128, %r176, %r271;
fma.rn.f32 %r273, %r129, %r177, %r272;
fma.rn.f32 %r274, %r130, %r178, %r273;
fma.rn.f32 %r275, %r131, %r179, %r274;
fma.rn.f32 %r276, %r124, %r148, 0fBE0AE428;
fma.rn.f32 %r277, %r125, %r149, %r276;
fma.rn.f32 %r278, %r126, %r150, %r277;
fma.rn.f32 %r279, %r127, %r151, %r278;
fma.rn.f32 %r280, %r120, %r156, %r279;
fma.rn.f32 %r281, %r121, %r157, %r280;
fma.rn.f32 %r282, %r122, %r158, %r281;
fma.rn.f32 %r283, %r123, %r159, %r282;
fma.rn.f32 %r284, %r116, %r164, %r283;
fma.rn.f32 %r285, %r117, %r165, %r284;
fma.rn.f32 %r286, %r118, %r166, %r285;
fma.rn.f32 %r287, %r119, %r167, %r286;
fma.rn.f32 %r288, %r112, %r172, %r287;
fma.rn.f32 %r289, %r113, %r173, %r288;
fma.rn.f32 %r290, %r114, %r174, %r289;
fma.rn.f32 %r291, %r115, %r175, %r290;
fma.rn.f32 %r292, %r124, %r152, 0fBE0AE428;
fma.rn.f32 %r293, %r125, %r153, %r292;
fma.rn.f32 %r294, %r126, %r154, %r293;
fma.rn.f32 %r295, %r127, %r155, %r294;
fma.rn.f32 %r296, %r120, %r160, %r295;
fma.rn.f32 %r297, %r121, %r161, %r296;
fma.rn.f32 %r298, %r122, %r162, %r297;
fma.rn.f32 %r299, %r123, %r163, %r298;
fma.rn.f32 %r300, %r116, %r168, %r299;
fma.rn.f32 %r301, %r117, %r169, %r300;
fma.rn.f32 %r302, %r118, %r170, %r301;
fma.rn.f32 %r303, %r119, %r171, %r302;
fma.rn.f32 %r304, %r112, %r176, %r303;
fma.rn.f32 %r305, %r113, %r177, %r304;
fma.rn.f32 %r306, %r114, %r178, %r305;
fma.rn.f32 %r307, %r115, %r179, %r306;
mov.b32 %r308, 0f00000000;
$L__tmp1:
.loc 2 47 30 // standard.py:47:30 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
sub.f32 %r309, %r308, %r195;
sub.f32 %r310, %r308, %r211;
sub.f32 %r311, %r308, %r227;
sub.f32 %r312, %r308, %r243;
sub.f32 %r313, %r308, %r259;
sub.f32 %r314, %r308, %r275;
sub.f32 %r315, %r308, %r291;
sub.f32 %r316, %r308, %r307;
.loc 2 47 29 // standard.py:47:29 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
mul.f32 %r317, %r309, 0f3FB8AA3B;
ex2.approx.f32 %r318, %r317;
mul.f32 %r319, %r310, 0f3FB8AA3B;
ex2.approx.f32 %r320, %r319;
mul.f32 %r321, %r311, 0f3FB8AA3B;
ex2.approx.f32 %r322, %r321;
mul.f32 %r323, %r312, 0f3FB8AA3B;
ex2.approx.f32 %r324, %r323;
mul.f32 %r325, %r313, 0f3FB8AA3B;
ex2.approx.f32 %r326, %r325;
mul.f32 %r327, %r314, 0f3FB8AA3B;
ex2.approx.f32 %r328, %r327;
mul.f32 %r329, %r315, 0f3FB8AA3B;
ex2.approx.f32 %r330, %r329;
mul.f32 %r331, %r316, 0f3FB8AA3B;
ex2.approx.f32 %r332, %r331;
.loc 2 47 20 // standard.py:47:20 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
add.f32 %r333, %r318, 0f3F800000;
add.f32 %r334, %r320, 0f3F800000;
add.f32 %r335, %r322, 0f3F800000;
add.f32 %r336, %r324, 0f3F800000;
add.f32 %r337, %r326, 0f3F800000;
add.f32 %r338, %r328, 0f3F800000;
add.f32 %r339, %r330, 0f3F800000;
add.f32 %r340, %r332, 0f3F800000;
mov.b32 %r341, 0f3F800000;
.loc 2 47 16 // standard.py:47:16 @[ cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:99:22 ]
div.full.f32 %r342, %r341, %r333;
div.full.f32 %r343, %r341, %r334;
div.full.f32 %r344, %r341, %r335;
div.full.f32 %r345, %r341, %r336;
div.full.f32 %r346, %r341, %r337;
div.full.f32 %r347, %r341, %r338;
div.full.f32 %r348, %r341, %r339;
div.full.f32 %r349, %r341, %r340;
$L__tmp2:
.loc 1 100 25 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:25
mul.wide.s32 %rd19, %r69, 4;
add.s64 %rd8, %rd3, %rd19;
mul.wide.s32 %rd20, %r70, 4;
add.s64 %rd9, %rd3, %rd20;
mul.wide.s32 %rd21, %r71, 4;
add.s64 %rd10, %rd3, %rd21;
mul.wide.s32 %rd22, %r72, 4;
add.s64 %rd11, %rd3, %rd22;
mul.wide.s32 %rd23, %r73, 4;
add.s64 %rd12, %rd3, %rd23;
mul.wide.s32 %rd24, %r74, 4;
add.s64 %rd13, %rd3, %rd24;
mul.wide.s32 %rd25, %r75, 4;
add.s64 %rd14, %rd3, %rd25;
mul.wide.s32 %rd26, %r76, 4;
add.s64 %rd15, %rd3, %rd26;
.loc 1 100 68 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:100:68
bar.sync 0;
and.b32 %r350, %r54, 96;
or.b32 %r351, %r145, %r350;
shl.b32 %r352, %r40, 4;
and.b32 %r353, %r352, 240;
shr.u32 %r354, %r350, 1;
add.s32 %r355, %r63, %r354;
shl.b32 %r356, %r351, 2;
add.s32 %r357, %r355, %r356;
st.shared.v2.b32 [%r357], {%r342, %r343};
or.b32 %r358, %r350, 16;
shr.u32 %r359, %r358, 1;
add.s32 %r360, %r63, %r359;
add.s32 %r361, %r360, %r356;
st.shared.v2.b32 [%r361+64], {%r344, %r345};
or.b32 %r362, %r350, 128;
shr.u32 %r363, %r362, 1;
add.s32 %r364, %r63, %r363;
add.s32 %r365, %r364, %r356;
st.shared.v2.b32 [%r365+512], {%r346, %r347};
or.b32 %r366, %r350, 144;
shr.u32 %r367, %r366, 1;
add.s32 %r368, %r63, %r367;
add.s32 %r369, %r368, %r356;
st.shared.v2.b32 [%r369+576], {%r348, %r349};
bar.sync 0;
shr.u32 %r370, %r353, 1;
add.s32 %r371, %r63, %r370;
shl.b32 %r372, %r353, 2;
add.s32 %r373, %r371, %r372;
add.s32 %r374, %r373, %r45;
ld.shared.b32 %r18, [%r374];
ld.shared.b32 %r19, [%r374+8];
ld.shared.b32 %r20, [%r374+16];
ld.shared.b32 %r21, [%r374+24];
ld.shared.b32 %r22, [%r374+32];
ld.shared.b32 %r23, [%r374+40];
ld.shared.b32 %r24, [%r374+48];
ld.shared.b32 %r25, [%r374+56];
// begin inline asm
@%p2 st.global.b32 [ %rd8 + 0 ], { %r18 };
// end inline asm
mov.pred %p3, 0;
// begin inline asm
@%p3 st.global.b32 [ %rd9 + 0 ], { %r19 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd10 + 0 ], { %r20 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd11 + 0 ], { %r21 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd12 + 0 ], { %r22 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd13 + 0 ], { %r23 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd14 + 0 ], { %r24 };
// end inline asm
// begin inline asm
@%p3 st.global.b32 [ %rd15 + 0 ], { %r25 };
// end inline asm
$L__BB0_1: // %common.ret
.loc 1 0 0 // cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py:0
ret;
$L__tmp3:
$L__func_end0:
// -- End function
}
.file 1 "/tmp/torchinductor_shangdiy/q4/cq46volrzma67indtwhj5a4n6zr367gqpkcwo2hlphynhjthn3uk.py"
.file 2 "/home/shangdiy/miniconda3/envs/pytorch-3.10/lib/python3.10/site-packages/triton/language/standard.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 203 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 99 // DW_AT_name
.b8 113
.b8 52
.b8 54
.b8 118
.b8 111
.b8 108
.b8 114
.b8 122
.b8 109
.b8 97
.b8 54
.b8 55
.b8 105
.b8 110
.b8 100
.b8 116
.b8 119
.b8 104
.b8 106
.b8 53
.b8 97
.b8 52
.b8 110
.b8 54
.b8 122
.b8 114
.b8 51
.b8 54
.b8 55
.b8 103
.b8 113
.b8 112
.b8 107
.b8 99
.b8 119
.b8 111
.b8 50
.b8 104
.b8 108
.b8 112
.b8 104
.b8 121
.b8 110
.b8 104
.b8 106
.b8 116
.b8 104
.b8 110
.b8 51
.b8 117
.b8 107
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 115
.b8 104
.b8 97
.b8 110
.b8 103
.b8 100
.b8 105
.b8 121
.b8 47
.b8 113
.b8 52
.b8 0
.b8 2 // Abbrev [2] 0x70:0x30 DW_TAG_subprogram
.b8 109 // DW_AT_name
.b8 111
.b8 100
.b8 101
.b8 108
.b8 95
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 109
.b8 95
.b8 102
.b8 117
.b8 115
.b8 101
.b8 100
.b8 95
.b8 97
.b8 100
.b8 100
.b8 109
.b8 109
.b8 95
.b8 114
.b8 101
.b8 108
.b8 117
.b8 95
.b8 115
.b8 105
.b8 103
.b8 109
.b8 111
.b8 105
.b8 100
.b8 95
.b8 116
.b8 95
.b8 49
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0xa0:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 112 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
.b32 112 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 99 // DW_AT_call_line
.b8 22 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }

View File

@ -1,727 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_89
.address_size 64
// .globl model_triton_tem_fused_addmm_relu_t_0 // -- Begin function model_triton_tem_fused_addmm_relu_t_0
.extern .shared .align 16 .b8 global_smem[];
// @model_triton_tem_fused_addmm_relu_t_0
.visible .entry model_triton_tem_fused_addmm_relu_t_0(
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_0,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_1,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_2,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_3,
.param .u32 model_triton_tem_fused_addmm_relu_t_0_param_4,
.param .u64 .ptr .global .align 1 model_triton_tem_fused_addmm_relu_t_0_param_5
)
.reqntid 32
{
.reg .pred %p<27>;
.reg .b32 %r<398>;
.reg .b64 %rd<29>;
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
$L__func_begin0:
.loc 1 18 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:18:0
// %bb.0:
ld.param.b32 %r1, [model_triton_tem_fused_addmm_relu_t_0_param_4];
$L__tmp0:
.loc 1 34 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:34:16
and.b32 %r2, %r1, 268435455;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
.loc 1 0 16 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0:16
ld.param.b64 %rd4, [model_triton_tem_fused_addmm_relu_t_0_param_3];
ld.param.b64 %rd3, [model_triton_tem_fused_addmm_relu_t_0_param_2];
ld.param.b64 %rd2, [model_triton_tem_fused_addmm_relu_t_0_param_1];
ld.param.b64 %rd1, [model_triton_tem_fused_addmm_relu_t_0_param_0];
.loc 1 43 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:43:24
mov.u32 %r51, %ctaid.x;
.loc 1 44 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:28
add.s32 %r52, %r1, 15;
.loc 1 44 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:44:34
shr.s32 %r53, %r52, 31;
shr.u32 %r54, %r53, 28;
add.s32 %r55, %r52, %r54;
shr.s32 %r56, %r55, 4;
.loc 1 50 41 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:41
and.b32 %r57, %r51, 2147483640;
.loc 1 50 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:30
sub.s32 %r58, %r56, %r57;
.loc 1 50 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:50:50
min.s32 %r59, %r58, 8;
.loc 1 51 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:40
rem.s32 %r60, %r51, %r59;
.loc 1 51 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:51:34
add.s32 %r61, %r60, %r57;
.loc 1 52 19 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:19
and.b32 %r62, %r51, 7;
.loc 1 52 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:52:30
div.s32 %r63, %r62, %r59;
.loc 1 56 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:17
shl.b32 %r64, %r61, 4;
.loc 1 56 40 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:40
mov.u32 %r65, %tid.x;
and.b32 %r66, %r65, 4;
bfe.u32 %r67, %r65, 2, 1;
shr.u32 %r68, %r65, 2;
and.b32 %r69, %r68, 6;
or.b32 %r70, %r69, %r67;
bfe.u32 %r71, %r65, 3, 2;
and.b32 %r72, %r65, 1;
shl.b32 %r73, %r72, 2;
shl.b32 %r74, %r65, 2;
and.b32 %r75, %r74, 12;
or.b32 %r76, %r75, 2;
.loc 1 56 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:56:27
or.b32 %r77, %r64, %r70;
or.b32 %r78, %r77, 8;
.loc 1 57 17 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:17
shl.b32 %r79, %r63, 4;
.loc 1 57 27 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:57:27
or.b32 %r80, %r79, %r75;
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
rem.s32 %r81, %r78, %r1;
rem.s32 %r82, %r77, %r1;
.loc 1 71 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:71:36
setp.lt.u32 %p2, %r75, 10;
setp.lt.u32 %p3, %r76, 10;
.loc 1 72 24 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:24
shl.b32 %r83, %r65, 1;
and.b32 %r84, %r83, 6;
shl.b32 %r85, %r66, 1;
or.b32 %r86, %r84, %r85;
.loc 1 72 36 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:72:36
setp.lt.u32 %p6, %r86, 10;
.loc 1 79 28 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:28
mul.lo.s32 %r87, %r82, 10;
mul.lo.s32 %r88, %r81, 10;
.loc 1 79 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:79:25
add.s32 %r89, %r87, %r75;
add.s32 %r90, %r87, %r76;
add.s32 %r91, %r88, %r75;
add.s32 %r92, %r88, %r76;
.loc 1 80 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:25
mul.wide.s32 %rd21, %r89, 4;
add.s64 %rd5, %rd1, %rd21;
mul.wide.s32 %rd22, %r90, 4;
add.s64 %rd6, %rd1, %rd22;
mul.wide.s32 %rd23, %r91, 4;
add.s64 %rd7, %rd1, %rd23;
mul.wide.s32 %rd24, %r92, 4;
add.s64 %rd8, %rd1, %rd24;
mov.b32 %r5, 0;
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
// begin inline asm
mov.u32 %r3, %r5;
mov.u32 %r4, %r5;
@%p2 ld.global.v2.b32 { %r3, %r4 }, [ %rd5 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r7, %r5;
mov.u32 %r8, %r5;
@%p3 ld.global.v2.b32 { %r7, %r8 }, [ %rd6 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r11, %r5;
mov.u32 %r12, %r5;
@%p2 ld.global.v2.b32 { %r11, %r12 }, [ %rd7 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r15, %r5;
mov.u32 %r16, %r5;
@%p3 ld.global.v2.b32 { %r15, %r16 }, [ %rd8 + 0 ];
// end inline asm
shl.b32 %r93, %r75, 2;
mov.b32 %r94, global_smem;
add.s32 %r95, %r94, %r93;
shl.b32 %r96, %r70, 6;
add.s32 %r97, %r95, %r96;
st.shared.v4.b32 [%r97], {%r3, %r4, %r7, %r8};
st.shared.v4.b32 [%r97+512], {%r11, %r12, %r15, %r16};
.loc 1 85 50 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:50
mad.lo.s32 %r98, %r71, 10, %r86;
.loc 1 85 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:25
mul.wide.u32 %rd25, %r98, 4;
add.s64 %rd9, %rd2, %rd25;
add.s64 %rd10, %rd9, 160;
add.s64 %rd11, %rd9, 320;
add.s64 %rd12, %rd9, 480;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
// begin inline asm
mov.u32 %r19, %r5;
mov.u32 %r20, %r5;
@%p6 ld.global.v2.b32 { %r19, %r20 }, [ %rd9 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r23, %r5;
mov.u32 %r24, %r5;
@%p6 ld.global.v2.b32 { %r23, %r24 }, [ %rd10 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r27, %r5;
mov.u32 %r28, %r5;
@%p6 ld.global.v2.b32 { %r27, %r28 }, [ %rd11 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r31, %r5;
mov.u32 %r32, %r5;
@%p6 ld.global.v2.b32 { %r31, %r32 }, [ %rd12 + 0 ];
// end inline asm
shl.b32 %r99, %r86, 2;
add.s32 %r100, %r94, 1024;
add.s32 %r101, %r100, %r99;
shl.b32 %r102, %r71, 6;
add.s32 %r103, %r101, %r102;
st.shared.v2.b32 [%r103], {%r19, %r20};
st.shared.v2.b32 [%r103+256], {%r23, %r24};
st.shared.v2.b32 [%r103+512], {%r27, %r28};
st.shared.v2.b32 [%r103+768], {%r31, %r32};
.loc 1 96 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:20
setp.lt.s32 %p14, %r77, %r1;
setp.lt.s32 %p15, %r78, %r1;
.loc 1 96 34 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:34
setp.eq.s32 %p16, %r63, 0;
.loc 1 96 26 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:96:26
and.pred %p10, %p16, %p14;
and.pred %p11, %p15, %p16;
.loc 1 100 30 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:30
mul.wide.u32 %rd26, %r80, 4;
add.s64 %rd14, %rd3, %rd26;
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
// begin inline asm
mov.u64 %rd15, 0x0;
createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
// end inline asm
// begin inline asm
mov.u32 %r35, 0x0;
mov.u32 %r36, 0x0;
mov.u32 %r37, 0x0;
mov.u32 %r38, 0x0;
@%p10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15;
// end inline asm
// begin inline asm
mov.u64 %rd18, 0x0;
createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
// end inline asm
// begin inline asm
mov.u32 %r39, 0x0;
mov.u32 %r40, 0x0;
mov.u32 %r41, 0x0;
mov.u32 %r42, 0x0;
@%p11 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd14 + 0 ], %rd18;
// end inline asm
.loc 1 80 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:80:20
bar.sync 0;
shl.b32 %r104, %r69, 6;
add.s32 %r105, %r94, %r104;
ld.shared.v4.b32 {%r106, %r107, %r108, %r109}, [%r105+48];
ld.shared.v4.b32 {%r110, %r111, %r112, %r113}, [%r105+112];
ld.shared.v4.b32 {%r114, %r115, %r116, %r117}, [%r105+32];
ld.shared.v4.b32 {%r118, %r119, %r120, %r121}, [%r105+96];
ld.shared.v4.b32 {%r122, %r123, %r124, %r125}, [%r105+16];
ld.shared.v4.b32 {%r126, %r127, %r128, %r129}, [%r105+80];
ld.shared.v4.b32 {%r130, %r131, %r132, %r133}, [%r105];
ld.shared.v4.b32 {%r134, %r135, %r136, %r137}, [%r105+64];
ld.shared.v4.b32 {%r138, %r139, %r140, %r141}, [%r105+560];
ld.shared.v4.b32 {%r142, %r143, %r144, %r145}, [%r105+624];
ld.shared.v4.b32 {%r146, %r147, %r148, %r149}, [%r105+544];
ld.shared.v4.b32 {%r150, %r151, %r152, %r153}, [%r105+608];
ld.shared.v4.b32 {%r154, %r155, %r156, %r157}, [%r105+528];
ld.shared.v4.b32 {%r158, %r159, %r160, %r161}, [%r105+592];
ld.shared.v4.b32 {%r162, %r163, %r164, %r165}, [%r105+512];
ld.shared.v4.b32 {%r166, %r167, %r168, %r169}, [%r105+576];
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
mad.lo.s32 %r170, %r86, 60, %r101;
or.b32 %r171, %r84, 1;
or.b32 %r172, %r171, %r85;
shl.b32 %r173, %r172, 6;
add.s32 %r174, %r100, %r173;
ld.shared.b32 %r175, [%r170+16];
ld.shared.b32 %r176, [%r174+16];
ld.shared.b32 %r177, [%r174+52];
ld.shared.b32 %r178, [%r170+60];
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
shr.u32 %r179, %r66, 1;
setp.eq.s32 %p17, %r66, 0;
bfe.u32 %r180, %r65, 1, 1;
or.b32 %r181, %r73, %r180;
or.b32 %r182, %r181, %r179;
and.b32 %r183, %r65, 24;
or.b32 %r184, %r182, %r183;
selp.b32 %r185, %r35, %r37, %p17;
shfl.sync.idx.b32 %r186, %r185, %r184, 31, -1;
selp.b32 %r187, %r36, %r38, %p17;
shfl.sync.idx.b32 %r188, %r187, %r184, 31, -1;
selp.b32 %r189, %r37, %r35, %p17;
xor.b32 %r190, %r184, 4;
shfl.sync.idx.b32 %r191, %r189, %r190, 31, -1;
selp.b32 %r192, %r38, %r36, %p17;
shfl.sync.idx.b32 %r193, %r192, %r190, 31, -1;
selp.b32 %r194, %r39, %r41, %p17;
shfl.sync.idx.b32 %r195, %r194, %r184, 31, -1;
selp.b32 %r196, %r40, %r42, %p17;
shfl.sync.idx.b32 %r197, %r196, %r184, 31, -1;
selp.b32 %r198, %r41, %r39, %p17;
shfl.sync.idx.b32 %r199, %r198, %r190, 31, -1;
selp.b32 %r200, %r42, %r40, %p17;
shfl.sync.idx.b32 %r201, %r200, %r190, 31, -1;
setp.eq.s32 %p18, %r72, 0;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
ld.shared.v4.b32 {%r202, %r203, %r204, %r205}, [%r170];
ld.shared.v4.b32 {%r206, %r207, %r208, %r209}, [%r174];
.loc 1 100 66 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:100:66
selp.f32 %r210, %r186, %r191, %p18;
selp.f32 %r211, %r188, %r193, %p18;
selp.f32 %r212, %r191, %r186, %p18;
selp.f32 %r213, %r193, %r188, %p18;
selp.f32 %r214, %r195, %r199, %p18;
selp.f32 %r215, %r197, %r201, %p18;
selp.f32 %r216, %r199, %r195, %p18;
selp.f32 %r217, %r201, %r197, %p18;
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
fma.rn.f32 %r218, %r166, %r206, %r217;
fma.rn.f32 %r219, %r166, %r202, %r216;
fma.rn.f32 %r220, %r162, %r206, %r215;
fma.rn.f32 %r221, %r162, %r202, %r214;
fma.rn.f32 %r222, %r134, %r206, %r213;
fma.rn.f32 %r223, %r134, %r202, %r212;
fma.rn.f32 %r224, %r130, %r206, %r211;
fma.rn.f32 %r225, %r130, %r202, %r210;
fma.rn.f32 %r226, %r131, %r203, %r225;
fma.rn.f32 %r227, %r131, %r207, %r224;
fma.rn.f32 %r228, %r135, %r203, %r223;
fma.rn.f32 %r229, %r135, %r207, %r222;
fma.rn.f32 %r230, %r163, %r203, %r221;
fma.rn.f32 %r231, %r163, %r207, %r220;
fma.rn.f32 %r232, %r167, %r203, %r219;
fma.rn.f32 %r233, %r167, %r207, %r218;
fma.rn.f32 %r234, %r168, %r208, %r233;
fma.rn.f32 %r235, %r168, %r204, %r232;
fma.rn.f32 %r236, %r164, %r208, %r231;
fma.rn.f32 %r237, %r164, %r204, %r230;
fma.rn.f32 %r238, %r136, %r208, %r229;
fma.rn.f32 %r239, %r136, %r204, %r228;
fma.rn.f32 %r240, %r132, %r208, %r227;
fma.rn.f32 %r241, %r132, %r204, %r226;
fma.rn.f32 %r242, %r133, %r205, %r241;
fma.rn.f32 %r243, %r133, %r209, %r240;
fma.rn.f32 %r244, %r137, %r205, %r239;
fma.rn.f32 %r245, %r137, %r209, %r238;
fma.rn.f32 %r246, %r165, %r205, %r237;
fma.rn.f32 %r247, %r165, %r209, %r236;
fma.rn.f32 %r248, %r169, %r205, %r235;
fma.rn.f32 %r249, %r169, %r209, %r234;
fma.rn.f32 %r250, %r158, %r176, %r249;
fma.rn.f32 %r251, %r158, %r175, %r248;
fma.rn.f32 %r252, %r154, %r176, %r247;
fma.rn.f32 %r253, %r154, %r175, %r246;
fma.rn.f32 %r254, %r126, %r176, %r245;
fma.rn.f32 %r255, %r126, %r175, %r244;
fma.rn.f32 %r256, %r122, %r176, %r243;
fma.rn.f32 %r257, %r122, %r175, %r242;
.loc 1 85 20 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:85:20
ld.shared.b32 %r258, [%r170+24];
ld.shared.b32 %r259, [%r170+20];
ld.shared.b32 %r260, [%r174+24];
ld.shared.b32 %r261, [%r174+20];
ld.shared.b32 %r262, [%r170+32];
ld.shared.b32 %r263, [%r170+28];
ld.shared.b32 %r264, [%r174+32];
ld.shared.b32 %r265, [%r174+28];
ld.shared.b32 %r266, [%r170+40];
ld.shared.b32 %r267, [%r170+36];
ld.shared.b32 %r268, [%r174+40];
ld.shared.b32 %r269, [%r174+36];
ld.shared.b32 %r270, [%r170+48];
ld.shared.b32 %r271, [%r170+44];
ld.shared.b32 %r272, [%r174+48];
ld.shared.b32 %r273, [%r174+44];
ld.shared.b32 %r274, [%r170+56];
ld.shared.b32 %r275, [%r170+52];
ld.shared.v2.b32 {%r276, %r277}, [%r174+56];
.loc 1 88 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:88:25
fma.rn.f32 %r278, %r123, %r259, %r257;
fma.rn.f32 %r279, %r123, %r261, %r256;
fma.rn.f32 %r280, %r127, %r259, %r255;
fma.rn.f32 %r281, %r127, %r261, %r254;
fma.rn.f32 %r282, %r155, %r259, %r253;
fma.rn.f32 %r283, %r155, %r261, %r252;
fma.rn.f32 %r284, %r159, %r259, %r251;
fma.rn.f32 %r285, %r159, %r261, %r250;
fma.rn.f32 %r286, %r160, %r260, %r285;
fma.rn.f32 %r287, %r160, %r258, %r284;
fma.rn.f32 %r288, %r156, %r260, %r283;
fma.rn.f32 %r289, %r156, %r258, %r282;
fma.rn.f32 %r290, %r128, %r260, %r281;
fma.rn.f32 %r291, %r128, %r258, %r280;
fma.rn.f32 %r292, %r124, %r260, %r279;
fma.rn.f32 %r293, %r124, %r258, %r278;
fma.rn.f32 %r294, %r125, %r263, %r293;
fma.rn.f32 %r295, %r125, %r265, %r292;
fma.rn.f32 %r296, %r129, %r263, %r291;
fma.rn.f32 %r297, %r129, %r265, %r290;
fma.rn.f32 %r298, %r157, %r263, %r289;
fma.rn.f32 %r299, %r157, %r265, %r288;
fma.rn.f32 %r300, %r161, %r263, %r287;
fma.rn.f32 %r301, %r161, %r265, %r286;
fma.rn.f32 %r302, %r150, %r264, %r301;
fma.rn.f32 %r303, %r150, %r262, %r300;
fma.rn.f32 %r304, %r146, %r264, %r299;
fma.rn.f32 %r305, %r146, %r262, %r298;
fma.rn.f32 %r306, %r118, %r264, %r297;
fma.rn.f32 %r307, %r118, %r262, %r296;
fma.rn.f32 %r308, %r114, %r264, %r295;
fma.rn.f32 %r309, %r114, %r262, %r294;
fma.rn.f32 %r310, %r115, %r267, %r309;
fma.rn.f32 %r311, %r115, %r269, %r308;
fma.rn.f32 %r312, %r119, %r267, %r307;
fma.rn.f32 %r313, %r119, %r269, %r306;
fma.rn.f32 %r314, %r147, %r267, %r305;
fma.rn.f32 %r315, %r147, %r269, %r304;
fma.rn.f32 %r316, %r151, %r267, %r303;
fma.rn.f32 %r317, %r151, %r269, %r302;
fma.rn.f32 %r318, %r152, %r268, %r317;
fma.rn.f32 %r319, %r152, %r266, %r316;
fma.rn.f32 %r320, %r148, %r268, %r315;
fma.rn.f32 %r321, %r148, %r266, %r314;
fma.rn.f32 %r322, %r120, %r268, %r313;
fma.rn.f32 %r323, %r120, %r266, %r312;
fma.rn.f32 %r324, %r116, %r268, %r311;
fma.rn.f32 %r325, %r116, %r266, %r310;
fma.rn.f32 %r326, %r117, %r271, %r325;
fma.rn.f32 %r327, %r117, %r273, %r324;
fma.rn.f32 %r328, %r121, %r271, %r323;
fma.rn.f32 %r329, %r121, %r273, %r322;
fma.rn.f32 %r330, %r149, %r271, %r321;
fma.rn.f32 %r331, %r149, %r273, %r320;
fma.rn.f32 %r332, %r153, %r271, %r319;
fma.rn.f32 %r333, %r153, %r273, %r318;
fma.rn.f32 %r334, %r142, %r272, %r333;
fma.rn.f32 %r335, %r142, %r270, %r332;
fma.rn.f32 %r336, %r138, %r272, %r331;
fma.rn.f32 %r337, %r138, %r270, %r330;
fma.rn.f32 %r338, %r110, %r272, %r329;
fma.rn.f32 %r339, %r110, %r270, %r328;
fma.rn.f32 %r340, %r106, %r272, %r327;
fma.rn.f32 %r341, %r106, %r270, %r326;
fma.rn.f32 %r342, %r107, %r275, %r341;
fma.rn.f32 %r343, %r107, %r177, %r340;
fma.rn.f32 %r344, %r111, %r275, %r339;
fma.rn.f32 %r345, %r111, %r177, %r338;
fma.rn.f32 %r346, %r139, %r275, %r337;
fma.rn.f32 %r347, %r139, %r177, %r336;
fma.rn.f32 %r348, %r143, %r275, %r335;
fma.rn.f32 %r349, %r143, %r177, %r334;
fma.rn.f32 %r350, %r144, %r276, %r349;
fma.rn.f32 %r351, %r144, %r274, %r348;
fma.rn.f32 %r352, %r140, %r276, %r347;
fma.rn.f32 %r353, %r140, %r274, %r346;
fma.rn.f32 %r354, %r112, %r276, %r345;
fma.rn.f32 %r355, %r112, %r274, %r344;
fma.rn.f32 %r356, %r108, %r276, %r343;
fma.rn.f32 %r357, %r108, %r274, %r342;
fma.rn.f32 %r358, %r109, %r178, %r357;
fma.rn.f32 %r359, %r109, %r277, %r356;
fma.rn.f32 %r360, %r113, %r178, %r355;
fma.rn.f32 %r361, %r113, %r277, %r354;
fma.rn.f32 %r362, %r141, %r178, %r353;
fma.rn.f32 %r363, %r141, %r277, %r352;
fma.rn.f32 %r364, %r145, %r178, %r351;
fma.rn.f32 %r365, %r145, %r277, %r350;
$L__tmp1:
.loc 2 110 15 // triton_helpers.py:110:15 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
setp.lt.f32 %p19, %r365, 0f00000000;
setp.lt.f32 %p20, %r364, 0f00000000;
setp.lt.f32 %p21, %r363, 0f00000000;
setp.lt.f32 %p22, %r362, 0f00000000;
setp.lt.f32 %p23, %r361, 0f00000000;
setp.lt.f32 %p24, %r360, 0f00000000;
setp.lt.f32 %p25, %r359, 0f00000000;
setp.lt.f32 %p26, %r358, 0f00000000;
.loc 2 113 29 // triton_helpers.py:113:29 @[ cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:103:40 ]
selp.f32 %r366, 0f00000000, %r358, %p26;
selp.f32 %r367, 0f00000000, %r359, %p25;
selp.f32 %r368, 0f00000000, %r360, %p24;
selp.f32 %r369, 0f00000000, %r361, %p23;
selp.f32 %r370, 0f00000000, %r362, %p22;
selp.f32 %r371, 0f00000000, %r363, %p21;
selp.f32 %r372, 0f00000000, %r364, %p20;
selp.f32 %r373, 0f00000000, %r365, %p19;
$L__tmp2:
.loc 1 104 52 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:52
shl.b32 %r374, %r77, 4;
shl.b32 %r375, %r78, 4;
.loc 1 104 49 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:49
add.s32 %r376, %r374, %r80;
add.s32 %r377, %r80, %r375;
.loc 1 104 25 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:25
mul.wide.s32 %rd27, %r376, 4;
add.s64 %rd19, %rd4, %rd27;
mul.wide.s32 %rd28, %r377, 4;
add.s64 %rd20, %rd4, %rd28;
.loc 1 104 78 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:104:78
selp.f32 %r378, %r366, %r368, %p18;
or.b32 %r379, %r183, %r67;
or.b32 %r380, %r379, %r84;
shfl.sync.idx.b32 %r381, %r378, %r380, 31, -1;
selp.f32 %r382, %r367, %r369, %p18;
shfl.sync.idx.b32 %r383, %r382, %r380, 31, -1;
selp.f32 %r384, %r368, %r366, %p18;
xor.b32 %r385, %r171, %r67;
or.b32 %r386, %r183, %r385;
shfl.sync.idx.b32 %r387, %r384, %r386, 31, -1;
selp.f32 %r388, %r369, %r367, %p18;
shfl.sync.idx.b32 %r389, %r388, %r386, 31, -1;
selp.f32 %r390, %r370, %r372, %p18;
shfl.sync.idx.b32 %r391, %r390, %r380, 31, -1;
selp.f32 %r392, %r371, %r373, %p18;
shfl.sync.idx.b32 %r393, %r392, %r380, 31, -1;
selp.f32 %r394, %r372, %r370, %p18;
shfl.sync.idx.b32 %r395, %r394, %r386, 31, -1;
selp.f32 %r396, %r373, %r371, %p18;
shfl.sync.idx.b32 %r397, %r396, %r386, 31, -1;
selp.b32 %r45, %r387, %r381, %p17;
selp.b32 %r46, %r389, %r383, %p17;
selp.b32 %r49, %r395, %r391, %p17;
selp.b32 %r50, %r397, %r393, %p17;
selp.b32 %r43, %r381, %r387, %p17;
selp.b32 %r44, %r383, %r389, %p17;
// begin inline asm
@%p10 st.global.v4.b32 [ %rd19 + 0 ], { %r43, %r44, %r45, %r46 };
// end inline asm
selp.b32 %r47, %r391, %r395, %p17;
selp.b32 %r48, %r393, %r397, %p17;
// begin inline asm
@%p11 st.global.v4.b32 [ %rd20 + 0 ], { %r47, %r48, %r49, %r50 };
// end inline asm
$L__BB0_1: // %common.ret
.loc 1 0 0 // cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py:0
ret;
$L__tmp3:
$L__func_end0:
// -- End function
}
.file 1 "/tmp/torchinductor_shangdiy/uw/cuwhbpecd2ukeso3jxekczvghb7ah2hk2zob67opc43aghe5wlv3.py"
.file 2 "/home/shangdiy/pytorch/torch/_inductor/runtime/triton_helpers.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 195 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xbc DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 99 // DW_AT_name
.b8 117
.b8 119
.b8 104
.b8 98
.b8 112
.b8 101
.b8 99
.b8 100
.b8 50
.b8 117
.b8 107
.b8 101
.b8 115
.b8 111
.b8 51
.b8 106
.b8 120
.b8 101
.b8 107
.b8 99
.b8 122
.b8 118
.b8 103
.b8 104
.b8 98
.b8 55
.b8 97
.b8 104
.b8 50
.b8 104
.b8 107
.b8 50
.b8 122
.b8 111
.b8 98
.b8 54
.b8 55
.b8 111
.b8 112
.b8 99
.b8 52
.b8 51
.b8 97
.b8 103
.b8 104
.b8 101
.b8 53
.b8 119
.b8 108
.b8 118
.b8 51
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 115
.b8 104
.b8 97
.b8 110
.b8 103
.b8 100
.b8 105
.b8 121
.b8 47
.b8 117
.b8 119
.b8 0
.b8 2 // Abbrev [2] 0x70:0x28 DW_TAG_subprogram
.b8 109 // DW_AT_name
.b8 111
.b8 100
.b8 101
.b8 108
.b8 95
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 116
.b8 101
.b8 109
.b8 95
.b8 102
.b8 117
.b8 115
.b8 101
.b8 100
.b8 95
.b8 97
.b8 100
.b8 100
.b8 109
.b8 109
.b8 95
.b8 114
.b8 101
.b8 108
.b8 117
.b8 95
.b8 116
.b8 95
.b8 48
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x98:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 112 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0xad:0x18 DW_TAG_inlined_subroutine
.b32 112 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 103 // DW_AT_call_line
.b8 40 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }

View File

@ -1,8 +0,0 @@
SECTIONS {
/* By default, in LLD 16, .lrodata is placed immediately after .rodata.
* However, .lrodata can be very large in our compiled models, which leads to
* relocation out-of-range errors for relative relocations. So we place it
* after the other sections that are referenced from .text using relative
* relocations. This is the default behavior in GNU ld. */
.lrodata : { *(.lrodata) }
} INSERT AFTER .bss;
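Because the script contains only a section placement plus an INSERT command, it augments the linker's default layout rather than replacing it (GNU ld documents INSERT as intended for exactly this use with -T; LLD behaves the same for INSERT-only scripts). A minimal sketch of wiring it into a CMake target like the one earlier in this diff, assuming the script is saved as lrodata.ld next to the CMakeLists.txt (path and file name are placeholders):
# Hypothetical: pass the INSERT-only script via -T; it augments the default
# linker script instead of replacing it, and LINK_DEPENDS triggers a relink
# when the script changes.
target_link_options(model PRIVATE
  "LINKER:-T,${CMAKE_CURRENT_SOURCE_DIR}/lrodata.ld")
set_property(TARGET model APPEND PROPERTY
  LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/lrodata.ld")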

View File

@ -1,147 +0,0 @@
// Windows replacement for #include <dlfcn.h>: LoadLibrary/GetProcAddress instead of dlopen/dlsym
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
// Include the AOTInductor headers
// #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#include <torch/csrc/inductor/aoti_runtime/interface.h>
// #include <torch/csrc/inductor/aoti_runtime/model_container.h>
// #include <torch/csrc/inductor/aoti_torch/tensor_converter.h> // @manual
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <standalone/slim/core/Empty.h>
#include <standalone/slim/cuda/Guard.h>
#include <standalone/torch/csrc/inductor/aoti_torch/tensor_converter.h>
static std::wstring u8u16(const char* s) {
int len = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);
std::wstring wbuf(len, L'\0');
MultiByteToWideChar(CP_UTF8, 0, s, -1, &wbuf[0], len);
if (!wbuf.empty() && wbuf.back() == L'\0') {
wbuf.pop_back();
}
return wbuf;
}
int main() {
try {
// Load the DLL (model.pyd is a DLL on Windows)
HMODULE handle = nullptr;
{
auto wname = u8u16(R"(C:\Users\shangdiy\source\repos\pytorch\model2\model.pyd)");
// Try LoadLibraryExW with safe search flags if supported
if (GetProcAddress(GetModuleHandleW(L"KERNEL32.DLL"), "AddDllDirectory") != NULL) {
handle = LoadLibraryExW(
wname.c_str(),
NULL,
LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
}
// Fallback if that failed
if (!handle) {
handle = LoadLibraryW(wname.c_str());
}
if (!handle) {
DWORD dw = GetLastError();
char buf[512];
FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
buf, sizeof(buf), NULL);
std::cerr << "Failed to load model.pyd. WinError " << dw << ": " << buf << std::endl;
return 1;
} else {
std::cout << "Loaded model.pyd" << std::endl;
}
}
decltype(&AOTInductorModelContainerCreateWithDevice) create_model{nullptr};
decltype(&AOTInductorModelContainerDelete) delete_model{nullptr};
decltype(&AOTInductorModelContainerRun) run_model{nullptr};
#define AOTI_LOAD_SYMBOL(handle_, var, name_str) \
var = reinterpret_cast<decltype(var)>(GetProcAddress(handle_, name_str)); \
if (!var) { \
throw std::runtime_error("Could not GetProcAddress " name_str); \
}
AOTI_LOAD_SYMBOL(handle, create_model, "AOTInductorModelContainerCreateWithDevice");
AOTI_LOAD_SYMBOL(handle, run_model, "AOTInductorModelContainerRun");
AOTI_LOAD_SYMBOL(handle, delete_model, "AOTInductorModelContainerDelete");
#undef AOTI_LOAD_SYMBOL
// Create array of input/output handles
slim::SlimTensor x = slim::empty({8, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0));
float fill_value = 1.0;
x.fill_(fill_value);
// AOTInductorModel::run will steal the ownership of the input and output
// tensor pointers
std::vector<slim::SlimTensor> inputs = {x};
std::vector<AtenTensorHandle> input_handles =
unsafe_alloc_new_handles_from_tensors(inputs);
AtenTensorHandle output_handle;
AOTInductorModelContainerHandle container_handle;
cudaStream_t stream = slim::cuda::getCurrentCUDAStream(0);
// aoti_torch_get_current_cuda_stream(0, (void**)&stream);
// Reinterpret as the opaque handle for AOTInductor
AOTInductorStreamHandle stream_handle = reinterpret_cast<AOTInductorStreamHandle>(stream);
// Construct model
const char* cubin_dir = R"(C:\Users\shangdiy\source\repos\pytorch\model2\)";
AOTIRuntimeError err =
create_model(&container_handle, 1, "cuda", cubin_dir);
if (err != AOTI_RUNTIME_SUCCESS) {
throw std::runtime_error("Failed to create model container");
} else {
std::cout << "Created model\n";
}
// Run the model
err = run_model(container_handle, input_handles.data(),
1, // num_inputs
&output_handle,
1, // num_outputs
stream_handle, // stream
nullptr // proxy_executor
);
if (err != AOTI_RUNTIME_SUCCESS) {
throw std::runtime_error("Failed to run model");
} else {
std::cout << "Finish model\n";
}
std::vector<slim::SlimTensor> outputs =
alloc_tensors_by_stealing_from_handles(&output_handle, 1);
// Print the result
slim::SlimTensor slim_tensor = outputs[0];
auto slim_cpu = slim_tensor.cpu();
float *slim_data = static_cast<float *>(slim_cpu.data_ptr());
std::cout << "Output" << std::endl;
std::cout << "slim_data ptr: " << slim_data << "\n";
size_t num_elements = slim_cpu.numel(); // or equivalent method
std::cout << num_elements << std::endl;
for (size_t i = 0; i < num_elements; ++i) {
std::cout << slim_data[i] << "\n";
}
std::cout << "Done" << std::endl;
delete_model(container_handle);
FreeLibrary(handle);
return 0;
} catch (const std::exception &e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
}
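For completeness, a minimal sketch of a CMake build for this loader; TORCH_ROOT and the CUDA include path are placeholders for a local checkout and toolkit, not values taken from this diff:
# Hypothetical CMakeLists.txt for the loader above. model.pyd is loaded at
# runtime via LoadLibraryExW, so nothing from it is linked at build time.
cmake_minimum_required(VERSION 3.18)
project(aoti_loader LANGUAGES CXX)
add_executable(aoti_loader main.cpp)
target_compile_features(aoti_loader PRIVATE cxx_std_17)
target_include_directories(aoti_loader PRIVATE
  "${TORCH_ROOT}/torch/include"
  "${TORCH_ROOT}/torch/include/torch/csrc/api/include"
  "$ENV{CUDA_PATH}/include")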

Binary file not shown.

View File

@ -10,7 +10,7 @@ filelock
fsspec>=0.8.5
hypothesis
jinja2
lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64"
lintrunner ; platform_machine != "s390x"
networkx>=2.5.1
optree>=0.13.0
psutil

View File

@ -2709,7 +2709,6 @@ TEST(ProfilerDisableInCallbackTest, Basic) {
}
TEST(RecordDebugHandles, Basic) {
GTEST_SKIP() << "Test is flaky and sometimes hangs on CI. ";
// Enable the profiler in this thread
const std::set<torch::autograd::profiler::ActivityType> activities(
{torch::autograd::profiler::ActivityType::CPU});

View File

@ -36,9 +36,6 @@ set(NATIVERT_TEST_SRCS
${TORCH_ROOT}/torch/nativert/kernels/AutoFunctionalizeKernel.cpp
${TORCH_ROOT}/torch/nativert/kernels/CallTorchBindKernel.cpp
${TORCH_ROOT}/torch/nativert/kernels/HigherOrderKernel.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/SubgraphRewriter.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
)
add_executable(test_nativert

View File

@ -1,33 +0,0 @@
#include <gtest/gtest.h>
#include <torch/nativert/graph/Graph.h>
#include <torch/nativert/graph/passes/pass_manager/PassManager.h>
#include <torch/csrc/jit/testing/file_check.h>
using namespace ::testing;
using namespace torch::nativert;
TEST(PassManagerTest, TestEmptyPass) {
GraphPassManager manager({"EmptyPass"});
EXPECT_FALSE(manager.run(Graph::createGraph().get()));
}
TEST(PassPipelineTest, TestConcat) {
GraphPassPipeline p1({"test"});
EXPECT_EQ(p1.size(), 1);
EXPECT_EQ(p1.at(0), "test");
p1.concat({"test1", "test2"});
EXPECT_EQ(p1.at(0), "test");
EXPECT_EQ(p1.at(1), "test1");
EXPECT_EQ(p1.at(2), "test2");
}
TEST(PassPipelineTest, TestPushFront) {
GraphPassPipeline p1({"test"});
EXPECT_EQ(p1.size(), 1);
EXPECT_EQ(p1.at(0), "test");
p1.push_front("test1");
EXPECT_EQ(p1.at(0), "test1");
EXPECT_EQ(p1.at(1), "test");
}

View File

@ -288,16 +288,6 @@ void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outpu
stack[0] = from(res);
}
bool my_is_cpu(Tensor t) {
return t.is_cpu();
}
void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_is_cpu(to<Tensor>(stack[0]));
stack[0] = from(res);
}
Tensor fill_infinity(Tensor t) {
auto value = std::numeric_limits<float>::infinity();
return fill_(t, value);
@ -354,7 +344,6 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_transpose", &boxed_my_transpose);
m.impl("my_empty_like", &boxed_empty_like);
m.impl("fill_infinity", &boxed_fill_infinity);
m.impl("my_is_cpu", &boxed_my_is_cpu);
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
@ -373,8 +362,6 @@ void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
m.def("my_is_cpu(Tensor t) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {

View File

@ -51,19 +51,6 @@ def my_abs(t) -> Tensor:
return torch.ops.libtorch_agnostic.my_abs.default(t)
def my_is_cpu(t) -> bool:
"""
Returns is_cpu on the input tensor.
Args:
t: any Tensor
Returns:
a bool
"""
return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
def my_ones_like(tensor, device) -> Tensor:
"""
Returns a new Tensor like the input tensor, but with all ones

View File

@ -209,13 +209,6 @@ if not IS_WINDOWS:
self.assertEqual(id(out), id(t))
self.assertEqual(out, torch.zeros_like(t))
def test_my_is_cpu(self, device):
import libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_is_cpu(t)
self.assertEqual(out, t.is_cpu)
def test_fill_infinity(self, device):
import libtorch_agnostic

Some files were not shown because too many files have changed in this diff.