Compare commits

...

147 Commits

Author SHA1 Message Date
41852cdbf3 add old way 2025-09-11 16:43:31 -04:00
3ad3bfe11d added example for torch.is_storage (#162614)
Fixes #162613
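For reference, a quick hedged sketch of the `torch.is_storage` behavior that the new docs example covers (not necessarily the exact snippet added in the PR):

```python
import torch

t = torch.empty(3)
print(torch.is_storage(t))                    # False: a Tensor is not a storage
print(torch.is_storage(t.untyped_storage()))  # True: an UntypedStorage object
```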

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162614
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-11 20:25:26 +00:00
1c6dfbe557 Revert "[inductor] FlexibleLayout for ExternKernelChoice for mms (#161351)"
This reverts commit f08487aa8692751c36e608e338204490b0955583.

Reverted https://github.com/pytorch/pytorch/pull/161351 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))
2025-09-11 20:24:15 +00:00
934f878883 Revert "[inductor] leverage template stacking in V.choices.get_mm_configs (#161350)"
This reverts commit 623e623c821f639559248e9acd6084311c8fd3d5.

Reverted https://github.com/pytorch/pytorch/pull/161350 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))
2025-09-11 20:24:15 +00:00
cef05b1202 Revert "[inductor][choices] rename get_mm_configs to get_template_configs (#162293)"
This reverts commit 30191fcf03ddd6a09381a490096c4bb721874316.

Reverted https://github.com/pytorch/pytorch/pull/162293 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))
2025-09-11 20:24:15 +00:00
b500c166ef [FlexAttention][Easy] turn off TMA when cannot use it (#162569)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162569
Approved by: https://github.com/drisspg
2025-09-11 19:51:19 +00:00
d65ffdef3d [ROCm] fix miopen batchnorm changing output format (#162112)
It was found that the integration of miopen batchnorm was causing the output to always be in the default contiguous memory format even when the input was channels last.  This also unskips a number of related unit tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162112
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com>
Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2025-09-11 19:37:48 +00:00
ac72f81c12 [dynamic shapes] unbacked-safe should_swap (#160473)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160473
Approved by: https://github.com/laithsakka
2025-09-11 18:51:25 +00:00
9cac1b9259 AMD CPU CI - Add freezing + fix label trigger (#162176)
Added the following changes:

1. Added freezing by default for AMD CPU based CI
2. Fixed issue with label based CI triggers

Addresses code review comment in https://github.com/pytorch/pytorch/pull/161155

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162176
Approved by: https://github.com/malfet, https://github.com/jeffdaily
2025-09-11 18:41:29 +00:00
9bc648235d [MPS] mps sparse mul op implementation (#162349)
Implements the MPS sparse mul operation and enables other operations such as the following (a usage sketch follows the list):
1. copy_
2. div
3. sum
4. floor
5. power
6. sub
7. floor_divide
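A hedged usage sketch of the newly implemented path (assumes an Apple-silicon build with MPS available; shapes are illustrative):

```python
import torch

if torch.backends.mps.is_available():
    a = torch.randn(4, 4, device="mps").to_sparse()
    b = torch.randn(4, 4, device="mps").to_sparse()
    print((a * b).to_dense())  # sparse mul now implemented on MPS
```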

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162349
Approved by: https://github.com/pearu, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-11 18:36:24 +00:00
799471d92b [triton] Update 3.5 pin (AMD compilation fix + warp spec) (#162733)
Fixes #162390

Also adds warp spec (thanks @manman-ren!)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162733
Approved by: https://github.com/atalman
2025-09-11 18:19:16 +00:00
43d9b5ecaa [ONNX] Set fallback=False by default (#162726)
This change addresses confusing error messages users encounter when using the ONNX exporter with default settings. Previously, `fallback=True` was the default, which would attempt to fall back to the TorchScript exporter when the dynamo path failed, leading to mixed error messages that obscured the actual issues.

## Problem

When `fallback=True` by default:
- Users get confusing error messages mixing dynamo and TorchScript export failures
- Error messages tell users to provide the `f` argument unnecessarily
- Dynamo error messages get flushed with TorchScript errors when both paths fail
- Users expecting the dynamo path get unexpected fallback behavior

## Solution

Changed the default from `fallback=True` to `fallback=False` in both:
- `torch.onnx.export()` function
- `torch.onnx._internal.exporter._compat.export_compat()` function

## Impact

**Before:**
```python
# Would fallback to TorchScript on dynamo failure, causing mixed error messages
torch.onnx.export(model, args)
```

**After:**
```python
# Clean dynamo-only errors by default
torch.onnx.export(model, args)

# Advanced users can still opt-in to fallback behavior
torch.onnx.export(model, args, fallback=True)
```

Fixes #162697

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162726
Approved by: https://github.com/titaiwangms, https://github.com/xadupre
2025-09-11 18:09:58 +00:00
463fbc8ca0 Support vmap + custom autograd function/improve DTensor constructor inefficiency (#162240)
This makes gemma3 exportable on transformers=4.55.4

In HF, there is a torch function mode called TransformGetItemToIndex which internally calls a custom autograd function. When this custom autograd function is called under vmap, it triggers CustomFunctionHigherOrderOP, which errored because there was no pre-dispatch proxy mode implementation.
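For illustration, a minimal sketch of the general pattern being described (a custom autograd function invoked under `vmap`), not the HF `TransformGetItemToIndex` mode itself:

```python
import torch
from torch.func import vmap

class MyScale(torch.autograd.Function):
    generate_vmap_rule = True  # ask autograd to derive the vmap rule

    @staticmethod
    def forward(x):
        return x * 2

    @staticmethod
    def setup_context(ctx, inputs, output):
        pass  # nothing to save for this toy example

    @staticmethod
    def backward(ctx, grad_out):
        return grad_out * 2

out = vmap(MyScale.apply)(torch.randn(4, 3))  # batched call into the custom function
```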

Since there have been a number of requests lately to add various operators to pre-dispatch IR, I introduce a decorator in export that works similarly to `allow_in_graph`. Basically:
1) We intercept custom_autograd_function.apply at pre-dispatch mode when this decorator is applied
2) We apply the `flat_apply` HOP to hide the pytree spec for this autograd function. Note that this adds the restriction that this custom autograd function needs to take in fx-able types.
3) The subclass constructor decorator is implemented similarly, so we refactor it to share the implementation with this new decorator. Eventually we should delete the subclass constructor decorator.
4) Move some code in the subclass constructor decorator to exit early in non-export environments, which should shave off some inefficiency (around 1% according to @swolchok 's benchmark)

Fixes: https://github.com/pytorch/pytorch/issues/161563#issuecomment-3246309758

Differential Revision: [D82141316](https://our.internmc.facebook.com/intern/diff/D82141316)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162240
Approved by: https://github.com/ydwu4
2025-09-11 17:42:41 +00:00
2f53395943 [ez][CI] Fix docs push in nightly workflow (#162657)
HUD metrics page says docs push hasn't happened in 21 days
<img width="293" height="142" alt="image" src="https://github.com/user-attachments/assets/f930aab8-0503-4bf2-b962-8c375dec6b78" />

I guess main branch docs just haven't been updated?  Did anyone notice?  Do we care?

Either way I think this should fix it

Likely started after https://github.com/pytorch/pytorch/pull/161182
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162657
Approved by: https://github.com/huydhn
2025-09-11 16:45:41 +00:00
fccddf02b6 repro 161902 (#162416)
Summary:
Sometimes `ShapeEnv.create_symbol` can return a `sympy.Integer`. This messes up our phantom symbol infra for derived dims.

Fixes #161902

Test Plan:
added test based on repro

Rollback Plan:

Differential Revision: D81960709

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162416
Approved by: https://github.com/tugsbayasgalan
2025-09-11 16:35:23 +00:00
8be8b94793 Update SECURITY.md with reporting guidelines (#162608)
Added clarification that all reports will be disclosed within 90 days

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162608
Approved by: https://github.com/seemethere, https://github.com/albanD
2025-09-11 16:30:29 +00:00
suo
fe8cc619b8 [torch][c10d] fix split_group in mixed backend case (#162424)
Today we can initialize a mixed-backend process group (e.g. "cpu:gloo,cuda:nccl") but we can only pass one set of process group options.

However, when we call `split_group`, we retrieve that set of options from the parent PG and pass it to the ProcessGroup::groupSplit C++ API, which then attempts to propagate that set of options to all backends.

This leads to an assertion failure in some user code, where ProcessGroupGloo::split expects gloo options but receives nccl options instead.

Arguably the APIs as currently designed are just broken; we should not ever expect a single set of backend options to apply across multiple backends. However, fixing this would require changing quite a few public APIs.

As a quick fix, since user-provided options really only exist for NCCL, just warn and fall back to default options for Gloo if non-gloo options are detected.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162424
Approved by: https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/H-Huang
2025-09-11 16:29:32 +00:00
2f5a24c2a2 Smoke tests don't run nvshmem on Windows (#162646)
Only available for Linux x86 and aarch64:
https://pypi.org/project/nvidia-nvshmem-cu13/#files

nvshmem is available only on linux:
```
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
```
https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py#L57
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162646
Approved by: https://github.com/kwen2501
2025-09-11 16:09:20 +00:00
24492cbab2 [BE] Cleanup stale comments/copy from gemm (#162001)
Followup after https://github.com/pytorch/pytorch/pull/154012

Since the introduction of `gemm_no_downcast_stub`, it's no longer necessary to allocate a temporary array and then manually implement the `beta` logic in the codebase.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162001
Approved by: https://github.com/drisspg
ghstack dependencies: #161999
2025-09-11 15:48:43 +00:00
3f6d88f04c paths to exclude shape guards (#162684)
Summary: Easier to land than https://www.internalfb.com/diff/D82030581

Test Plan:
everything blamed by https://www.internalfb.com/diff/D80713603 (except some old exir tests)

Rollback Plan:

Differential Revision: D82180349

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162684
Approved by: https://github.com/tugsbayasgalan
2025-09-11 15:34:06 +00:00
94db2ad51d Revert "Move prioritized text linker optimization code from setup.py to cmake (#160078)"
This reverts commit 26b3ae58908becbb03b28636f7384d2972a8c9a5.

Reverted https://github.com/pytorch/pytorch/pull/160078 on behalf of https://github.com/atalman due to Sorry reverting this broke linux aarch64 CUDA nightlies [pytorch/pytorch/actions/runs/17637486681/job/50146967503](https://github.com/pytorch/pytorch/actions/runs/17637486681/job/50146967503) ([comment](https://github.com/pytorch/pytorch/pull/160078#issuecomment-3281426631))
2025-09-11 15:29:29 +00:00
9f783e172d Revert "Build and Install Arm Compute Library in manylinux docker image (#159737)"
This reverts commit 582d278983b28a91ac0cedd035183f2495bb6887.

Reverted https://github.com/pytorch/pytorch/pull/159737 on behalf of https://github.com/atalman due to Sorry reverting this broke linux aarch64 CUDA nightlies [pytorch/pytorch/actions/runs/17637486681/job/50146967503](https://github.com/pytorch/pytorch/actions/runs/17637486681/job/50146967503) ([comment](https://github.com/pytorch/pytorch/pull/159737#issuecomment-3281398272))
2025-09-11 15:25:24 +00:00
a8432bcaad [dynamo][guards] Fail on an unknown framelocals to dict conversion (#162695)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162695
Approved by: https://github.com/williamwen42
ghstack dependencies: #162694
2025-09-11 15:01:00 +00:00
a3a40cb741 [dynamo][guards] Do not consturct framelocals to dict on GlobalsGuardAccessor (#162694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162694
Approved by: https://github.com/williamwen42
2025-09-11 15:01:00 +00:00
c924c675d0 Fix persistent buffer bug (#162190)
For non-persistent buffers, we should properly register them.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162190
Approved by: https://github.com/zhxchen17
2025-09-11 14:56:26 +00:00
c3f30eca9e Remove tests-to-include from rocm-mi300 workflow (#162721)
Accidentally introduced by https://github.com/pytorch/pytorch/pull/162288 (was meant to be a temporary change)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162721
Approved by: https://github.com/jeffdaily
2025-09-11 14:36:07 +00:00
1e710552c1 [ROCm][CI] benchmark must patch fbgemm_gpu with tbb dep (#162649)
fbgemm adds tbb as a dependency only for ROCm, to avoid missing tbb symbols at import. But this was done in setup.py by adding the linker flag to CMAKE_CXX_FLAGS, and it wasn't working for reasons unknown to me. What did work was adding tbb as a dependency in the cmake file.  [We have a PR against upstream fbgemm](https://github.com/pytorch/FBGEMM/pull/4859) for that.  Meanwhile, a much smaller patch is applied here in this PR until the fbgemm ROCm CI commit hash is moved forward to include the tbb patch from upstream.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162649
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-11 14:10:51 +00:00
7c39b2ecbe use torch.accelerator and device_module instead of cuda to make DataParallel more device agnostic. (#162573)
use torch.accelerator and `_get_device_module` instead of cuda to make DataParallel more device agnostic.

Fixes #162152

Recently, I've done some work to support my own privateuse1 backend in the DataParallel module, but I found that some cuda-related APIs exist in parallel_apply.py, which forced me to monkey-patch the DataParallel module to support DP on my own backend.

So I made some small changes to replace cuda.xxx with accelerator.xxx and to acquire the device module via `_get_device_module`.
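A rough sketch of the device-agnostic pattern, assuming a recent PyTorch with `torch.accelerator`; the exact helpers used in the PR (e.g. `_get_device_module`) may differ:

```python
import torch

acc = torch.accelerator.current_accelerator()  # e.g. torch.device("cuda"), or None
if acc is not None:
    device_mod = getattr(torch, acc.type)      # e.g. torch.cuda, torch.xpu, ...
    print(acc, device_mod.device_count())
```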

This is my first time contributing to pytorch; please let me know if there is any problem with the change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162573
Approved by: https://github.com/ezyang, https://github.com/guangyey

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
Co-authored-by: Edward Z. Yang <ezyang@mit.edu>
2025-09-11 10:04:27 +00:00
afdd4247a2 [torchao][pt2e] Make prepare and convert faster by caching (#162550)
Summary: D79674759 tried to fix the expensive prepare and convert steps, as `assert_and_get_unique_device` was called multiple times. This change fixes that issue by using the `functools.cache` decorator.
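A minimal sketch of the caching pattern described, with a hypothetical stand-in for `assert_and_get_unique_device` (not the exact torchao code):

```python
import functools
import torch

@functools.cache  # memoize repeated calls with the same module instance
def get_unique_device(module: torch.nn.Module):
    devices = {p.device for p in module.parameters()} | {b.device for b in module.buffers()}
    assert len(devices) <= 1, f"expected at most one device, got {devices}"
    return next(iter(devices), None)
```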

Test Plan:
Verified on llm export to QNN.
LLM Quantization prepare time of ~20min reduced to ~3min.

Rollback Plan:

Differential Revision: D82073679

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162550
Approved by: https://github.com/andrewor14
2025-09-11 07:59:22 +00:00
22df9332da [serialization] Add pte file to archive (#162520)
Summary:
Add _package_executorch_files to the archive APIs. This allows us to package a PTE file into the archive.

I don't think there's a use-case to have more than one PTE file at the moment, but left it as `EXECUTORCH_FILES` just in case.

Test Plan:
Tested in D81992612

Rollback Plan:

Differential Revision: D81977483

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162520
Approved by: https://github.com/angelayi
2025-09-11 07:59:11 +00:00
6b9b7ce6fe fix torch.sparse.log_softmax on CPU (#161959)
Fix https://github.com/pytorch/pytorch/issues/152293.

**Example:**
```
import torch
from torch.sparse import log_softmax as sparse_log_softmax

def test_bug():
    a = torch.rand(4, 3)
    b = a - 10000000.0
    b_sparse = b.to_sparse()

    cpu_out_sparse = sparse_log_softmax(b_sparse, dim=1).to_dense()
    print('cpu_out_sparse =', cpu_out_sparse)

    b_sparse_double = b.double().to_sparse()
    cpu_out_sparse_double = sparse_log_softmax(b_sparse_double, dim=1).to_dense()
    print('cpu_out_sparse_double =', cpu_out_sparse_double)

if __name__ == '__main__':
    test_bug()
```

**Output:**

- before
```
cpu_out_sparse = tensor([[-2., -1., -2.],
        [-1., -1., -1.],
        [-1., -2., -2.],
        [-1., -1., -2.]])
cpu_out_sparse_double = tensor([[-1.5514, -0.5514, -1.5514],
        [-1.0986, -1.0986, -1.0986],
        [-0.5514, -1.5514, -1.5514],
        [-0.8620, -0.8620, -1.8620]], dtype=torch.float64)
```

- after
```
cpu_out_sparse = tensor([[-0.8620, -1.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986],
        [-1.8620, -0.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986]])
cpu_out_sparse_double = tensor([[-0.8620, -1.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986],
        [-1.8620, -0.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986]], dtype=torch.float64)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161959
Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/mingfeima
2025-09-11 07:52:05 +00:00
1274297e06 Remove __torch_dispatch__ check in THPVariable_make_dtensor (#162337)
We control DTensor, so we can just guarantee there isn't a programming error with __torch_dispatch__. (The guard is already less-than-perfect; see the note that the deleted comment refers to.)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162337
Approved by: https://github.com/Skylion007
ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220, #162218, #161596
2025-09-11 06:58:35 +00:00
f68f76d8c7 Remove logger.debug statements in DTensor dispatch (#161596)
These seem to have been costing us 5-10 usec per detach (out of ~95 usec total).  If they need to ship, let's talk about requirements and how we can make this more efficient, given that we would prefer an entire DTensor op to finish in 10 usec.

Differential Revision: [D81530106](https://our.internmc.facebook.com/intern/diff/D81530106)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161596
Approved by: https://github.com/ezyang, https://github.com/Skylion007
ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220, #162218
2025-09-11 06:58:35 +00:00
fa1d409e83 [2/N]Port several test files under test/distributed to Intel GPU (#159473)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR works on some test files under test/distributed. We enable Intel GPU with the following methods, trying our best to keep the original code style:

- instantiate_device_type_tests()
- use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- use requires_accelerator_dist_backend to allow both nccl and xccl test
- enabled XPU for some test path
- Change the hardcoded world_size according to device_count.
- Unify some common code under torch/testing/_internal for multiple backend, for example:
  Added xpu for Backend.backend_capability and dist.Backend.register_backend()

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159473
Approved by: https://github.com/guangyey, https://github.com/d4l3k
2025-09-11 06:44:26 +00:00
52d4660ae9 [AOTI] Fix Windows fail to zip opened file. (#162617)
Original issue:
<img width="1767" height="544" alt="Image" src="https://github.com/user-attachments/assets/9de90d50-217f-4049-8f19-77ff1660c8b0" />

reproducer:
```cmd
pytest test\inductor\test_aot_inductor.py -v -k test_weight_on_disk_legacy_cpu
```

Fixed list:
1. `WritableTempFile`'s `__exit__` function automatically unlinks the opened file; on Windows this raises an error while the file is still open, so ignore it there.
2. When opening the zip file, the operation fails if the file is already open. Switch to `_wfsopen` with the shared-access flag, which can open the file with shared access.

Local test passed:
<img width="1101" height="233" alt="image" src="https://github.com/user-attachments/assets/935cbf2e-52db-41f1-80fa-617569b92a96" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162617
Approved by: https://github.com/jansel
2025-09-11 06:22:21 +00:00
7345454e2e compile_kernel: Handle python floats as c double (#162626)
This was an open todo in the code and probably a footgun in waiting

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162626
Approved by: https://github.com/malfet
2025-09-11 06:03:25 +00:00
23170dfebc Revert "Move inductor jobs 3.9->3.10 (#162323)"
This reverts commit 0663bdb12383b9717af49d58aed9d88de0dd0ecc.

Reverted https://github.com/pytorch/pytorch/pull/162323 on behalf of https://github.com/huydhn due to Not sure what had happened, but some inductor unit tests start failing after this lands ([comment](https://github.com/pytorch/pytorch/pull/162323#issuecomment-3278125192))
2025-09-11 05:57:13 +00:00
12e993f533 compile_kernel large shared memory fix (#162647)
Alternate solution to https://github.com/pytorch/pytorch/pull/162328

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162647
Approved by: https://github.com/eqy
2025-09-11 05:52:46 +00:00
07d2531672 [vllm hash update] update the pinned vllm hash (#162551)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162551
Approved by: https://github.com/pytorchbot
2025-09-11 04:56:04 +00:00
6944d4b639 [ROCm] rocblas Aten GEMM overload for FP32 output from FP16/BF16 inputs (#162600)
Fix ROCm GEMM helper to set output type (C/D) based on C_Dtype template parameter.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162600
Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony
2025-09-11 03:34:07 +00:00
f654cff566 [inductor] Add shape to load_input in matmul templates (#162513)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162513
Approved by: https://github.com/eellison
ghstack dependencies: #162426
2025-09-11 01:51:15 +00:00
f17c5e0789 [inductor] Add shape for store_output in matmul templates (#162426)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162426
Approved by: https://github.com/eellison
2025-09-11 01:51:15 +00:00
435c18fb4a [DTensor] add op support for aten.unbind.int (#162560)
As titled.

It seems unbind returns views of the original tensor. E.g. see https://stackoverflow.com/questions/78910951/does-unbind-return-the-views-of-tensors-in-pytorch

So we error out when `shard_dim == unbind_dim`. This is similar to why we error out in view ops.
https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/_ops/_view_ops.py#L544-L546
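A hedged usage sketch of the new op support, assuming a 2-rank mesh (needs a distributed launch, e.g. `torchrun --nproc-per-node=2`):

```python
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Shard

mesh = init_device_mesh("cuda", (2,))
dt = distribute_tensor(torch.randn(4, 8), mesh, [Shard(0)])  # sharded on dim 0
parts = dt.unbind(dim=1)   # OK: unbind dim differs from the shard dim
# dt.unbind(dim=0)         # errors: unbinding along the sharded dim would return views
```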

This PR also refactors some other tensor ops code by creating two util functions, `shift_shard_dims_after_insert` and `shift_shard_dims_after_remove`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162560
Approved by: https://github.com/zpcore
2025-09-11 00:58:23 +00:00
612cdc8f48 -ldl for nativert tests (#162643)
Fixes #162640

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162643
Approved by: https://github.com/yiming0416, https://github.com/robert-hardwick
2025-09-11 00:35:57 +00:00
da5069f289 Don't include cuh header when USE_NVSHMEM is off (#162635)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162635
Approved by: https://github.com/kwen2501
2025-09-11 00:24:50 +00:00
4fd2a2b273 Add cuda headers automatically for compile_kernel (#162634)
The issue was pointed out before by @ngimel and more recently by @gau-nernst in https://gau-nernst.github.io/nvrtc-matmul/#missing-cuda-and-c-headers-

The benefit is that we can now add `#include <cuda_fp16.h>` without crapping out.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162634
Approved by: https://github.com/ngimel
2025-09-11 00:20:33 +00:00
bb1d53bc47 [CD] CUDA 13 specific followup changes (#162455)
Follow up for CUDA 13 bring up https://github.com/pytorch/pytorch/issues/159779
sm50-70 should not be added to the sbsa build arch list, as these earlier archs had no Arm support.
Remove platform_machine from PYTORCH_EXTRA_INSTALL_REQUIREMENTS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162455
Approved by: https://github.com/atalman
2025-09-11 00:03:47 +00:00
36338fc7f2 Relax fences for intrusive ptr's refcnt (#162072)
Summary: Relax fences for intrusive ptr's refcnt dec op for performance testing.

lock needs acquire ordering when the op succeeds and relaxed ordering when it does not. In addition, the expire call and the following refcnt reads were merged to remove one extra read.

incref does not need any fences because the caller should already have a valid reference. use_count follows the same reasoning.

decref only needs a release fence to make sure every write op prior to it has finished. When the refcnt goes to zero, there should be an acquire fence to make sure no read op reads stale data before the object is destructed. However, a microbenchmark showed that the optimal fence for decref does not perform noticeably better than the current decref with acq-rel, so we keep decref as-is.

This change should have no material impact on x86, but for Arm64 (and other CPUs with weak memory models), it should boost performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162072
Approved by: https://github.com/swolchok, https://github.com/yfeldblum
2025-09-10 23:17:01 +00:00
e0c910149c Build fbgemm_gpu for TORCH_CUDA_ARCH_LIST=10.0 and CUDA 12.8 and 12.9 (#162544)
## Summary
- pytorch is not built for *a variants of SM architectures, due to non-portability. However, we need fbgemm_gpu kernels built for sm100a (see #162209)

## Changes
- **Setting USE_FBGEMM_GENAI for CUDA builds**: fbgemm_gpu builds for sm100a if using CUDA 12.8 or 12.9 ([source](2033a0a08f/.github/scripts/nova_dir.bash (L29-L32))), so I follow the same rule here.
- **Extra nvcc flags**: if USE_FBGEMM_GENAI and USE_CUDA are set, we add extra nvcc flags for sm100a

## Test plan

Test build:
```
echo $CUDA_HOME
/usr/local/cuda-12.9

export TORCH_CUDA_ARCH_LIST=10.0
python -m pip install --no-build-isolation -v -e .
```

Check build logs:
```
  CMake Warning at CMakeLists.txt:901 (message):
    Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a
```

Run unit tests:
- `pytest test/test_matmul_cuda.py  -k test_mxfp8_scaled_grouped_mm`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162544
Approved by: https://github.com/drisspg
2025-09-10 22:59:41 +00:00
f4aeceaa9d Use upper bound for persistent rblock (#162441)
Previously, we were using 128 and increasing to the upper bound. We should instead start at the upper bound and round up to the next power of 2.
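For reference, the "round up to the next power of 2" step mentioned above as a standalone sketch (not the inductor code itself):

```python
def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n (n >= 1), e.g. 48 -> 64, 128 -> 128.
    return 1 << (n - 1).bit_length()
```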

Differential Revision: [D81984103](https://our.internmc.facebook.com/intern/diff/D81984103)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162441
Approved by: https://github.com/PaulZhang12
2025-09-10 22:29:02 +00:00
d8e6b2fddc [Cutlass] Add exp and sigmoid activations (#162536)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162536
Approved by: https://github.com/henrylhtsang, https://github.com/eellison
ghstack dependencies: #162535
2025-09-10 21:44:26 +00:00
31c25c7d01 [Cutlass] Add tanh activation and test case for activations (#162535)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162535
Approved by: https://github.com/henrylhtsang
2025-09-10 21:44:26 +00:00
eqy
5dbee5691c [cuDNN][Convolution][TF32][64bit] Add tf32_on_and_off decorator to conv3d 64bit test (#161004)
cuDNN has new generated kernels that can use TF32.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161004
Approved by: https://github.com/janeyx99, https://github.com/Skylion007
2025-09-10 21:39:35 +00:00
864ffe12d7 Fix some edge cases (#162295)
```
Summary
🔝 Top 5 Performance Differences (by absolute %):
shape: (5, 7)
┌────────────────┬────────────────┬─────────────────────────────┬───────────────────┬──────────────────────┬───────────────────────────┬───────────┐
│ attn_type      ┆ dtype          ┆ shape(B,Hq,M,Hkv,N,D)       ┆ TFlops BWD (base) ┆ TFlops BWD (no_peel) ┆ no_peel_speedup_over_base ┆ pct_delta │
│ ---            ┆ ---            ┆ ---                         ┆ ---               ┆ ---                  ┆ ---                       ┆ ---       │
│ str            ┆ str            ┆ str                         ┆ f64               ┆ f64                  ┆ f64                       ┆ f64       │
╞════════════════╪════════════════╪═════════════════════════════╪═══════════════════╪══════════════════════╪═══════════════════════════╪═══════════╡
│ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64)  ┆ 56.937931         ┆ 58.960459            ┆ 1.035522                  ┆ 3.552163  │
│ noop           ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 128) ┆ 89.221306         ┆ 86.295642            ┆ 0.967209                  ┆ -3.27911  │
│ causal         ┆ torch.bfloat16 ┆ (2, 16, 4096, 4, 4096, 128) ┆ 111.552594        ┆ 114.380841           ┆ 1.025353                  ┆ 2.535349  │
│ alibi          ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, 1024, 64) ┆ 74.830149         ┆ 76.685445            ┆ 1.024793                  ┆ 2.479344  │
│ alibi          ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64)  ┆ 55.279932         ┆ 56.369312            ┆ 1.019707                  ┆ 1.97066   │
└────────────────┴────────────────┴─────────────────────────────┴───────────────────┴──────────────────────┴───────────────────────────┴───────────┘

🔺 Top 5 Cases Where no_peel (change) is Faster than base (baseline):
shape: (5, 7)
┌────────────────┬────────────────┬─────────────────────────────┬───────────────────┬──────────────────────┬───────────────────────────┬───────────┐
│ attn_type      ┆ dtype          ┆ shape(B,Hq,M,Hkv,N,D)       ┆ TFlops BWD (base) ┆ TFlops BWD (no_peel) ┆ no_peel_speedup_over_base ┆ pct_delta │
│ ---            ┆ ---            ┆ ---                         ┆ ---               ┆ ---                  ┆ ---                       ┆ ---       │
│ str            ┆ str            ┆ str                         ┆ f64               ┆ f64                  ┆ f64                       ┆ f64       │
╞════════════════╪════════════════╪═════════════════════════════╪═══════════════════╪══════════════════════╪═══════════════════════════╪═══════════╡
│ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64)  ┆ 56.937931         ┆ 58.960459            ┆ 1.035522                  ┆ 3.552163  │
│ causal         ┆ torch.bfloat16 ┆ (2, 16, 4096, 4, 4096, 128) ┆ 111.552594        ┆ 114.380841           ┆ 1.025353                  ┆ 2.535349  │
│ alibi          ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, 1024, 64) ┆ 74.830149         ┆ 76.685445            ┆ 1.024793                  ┆ 2.479344  │
│ alibi          ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64)  ┆ 55.279932         ┆ 56.369312            ┆ 1.019707                  ┆ 1.97066   │
│ causal         ┆ torch.bfloat16 ┆ (4, 16, 4096, 4, 4096, 64)  ┆ 111.08814         ┆ 112.447047           ┆ 1.012233                  ┆ 1.22327   │
└────────────────┴────────────────┴─────────────────────────────┴───────────────────┴──────────────────────┴───────────────────────────┴───────────┘

🔻 Top 5 Cases Where no_peel (change) is Slower than base (baseline):
shape: (5, 7)
┌────────────────┬────────────────┬─────────────────────────────┬───────────────────┬──────────────────────┬───────────────────────────┬───────────┐
│ attn_type      ┆ dtype          ┆ shape(B,Hq,M,Hkv,N,D)       ┆ TFlops BWD (base) ┆ TFlops BWD (no_peel) ┆ no_peel_speedup_over_base ┆ pct_delta │
│ ---            ┆ ---            ┆ ---                         ┆ ---               ┆ ---                  ┆ ---                       ┆ ---       │
│ str            ┆ str            ┆ str                         ┆ f64               ┆ f64                  ┆ f64                       ┆ f64       │
╞════════════════╪════════════════╪═════════════════════════════╪═══════════════════╪══════════════════════╪═══════════════════════════╪═══════════╡
│ noop           ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 128) ┆ 89.221306         ┆ 86.295642            ┆ 0.967209                  ┆ -3.27911  │
│ causal         ┆ torch.bfloat16 ┆ (4, 16, 1024, 4, 1024, 64)  ┆ 78.23082          ┆ 76.693169            ┆ 0.980345                  ┆ -1.965531 │
│ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 2048, 4, 2048, 128) ┆ 96.95663          ┆ 95.573333            ┆ 0.985733                  ┆ -1.426717 │
│ alibi          ┆ torch.bfloat16 ┆ (4, 16, 2048, 4, 2048, 64)  ┆ 93.373473         ┆ 92.294147            ┆ 0.988441                  ┆ -1.155924 │
│ alibi          ┆ torch.bfloat16 ┆ (2, 16, 2048, 4, 2048, 128) ┆ 96.95147          ┆ 96.105389            ┆ 0.991273                  ┆ -0.872685 │
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162295
Approved by: https://github.com/mlazos, https://github.com/v0i0
2025-09-10 21:33:45 +00:00
4e35594674 [Lowering] Fix the edge case of empty subgraph split due to dataclass node (#161716)
Summary: Fix the edge case by allowing `call_function` nodes with no deps as graph entry (starter_nodes) in the splitter.

Test Plan:
The test shall pass in the current diff (after fix), and fail in the parent diff (before fix)

```
buck test mode/opt //glow/fb/fx/lowering:split_tests -- test_dataclass_as_graph_entry
```

Rollback Plan:

Differential Revision: D81232435

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161716
Approved by: https://github.com/ezyang
2025-09-10 21:23:42 +00:00
35d7b32159 Improve device info with new flops and bandwidth formula based on hardware libraries (#162245)
Previously, DeviceInfo provided theoretical hardware information based on a hardcoded list manually created from various datasheets.

This update:
- Attempts to gather the information from a hardware library like `pynvml`, improving accuracy and expanding support to devices that don't have entries in the datasheet list (see the sketch below).
- Adjusts the flops and bandwidth calculation based on these hardware values. For example, if the memory or SMs are underclocked, it adjusts the theoretical max flops/bw accordingly.
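A rough sketch of querying clocks via `pynvml`, as an example of the kind of hardware-library lookup described (assumes `pynvml` is installed and an NVIDIA GPU is present; not the exact DeviceInfo code):

```python
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
name = pynvml.nvmlDeviceGetName(handle)
sm_clock_mhz = pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM)
mem_clock_mhz = pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_MEM)
print(name, sm_clock_mhz, mem_clock_mhz)
pynvml.nvmlShutdown()
```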

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162245
Approved by: https://github.com/v0i0, https://github.com/shunting314
2025-09-10 21:19:13 +00:00
0663bdb123 Move inductor jobs 3.9->3.10 (#162323)
Related to: https://github.com/pytorch/pytorch/issues/161167

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162323
Approved by: https://github.com/huydhn, https://github.com/Skylion007
2025-09-10 20:58:41 +00:00
40ea6e418a Revert "Fix decorators skipping NCCL tests (#158846)"
This reverts commit c2388201fc85b0748173212de5a17514c7a71f21.

Reverted https://github.com/pytorch/pytorch/pull/158846 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing some inductor tests ([comment](https://github.com/pytorch/pytorch/pull/158846#issuecomment-3276471387))
2025-09-10 20:51:31 +00:00
348303ebd2 [ez] add docstring/typing for codegen_kernel_benchmark (#162609)
```
lintrunner init && lintrunner -m origin/main
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162609
Approved by: https://github.com/coconutruben
ghstack dependencies: #162442
2025-09-10 20:49:38 +00:00
94755e81c4 [inductor] Enable combo kernels with unbacked inputs (#162442)
An internal user tried enabling combo kernels but ran into "Cannot convert symbols to int". This PR enables combo kernels on inputs with data-dependent shapes.

### Example exception

```
  File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton.py", line 4997, in benchmark_combo_kernel
    kernel_code_list = self.generate_combo_kernel_code(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/simd.py", line 1849, in generate_combo_kernel_code
    src_code = kernel.codegen_kernel()
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton_combo_kernel.py", line 802, in codegen_kernel
    code.splice(self.codegen_kernel_benchmark(num_gb=0))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton_combo_kernel.py", line 852, in codegen_kernel_benchmark
    var_names.extend(self.kernel_benchmark_extra_args())
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton_combo_kernel.py", line 733, in kernel_benchmark_extra_args
    extra_args.append(str(V.graph.sizevars.size_hint(tree.numel)))
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/colinpeppler/pytorch/torch/_inductor/sizevars.py", line 584, in size_hint
    return int(out)
           ^^^^^^^^
  File "/home/colinpeppler/.conda/envs/pytorch/lib/python3.12/site-packages/sympy/core/expr.py", line 307, in __int__
    raise TypeError("Cannot convert symbols to int")
torch._inductor.exc.InductorError: TypeError: Cannot convert symbols to int
```

Differential Revision: [D82042230](https://our.internmc.facebook.com/intern/diff/D82042230)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162442
Approved by: https://github.com/jansel
2025-09-10 20:49:38 +00:00
6d65737aee testing infra and some fixes (#162183)
This PR is quite large in that it covers most of the rough edges in the new strict export flow:

1. Handle nn_module_stack correctly now that we are tracing the wrapper module
2. module_call_spec needs to be queried from the source directly because we are not running the bytecode anymore.
3. Correct input and output handling.

@diff-train-skip-merge

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162183
Approved by: https://github.com/zhxchen17
2025-09-10 20:48:12 +00:00
053251b98d Revert "Make functorch notebook symlinks PEP 517 valid (#157813)"
This reverts commit b494547f0bd6cb1ce5d8d104cb419802434c9c08.

Reverted https://github.com/pytorch/pytorch/pull/157813 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but this surfaces a weird discrepancy between GitHub and Mercurial used internally ([comment](https://github.com/pytorch/pytorch/pull/157813#issuecomment-3276442242))
2025-09-10 20:45:48 +00:00
7e2e83cdbe [ONNX] Update export docstring (#162622)
Update export docstring to reflect the latest configuration.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162622
Approved by: https://github.com/titaiwangms
2025-09-10 20:29:46 +00:00
d033d11d26 Revert "[torch][c10d] fix split_group in mixed backend case (#162424)"
This reverts commit 2dc26131801a430e030a773c4fbfe874e263259d.

Reverted https://github.com/pytorch/pytorch/pull/162424 on behalf of https://github.com/clee2000 due to failure seems related, maybe a hang/timeout distributed/test_distributed_spawn.py::TestDistBackendWithSpawn::test_ddp_model_diff_shape_across_ranks log classifier is pointing at the wrong line ([comment](https://github.com/pytorch/pytorch/pull/162424#issuecomment-3276360494))
2025-09-10 20:13:44 +00:00
80d4da893c Revert "Put torchao (0.13.0) back to benchmark workflow (#162227)"
This reverts commit 00985970e312c3c5e674e8e14d39fe77c226600e.

Reverted https://github.com/pytorch/pytorch/pull/162227 on behalf of https://github.com/huydhn due to Crashing some inductor jobs in trunk ([comment](https://github.com/pytorch/pytorch/pull/162227#issuecomment-3276355034))
2025-09-10 20:11:37 +00:00
bf7f481144 Update misleading torch.sparse_coo_tensor error check (#161900)
Fixes #160622

### Summary
Updated the misleading torch.sparse_coo_tensor error check to provide clear context.
earlier:
`RuntimeError: number of dimensions must be sparse_dim (3) + dense_dim (0), but got 1`

Updated:
`RuntimeError: 'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = 1, sparse_dim = 3, dense_dim = 0`
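A small sketch that trips this check (shapes are illustrative, not taken from the issue):

```python
import torch

indices = torch.zeros((3, 0), dtype=torch.int64)  # 3 sparse dims, 0 nonzeros
values = torch.zeros((0,))                         # dense_dim = 0
# len(size) = 1, but sparse_dim + dense_dim = 3 -> raises the RuntimeError above
torch.sparse_coo_tensor(indices, values, size=(1,))
```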

**Impacts:**

- Comprehensive error message that will improve developer experience.
- module: sparse

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161900
Approved by: https://github.com/nikitaved, https://github.com/pearu
2025-09-10 19:57:11 +00:00
ab0694f1c6 [ROCm][Inductor][CK backend] Install rocm-composable-kernel python package on ROCm Linux CI docker images (#162288)
Reopened from #158747 which got reverted since without setuptools-scm in pytorch index URL the wheel cannot be built

We reconsider the original PR's idea of introducing CK as a pytorch dependency on ROCm Linux and instead install the CK python package in CI only, since (1) rocm-composable-kernel depends on setuptools-scm, which depends on tomli, and the existing index URLs would need to be modified to host the new packages, and (2) there is also a packaging [bug](https://github.com/pypa/setuptools/issues/3269#issuecomment-1254507377) in Ubuntu 22.04 which prevents correct dynamic version calculation with the default system pip.

Extras:

- This PR reconsiders how the TORCHINDUCTOR_CK_DIR env variable is used; previously, this var was used to point to the rocm-composable-kernel package installation path on the filesystem; now, the path is inferred by trying to import ck4inductor.
- The tests are updated to reflect this change.
- Since in CI clang points to a bash script which invokes sccache, we cannot patch PATH to not contain sccache, so this logic is removed from the testing code.
- The scaled_mm test crashes during benchmarking when the benchmarking happens in the main process, and times out when it happens in a subprocess, on gfx942, so it is disabled.

TBD: roll back rocm-mi300 workflow before merging

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162288
Approved by: https://github.com/jeffdaily
2025-09-10 19:33:40 +00:00
5f630d28d7 [dynamo][guards] Do not construct entire framelocals dict for LAMBDA_GUARD (#162525)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162525
Approved by: https://github.com/williamwen42
ghstack dependencies: #162509
2025-09-10 18:52:15 +00:00
a67e798cb7 [dynamo][guards] Prevent framelocals to dict conversion for not required LAMBDA_GUARD (#162509)
This is a smaller PR to reduce framelocals to dict conversion.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162509
Approved by: https://github.com/williamwen42
2025-09-10 18:52:15 +00:00
30191fcf03 [inductor][choices] rename get_mm_configs to get_template_configs (#162293)
# why

- eventually we want all templates to go through this
- we're exposing this through diode as a sort of interface/API
- avoid later renaming

# what

- rename get_mm_configs to get_template_configs
- rename _finalize_mm_configs to _finalize_template_configs

# testing

- lintrunner
- ci

Differential Revision: [D81820641](https://our.internmc.facebook.com/intern/diff/D81820641)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162293
Approved by: https://github.com/eellison
ghstack dependencies: #161351, #161350
2025-09-10 18:47:44 +00:00
623e623c82 [inductor] leverage template stacking in V.choices.get_mm_configs (#161350)
# why

- now everything is in place to just gather templates and run
  the V.choices.get_mm_configs once per op
- enables any overrides inside V.choices.get_mm_configs to
  have a full view of the options for an op, not just for
  one template

# what

- replace multiple calls to V.choices.get_mm_configs with
  calls to gather the active templates, and then using those
  in a single call

# testing

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520571](https://our.internmc.facebook.com/intern/diff/D81520571)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161350
Approved by: https://github.com/eellison, https://github.com/jansel
ghstack dependencies: #161351
2025-09-10 18:47:44 +00:00
f08487aa86 [inductor] FlexibleLayout for ExternKernelChoice for mms (#161351)
# why

- if we only use ExternKernelChoice we're not doing any codegen
- if we're not doing any codegen, we can use a FlexibleLayout
  here, and provide deeper passes more chances to change it

# what

- if all the kernel template choices (KTC) are with a ExternKernelChoice
  template, we switch to a FlexibleLayout before generating the choice
- add a test to make sure that works as intended (FlexibleLayout for
  only extern, and FixedLayout if Triton is involved)

- caveats:
    - because CPP, CUTLASS, and CK are not using
       V.choices.get_mm_configs yet, we turn off the optimization
       if either of those backends are in use. This will be relaxed
       once they support this too
    - because Triton templates are still using their own calls
       (not a single call) to get_mm_configs, it's also turned
       off there. The next diff unifies Triton + ATEN to a single
       call to get_mm_configs and that in turn allows the optimization
       there too

# testing

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520584](https://our.internmc.facebook.com/intern/diff/D81520584)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161351
Approved by: https://github.com/eellison, https://github.com/jansel
2025-09-10 18:47:34 +00:00
1051c7dbc2 Don't unconditionally import torch._dynamo, it's slow (#162595)
A trivial test on OS X.

Before:

```
real	0m6.550s
user	0m2.532s
sys	0m3.359s
```

After:

```
real	0m2.607s
user	0m1.898s
sys	0m3.344s
```

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162595
Approved by: https://github.com/albanD
2025-09-10 17:21:03 +00:00
suo
2dc2613180 [torch][c10d] fix split_group in mixed backend case (#162424)
Today we can initialize a mixed-backend process group (e.g. "cpu:gloo,cuda:nccl") but we can only pass one set of process group options.

However, when we call `split_group`, we retrieve that set of options from the parent PG and pass it to the ProcessGroup::groupSplit C++ API, which then attempts to propagate that set of options to all backends.

This leads to an assertion failure in some user code, where ProcessGroupGloo::split expects gloo options but receives nccl options instead.

Arguably the APIs as currently designed are just broken; we should not ever expect a single set of backend options to apply across multiple backends. However, fixing this would require changing quite a few public APIs.

As a quick fix, since user-provided options really only exist for NCCL, just warn and fall back to default options for Gloo if non-gloo options are detected.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162424
Approved by: https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/H-Huang
2025-09-10 16:59:18 +00:00
582d278983 Build and Install Arm Compute Library in manylinux docker image (#159737)
----

This PR will be part of a series of PR's that aims to remove `.ci/aarch64_linux` folder entirely, such that Aarch64 manylinux build happens as part of `.ci/manywheel/build.sh`, the same as other platforms.

In this PR:

- We prebuild + install Arm Compute Library in the manylinux docker image (at /acl), instead of at build time for every pytorch build.  Also updated the jammy install path to /acl.
- We can therefore remove the build_ArmComputeLibrary functions from the ci build scripts.
- There is also some refactoring of install_openblas.sh and install_acl.sh to align them (similar formatting, similar variable names, same place for version number updates).
- We had 2 places to define the openblas version; this has been reduced to 1 now (install_openblas.sh).
- ACL_VERSION and OPENBLAS_VERSION can now be overridden at the build.sh level by developers, but only 1 version of each is hardcoded for ci.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159737
Approved by: https://github.com/seemethere
ghstack dependencies: #160078
2025-09-10 15:39:38 +00:00
b5e6e58050 [nn] Assert parsed iterable arguments are an appropriate length (#162340)
Fixes #162327
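A hypothetical illustration of the kind of length mismatch the new assertion is meant to catch; the exact module and previous behavior are assumptions, not taken from the PR:

```python
import torch.nn as nn

# A 3-element kernel_size passed to a 2d module: the parsed iterable has the
# wrong length, which the added assertion should now report up front.
conv = nn.Conv2d(3, 8, kernel_size=(3, 3, 3))  # expected to raise
```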
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162340
Approved by: https://github.com/Skylion007
2025-09-10 15:15:49 +00:00
fefc406a3d fix typo: summit -> submit (#162587)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162587
Approved by: https://github.com/justinchuby
2025-09-10 14:43:53 +00:00
3d32bb114b [CD] Aarch64 Fix packaging `libarm_compute.so` and other libraries to the aarch64 CUDA wheels (#162566)
Fixes aarch64 linux packaging, following error:
https://github.com/pytorch/vision/actions/runs/17612462583/job/50037380487#step:15:62
```
Traceback (most recent call last):
  File "/__w/vision/vision/pytorch/vision/setup.py", line 13, in <module>
    import torch
  File "/__w/_temp/conda_environment_17612462583/lib/python3.11/site-packages/torch/__init__.py", line 415, in <module>
    from torch._C import *  # noqa: F403
    ^^^^^^^^^^^^^^^^^^^^^^
ImportError: libarm_compute.so: cannot open shared object file: No such file or directory
```
Due to missing dependencies.

Current error:
1. File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl is extracted
2. File is repackaged as torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl
3. File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl is renamed to torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl
Hence the repackaging has no effect.

This PR does the following:
1. File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl is extracted
2. File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl is deleted
3. File is repackaged as torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl

Looks like after migrating from zipping the wheel to `wheel pack`, renaming the wheel is no longer necessary. Hence the renaming is removed and the old file is deleted.
```
2025-09-10T10:10:05.9652454Z Using nvidia libs from pypi - skipping CUDA library bundling
2025-09-10T10:10:05.9656595Z Copying to /pytorch/dist/tmp/torch/lib/libgomp.so.1
2025-09-10T10:10:05.9873843Z Copying to /pytorch/dist/tmp/torch/lib/libgfortran.so.5
2025-09-10T10:10:06.0410041Z Copying to /pytorch/dist/tmp/torch/lib/libarm_compute.so
2025-09-10T10:10:06.2869242Z Copying to /pytorch/dist/tmp/torch/lib/libarm_compute_graph.so
2025-09-10T10:10:06.4385740Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_lapack_lp64_gomp.so.0
2025-09-10T10:10:06.5461372Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_blas_lp64_gomp.so.0
2025-09-10T10:10:06.5728970Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_lapack_core.so.0
2025-09-10T10:10:06.6231872Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_blas_core.so.0
2025-09-10T10:10:14.1503110Z Updated tag from Tag: cp310-cp310-linux_aarch64
2025-09-10T10:10:14.1503482Z  to Tag: cp310-cp310-manylinux_2_28_aarch64
2025-09-10T10:10:14.1503682Z
2025-09-10T10:10:41.6498892Z Repacking wheel as /pytorch/dist/torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl...OK
2025-09-10T10:10:41.9394460Z Renaming torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl wheel to torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl
```

Test Plan, Executed on local file:
```
  inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/WHEEL
  inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/entry_points.txt
  inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/top_level.txt
  inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/RECORD
Bundling CUDA libraries with wheel
Updated tag from Tag: cp310-cp310-manylinux_2_28_aarch64
 to Tag: cp310-cp310-manylinux_2_28_aarch64

Repacking wheel as ubuntu/dist/torch-2.9.0.dev20250909+cu130-cp310-cp310-manylinux_2_28_aarch64.whl...OK
Copying torch-2.9.0.dev20250909+cu130-cp310-cp310-manylinux_2_28_aarch64.whl to artifacts
Build Complete. Created torch-2.9.0.dev20250909+cu130-cp310-cp310-manylinux_2_28_aarch64.whl..
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162566
Approved by: https://github.com/jeanschmidt, https://github.com/NicolasHug
2025-09-10 14:22:41 +00:00
de05dbc39c Replace export_for_training with export (#162396)
Summary: replace export_for_training with export
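A minimal sketch of the replacement call on a generic module (not taken from the PR's call sites):

```python
import torch
from torch.export import export

class M(torch.nn.Module):
    def forward(self, x):
        return x.relu()

# Previously: export_for_training(M(), (torch.randn(2, 3),))
ep = export(M(), (torch.randn(2, 3),))
print(ep.graph_module.code)
```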

Test Plan:
CI

Rollback Plan:

Differential Revision: D81935792

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162396
Approved by: https://github.com/angelayi, https://github.com/jerryzh168
2025-09-10 14:19:34 +00:00
fc1b09a52a Revert "Fix DCE eliminating in-place operations by improving Node.is_impure() (#162267)"
This reverts commit b9a7d0e13b4a34be83c778734dbad437c7c5117b.

Reverted https://github.com/pytorch/pytorch/pull/162267 on behalf of https://github.com/malfet due to Not sure how it happened, but looks like it broke everything, see c2388201fc/1 ([comment](https://github.com/pytorch/pytorch/pull/162267#issuecomment-3275164109))
2025-09-10 14:12:22 +00:00
c2388201fc Fix decorators skipping NCCL tests (#158846)
Avoid failures caused by tests exiting via sys.exit instead of `unittest.skip`

In particular, it will not try to start the test (forking into a subprocess) just to stop it (killing the subprocess), which is what the test setup does.

Using `unittest.skip` decorators avoids starting the test in the first place.
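A sketch of the decorator-based pattern described, with illustrative skip conditions (not the exact decorators changed in the PR):

```python
import unittest
import torch
import torch.distributed as dist

class TestCollectives(unittest.TestCase):
    @unittest.skipIf(not dist.is_nccl_available(), "NCCL not available")
    @unittest.skipIf(torch.cuda.device_count() < 2, "needs at least 2 GPUs")
    def test_allreduce(self):
        ...  # when skipped, the test body (and any subprocess forking) never starts
```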

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158846
Approved by: https://github.com/Skylion007
2025-09-10 12:25:42 +00:00
a6f9e0e62a [c10d][nvshmem] fix override function modifier (#162515)
Summary: Fix a compilation error in fbsource caused by a missing override modifier

Differential Revision: D82038876

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162515
Approved by: https://github.com/Skylion007, https://github.com/kwen2501
2025-09-10 11:35:49 +00:00
337fe1079d [nativert] AOTI delegate with flat inputs and outputs (#162538)
Summary: `executorch_call_delegate` should have flattened inputs and outputs so that it can be correctly serialized and the input/output specs are consistent with the runtime.

Test Plan:
CI

Rollback Plan:

Differential Revision: D82064354

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162538
Approved by: https://github.com/dolpm
2025-09-10 11:35:44 +00:00
b494547f0b Make functorch notebook symlinks PEP 517 valid (#157813)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157813
Approved by: https://github.com/zou3519, https://github.com/atalman
2025-09-10 10:13:24 +00:00
d9832d8425 [triton][export] serialization in internal path + unit tests (#162200)
Summary: Packages triton artifacts to be runnable in nativert if wrappers exist.

Test Plan:
unit tests

Rollback Plan:

Differential Revision: D81368559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162200
Approved by: https://github.com/angelayi
2025-09-10 09:49:10 +00:00
f0ae3a57f6 [Optimus] Add batch dropout pattern (#162443)
Summary: We observe a dropout pattern in AFOC, so we add a new pattern to Optimus.

Test Plan:
```
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/inductor:group_batch_fusion -- test_batch_dropout_pre_grad_fusion
```

Buck UI: https://www.internalfb.com/buck2/2c899fb5-6e8b-43eb-8fb3-b53abfbfa6d9
Test UI: https://www.internalfb.com/intern/testinfra/testrun/15762598805248688
Network: Up: 0B  Down: 0B  (reSessionID-bfbb9e6a-7e2a-425a-a027-b44282cef419)
Executing actions. Remaining     0/3                                                                                                     1.3s exec time total
Command: test.     Finished 2 local
Time elapsed: 1:22.3s
Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0

### E2E

baseline
f791163796

proposal
f793225207

Rollback Plan:

Differential Revision: D81981264

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162443
Approved by: https://github.com/Yuzhen11, https://github.com/mlazos
2025-09-10 09:49:01 +00:00
26b3ae5890 Move prioritized text linker optimization code from setup.py to cmake (#160078)
Note. This is a replica PR of #155901 which will be closed. I had to create a new PR in order to add it into my ghstack as there are some later commits which depend on it.

### Summary

🚀 This PR moves the prioritized text linker optimization from setup.py to cmake (and enables it by default on Linux aarch64 systems).

This change consolidates what was previously manual CI logic into a single location (cmake), ensuring consistent behavior across local builds, CI pipelines, and developer environments.

### Motivation
Prioritized text layout has measurable performance benefits on Arm systems by reducing code padding and improving cache utilization. This optimization was previously triggered manually via CI scripts (.ci/aarch64_linux/aarch64_ci_build.sh) or user-set environment variables. By detecting the target architecture within setup.py, this change enables the optimization automatically where applicable, improving maintainability and usability.

Note:

Due to ninja/cmake graph generation issues we cannot apply the linker file globally to all targets, so the targets must be defined manually. See CMakeLists.txt: the main libraries torch_python, torch, torch_cpu, torch_cuda, and torch_xpu have been targeted, which should be enough to maintain the performance benefits outlined above.

Co-authored-by: Usamah Zaheer <usamah.zaheer@arm.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160078
Approved by: https://github.com/seemethere
2025-09-10 09:21:53 +00:00
be8095b07f [DeviceMesh] Clarifying flatten use case (#161311)
While in the middle of a big refactor to simplify the bookkeeping for DeviceMesh, we found an interesting bug inside the DeviceMesh flatten implementation. Here is the finding:
1. In the unit test, we assume users can call `dp_cp_mesh._flatten()` many times but no new backend will be created (i.e. the result is cached).
2. The slicing implementation actually throws an exception when `_flatten` is called more than once. There is a bug that was partially fixed in https://github.com/pytorch/pytorch/pull/160709, but that fix does not cover the check for the case where `_flatten` is called twice.

The more important question to ask is: what behavior do we want for `_flatten`? Do we allow calling `_flatten` multiple times (with the same mesh_name)? I think we should, and here is why:
1. We allow slicing with the same mesh_name or name_list multiple times, and we cache the PGs behind them. Although we return a new device mesh object every time, the results all compare equal (according to `__eq__`).
2. We already cache the flattened mesh today inside `root_to_flatten_mapping` and do an early return, but that line is never reached if we error out before it.

We should also allow flattening a 1D mesh into its own mesh_dim_name as a no-op; I added a unit test for it.
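
An illustrative sketch of the intended behavior, using the fake process group backend; whether the second `_flatten` call is allowed is exactly what this PR decides, so treat it as a sketch rather than a guarantee:

```
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.testing._internal.distributed.fake_pg import FakeStore

dist.init_process_group("fake", store=FakeStore(), rank=0, world_size=4)
mesh = init_device_mesh("cpu", (2, 2), mesh_dim_names=("dp", "cp"))

flat_a = mesh["dp", "cp"]._flatten("dp_cp")
flat_b = mesh["dp", "cp"]._flatten("dp_cp")  # second call: should be a cached no-op per the above
assert flat_a == flat_b                      # new objects, but equal according to __eq__

dist.destroy_process_group()
```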

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161311
Approved by: https://github.com/fegin
2025-09-10 07:46:51 +00:00
b2d8f6a6af [OpenReg] Update the docs about Accelerator Integration (#162046)
Fix the issue described by this [comment](https://github.com/pytorch/pytorch/pull/161845#discussion_r2317299390)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162046
Approved by: https://github.com/albanD
2025-09-10 07:45:07 +00:00
98e22c8a69 Skip test_ind_worker_queue on Windows and macOS (flaky) (#162555)
Fixes https://github.com/pytorch/pytorch/issues/68643

The issue was closed by the bot yesterday but the failure is still there https://github.com/pytorch/pytorch/actions/runs/17595694816/job/49989589647.  It's better to just skip it directly in the code, as this test has been disabled on Windows and macOS since 2021 O_o
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162555
Approved by: https://github.com/clee2000
2025-09-10 07:05:14 +00:00
e1f0a69943 Revert "test fixing benchmarks (#162503)"
This reverts commit 484c4093a87a3e6767e55ed553f95db8fc137442.

Reverted https://github.com/pytorch/pytorch/pull/162503 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it regresses CPU perf smoke test ([comment](https://github.com/pytorch/pytorch/pull/162503#issuecomment-3273554680))
2025-09-10 06:55:35 +00:00
833997a6fd [Inductor][UT] Fix flex attention related inductor cases (#162450)
## Motivation
Fixes #162435, Fixes #162436

UT failures:
* https://github.com/pytorch/pytorch/actions/runs/17523991468/job/49772651636
* https://github.com/pytorch/pytorch/actions/runs/17523991468/job/49772651637

To fix flex attention related cases.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162450
Approved by: https://github.com/drisspg
2025-09-10 06:48:00 +00:00
b9a7d0e13b Fix DCE eliminating in-place operations by improving Node.is_impure() (#162267)
Change is_impure to check in-place operations on Node to prevent eliminate_dead_code from eliminating in-place operations.
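
A small sketch of the failure mode being fixed (assuming a plain FX trace): the in-place add has no users in the traced graph, so without treating it as impure, `eliminate_dead_code()` would drop it.

```
import torch
import torch.fx


class M(torch.nn.Module):
    def forward(self, x):
        x.add_(1)      # in-place op: side effect on the input, but its output proxy is unused
        return x * 2

gm = torch.fx.symbolic_trace(M())
gm.graph.eliminate_dead_code()  # with the fix, the add_ node is considered impure and kept
gm.recompile()
print(gm.graph)
```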

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162267
Approved by: https://github.com/ezyang
2025-09-10 06:02:15 +00:00
1c16c18a53 [nativert][triton] improve hardware registration (#162499)
Summary: att

Test Plan:
ci

Rollback Plan:

Differential Revision: D82031814

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162499
Approved by: https://github.com/angelayi
2025-09-10 04:52:57 +00:00
96ef26f71a Revert "[ROCm] Integrate AITER Fav3 fwd kernels (#160105)"
This reverts commit d2393c2d7da03a1523a12e6f80edb6bd7b464ec5.

Reverted https://github.com/pytorch/pytorch/pull/160105 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing internal ROCm build ([comment](https://github.com/pytorch/pytorch/pull/160105#issuecomment-3273297183))
2025-09-10 04:42:28 +00:00
5ac112b569 [dynamo] Graph break on on user-defined class in compiled region (#161670)
Currently, user-defined classes inside of a compiled frame will cause the whole
frame to be skipped by dynamo.  This change defers the Unsupported exception
until the __build_class__ builtin is actually called, which allows a graph break
to be inserted.
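
A minimal sketch of the kind of code affected (not from the PR): the class definition inside the compiled function now produces a graph break at `__build_class__` instead of causing the whole frame to be skipped.

```
import torch


@torch.compile
def f(x):
    class Point:           # user-defined class inside the compiled region
        def __init__(self, v):
            self.v = v
    return Point(x).v + 1

print(f(torch.randn(3)))
```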

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161670
Approved by: https://github.com/williamwen42, https://github.com/guilhermeleobas
2025-09-10 04:39:20 +00:00
dda071587f Revert "Make distributed modules importable even when backend not built (#159889)" (#162568)
This reverts commit a0d026688cd69583d5a4e0c6f3e5fda141a7f4a9.

Revert "Always build USE_DISTRIBUTED. (#160449)"

This reverts commit d80297a6846f1f2c36fd4f19e22919f2abe8fcea.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162568
Approved by: https://github.com/huydhn
2025-09-10 04:29:42 +00:00
11acfed3ce [audio hash update] update the pinned audio hash (#162552)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162552
Approved by: https://github.com/pytorchbot
2025-09-10 04:24:39 +00:00
5f40a8a9a3 [BE] Fix '_WIN32' is not defined warning (#162516)
Summary: Indeed, it is not defined on either Linux or macOS platforms

Test Plan:
CI

Rollback Plan:

Differential Revision: D82044853

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162516
Approved by: https://github.com/Skylion007
2025-09-10 04:21:38 +00:00
e64965300a Repackage vLLM nightlies (#162371)
I suspected that I would need to repack vLLM wheels from https://github.com/pytorch/pytorch/pull/162000 because I renamed the wheel, and it turns out to be true.  The error is as follows:

```
$ uv pip install --pre xformers --index-url https://download.pytorch.org/whl/nightly/cu129
Using Python 3.12.11+meta environment at: venv/py3.12
Resolved 28 packages in 759ms
error: Failed to install: xformers-0.0.33.dev20250901+cu129-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (xformers==0.0.33.dev20250901+cu129)
  Caused by: Wheel version does not match filename: 0.0.33+5d4b92a5.d20250907 != 0.0.33.dev20250901+cu129
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162371
Approved by: https://github.com/atalman
2025-09-10 04:02:34 +00:00
00985970e3 Put torchao (0.13.0) back to benchmark workflow (#162227)
0.13.0 was released on Sep 3rd https://pypi.org/project/torchao/#history, which should have fixed the crashing issue on transformers now
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162227
Approved by: https://github.com/malfet
2025-09-10 03:56:25 +00:00
484c4093a8 test fixing benchmarks (#162503)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162503
Approved by: https://github.com/huydhn
ghstack dependencies: #160741
2025-09-10 03:15:49 +00:00
760c478a14 [FlexAttn][Minor] Update FlexConfig doc (#162533)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162533
Approved by: https://github.com/drisspg
2025-09-10 02:03:48 +00:00
dc4f97e9c1 [triton] enable int64 indexing in convolution and mm template (#162506)
Summary: we were hitting an illegal memory access issue when compiling conv and addmm kernels with the change in https://github.com/pytorch/pytorch/pull/157767

Differential Revision: D81995664

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162506
Approved by: https://github.com/iseeyuan
2025-09-10 01:53:26 +00:00
c66e58b7d0 [ONNX] Expose the testing module (#162495)
* Created a new module `torch/onnx/testing.py` that exposes the `assert_onnx_program` function for testing exported ONNX models.
* Updated the ONNX documentation (`docs/source/onnx.md`) to include `onnx_testing` in the list of relevant modules.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162495
Approved by: https://github.com/titaiwangms, https://github.com/xadupre
2025-09-10 01:40:24 +00:00
878f59ef75 DeviceMesh: support _rank for use with non-global PGs (#162439)
Summary: This adds a `_rank` field to DeviceMesh init that allows for instantiating a DeviceMesh without depending on `dist.get_rank()` which requires a global PG to be instantiated.

Test Plan:
```
buck2 test mode/opt -c fbcode.enable_gpu_sections=true  //caffe2/test/distributed:device_mesh -- init_backend
```

Rollback Plan:

Differential Revision: D81981777

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162439
Approved by: https://github.com/kwen2501, https://github.com/fduwjj
2025-09-10 01:18:28 +00:00
e60ad4f628 [DTensor] fix copy_ strategy to support linearity (#162460)
Fixing issue introduced in https://github.com/pytorch/pytorch/pull/158538
where `aten.copy_.default` is registered as a pointwise op, but without linearity.

In particular, when both `src` and `dst` tensors have same `Partial` placements, direct copy should happen without redistribute, instead of redistributing both to `Replicate` before making the copy.

This was discovered from silent incorrect results e.g. on `torch.einsum` backward.
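
A hedged sketch of the case described (fake process group, one rank pretending to be two, as in the benchmark snippet further down this log): both `src` and `dst` carry the same `Partial` placement, so `copy_` should not force a redistribute to `Replicate`.

```
import torch
import torch.distributed as dist
from torch.distributed.tensor import DTensor, Partial
from torch.testing._internal.distributed.fake_pg import FakeStore

dist.init_process_group("fake", store=FakeStore(), rank=0, world_size=2)
mesh = torch.distributed.device_mesh.init_device_mesh("cpu", (2,))

src = DTensor.from_local(torch.randn(4, 4), mesh, [Partial()], run_check=False)
dst = DTensor.from_local(torch.zeros(4, 4), mesh, [Partial()], run_check=False)
dst.copy_(src)                           # same Partial placements: direct local copy
assert dst.placements == src.placements

dist.destroy_process_group()
```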

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162460
Approved by: https://github.com/zpcore
2025-09-10 00:47:14 +00:00
2281d009e5 Revert "[ROCm] Add specific compile options for CK SDPA (#161759)"
This reverts commit d22d916719eb7daff8455a01d216d65f81899a9e.

Reverted https://github.com/pytorch/pytorch/pull/161759 on behalf of https://github.com/huydhn due to Sorry for reverting your change but this seems to break internal ROCm jobs ([comment](https://github.com/pytorch/pytorch/pull/161759#issuecomment-3272807726))
2025-09-10 00:44:30 +00:00
33589374b6 [DCP] Avoid multiple storage writer resets in async save (#159448)
Summary: Avoid multiple storage writer resets in async save. Currently the reset gets called by the async_save method and then again in the save method. In the async path, async_save should only do the staging and the reset should only happen in the synchronous save path.

Test Plan:
```
buck test 'fbcode//mode/opt' //aiplatform/modelstore/experimental/DCP/tests:checkpoint_dist_client_test
```
https://www.internalfb.com/intern/testinfra/testrun/15199648841705052

Rollback Plan:

Differential Revision: D79230339

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159448
Approved by: https://github.com/meetv18
2025-09-10 00:43:03 +00:00
5539916fe1 [dynamo][refactor] Move get_framelocals_idx to a helper (#162519)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162519
Approved by: https://github.com/williamwen42
2025-09-10 00:35:09 +00:00
e4174b1fd7 remove gso from collapse_view_helper (#162212)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162212
Approved by: https://github.com/aorenste

Co-authored-by: Aaron Orenstein <aorenste@fb.com>
2025-09-10 00:17:15 +00:00
0e7ccc09db [easy] Don't force copy result of getAllOperatorsFor in init.cpp (#162218)
It returns a const reference to a vector.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162218
Approved by: https://github.com/Skylion007
ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220
2025-09-10 00:08:15 +00:00
87cc126457 [associative_scan] partial gradient support (#162388)
This PR tests the partial gradient support of the `associative_scan` operation. It replaces https://github.com/bohnstingl/pytorch/pull/6

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162388
Approved by: https://github.com/ydwu4
2025-09-09 23:52:29 +00:00
a3e26d1727 Revert "[dynamo] Graph break on on user-defined class in compiled region (#161670)"
This reverts commit e2545487de3dbbe663e3f0adb699547a14da0f6a.

Reverted https://github.com/pytorch/pytorch/pull/161670 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing a trunk test ([comment](https://github.com/pytorch/pytorch/pull/161670#issuecomment-3272626391))
2025-09-09 23:40:26 +00:00
d2393c2d7d [ROCm] Integrate AITER Fav3 fwd kernels (#160105)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160105
Approved by: https://github.com/jeffdaily
2025-09-09 22:30:12 +00:00
b498299953 154849 Add support to handle SIGUSR1 and SIGUSR2 in multiprocessing (#160690)
Fixes #154849

This change addresses the request to add support for SIGUSR1 and SIGUSR2 signals in torchrun for SLURM environments. The change supports these signals through the configurable `TORCHELASTIC_SIGNALS_TO_HANDLE` environment variable and the `signals_to_handle` parameter of the launcher API.
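
A hedged usage sketch of the environment-variable route (the variable name is quoted from this description; the comma-separated value format is an assumption):

```
import os

# opt in to forwarding SIGUSR1/SIGUSR2 to workers, per the description above
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,SIGINT,SIGUSR1,SIGUSR2"

# then launch as usual, e.g.:
#   torchrun --nproc-per-node=2 train.py
# the launcher API route is described above as a signals_to_handle parameter on the launch config
```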

Tests:
For validation purposes:
test_signal_handling.py,
simple_test_api_signal_handling.py

Unit tests:
for launcher changes: launcher/test_api.py
for API changes: multiprocessing/test_api.py
E2E: test_run.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160690
Approved by: https://github.com/fduwjj
2025-09-09 22:23:06 +00:00
4d66a3b894 fix Dtensor doc link (#162494)
Small fix for https://docs.pytorch.org/docs/main/distributed.tensor.parallel.html
<img width="890" height="274" alt="image" src="https://github.com/user-attachments/assets/6ee7fc7c-e0fe-4f5e-ab7e-a895bb3fa79f" />

now it is:

<img width="909" height="320" alt="image" src="https://github.com/user-attachments/assets/8b2c41ef-1684-4597-8dae-144b49723796" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162494
Approved by: https://github.com/XilunWu
2025-09-09 22:10:37 +00:00
e2545487de [dynamo] Graph break on on user-defined class in compiled region (#161670)
Currently, user-defined classes inside of a compiled frame will cause the whole
frame to be skipped by dynamo.  This change defers the Unsupported exception
until the __build_class__ builtin is actually called, which allows a graph break
to be inserted.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161670
Approved by: https://github.com/williamwen42, https://github.com/guilhermeleobas
2025-09-09 21:07:49 +00:00
8922bbcaab Use same NVSHMEM version across CUDA builds (#162206)
#161321 bumped NVSHMEM version to 3.3.24 for CUDA 13, leaving CUDA 12 with 3.3.20.
This PR bumps the NVSHMEM version to 3.3.24 for CUDA 12 as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162206
Approved by: https://github.com/tinglvv, https://github.com/Skylion007
2025-09-09 20:59:50 +00:00
14744e1ab2 [Release 2.9] Add compatibility matrix, Version Bump (#162526)
Release 2.9
1. Add release compatibility matrix
2. Add version bump for 2.10
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162526
Approved by: https://github.com/malfet
2025-09-09 20:38:15 +00:00
b477fb106f [ROCm] enable grouped gemm fallback (#162419)
Enables bf16 group gemm alternative path as described in #161366
Fast path will be enabled in future through CK integration.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162419
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-09 20:04:56 +00:00
d22d916719 [ROCm] Add specific compile options for CK SDPA (#161759)
Updates CK version and adds CK specific compilation options

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161759
Approved by: https://github.com/jeffdaily
2025-09-09 20:04:19 +00:00
86d34a43f5 NamedTuple: Allow side effects for dynamic attributes (#161645)
I confirmed that the tracing was correct i.e. NamedTupleVariable had the correct dynamic attribute added to it.

The problem was that NamedTupleVariable was always marked as immutable. This does not reflect the behavior of namedtuple.

Subclasses of namedtuple may be mutable, so when a NamedTupleVariable is derived from a subclass that is mutable, I made NamedTupleVariable mutable as well. Then side_effects correctly updates the returned object.

Fixes #161610
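
A minimal sketch of the pattern this enables (backend and shapes are illustrative): a mutable namedtuple subclass gets an attribute assigned inside the compiled region, and the side effect must be visible afterwards.

```
import collections
import torch


class Box(collections.namedtuple("Box", ["data"])):
    pass  # a namedtuple subclass without __slots__ can carry extra, mutable attributes


@torch.compile(backend="eager")
def f(box):
    box.scale = 2.0                  # dynamic attribute: a side effect on the input object
    return box.data * box.scale

b = Box(torch.ones(3))
print(f(b), b.scale)
```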

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161645
Approved by: https://github.com/anijain2305, https://github.com/StrongerXi
2025-09-09 19:42:02 +00:00
8508651477 Fix flaky AOTFxirTestCase (#162472)
Fixes https://github.com/pytorch/pytorch/issues/162357
Fixes https://github.com/pytorch/pytorch/issues/160970
Fixes https://github.com/pytorch/pytorch/issues/161038
Fixes https://github.com/pytorch/pytorch/issues/160951
Fixes https://github.com/pytorch/pytorch/issues/161698

These tests were introduced in https://github.com/pytorch/pytorch/pull/160765 and they are all flaky when `torch._inductor.aot_compile` uses multiple threads (the default option).  The issue could be reproduced by running them locally multiple times.  For example,

```
pytest --flake-runs 10 --flake-finder -v inductor/test_fxir_backend.py -k test_aoti_fx_add
(output logs at P1938386961)
...
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 1)]
graph_break []
--------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------------------------------------------------------------------
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 1)]
graph_break []
================================================================================================================================================= short test summary info ==================================================================================================================================================
FAILED [0.4834s] inductor/test_fxir_backend.py::AOTFxirTestCase::test_aoti_fx_add - AttributeError: 'NoneType' object has no attribute '__code__'
FAILED [0.4576s] inductor/test_fxir_backend.py::AOTFxirTestCase::test_aoti_fx_add - AttributeError: 'NoneType' object has no attribute '__code__'
FAILED [0.4613s] inductor/test_fxir_backend.py::AOTFxirTestCase::test_aoti_fx_add - AttributeError: 'NoneType' object has no attribute '__code__'
=============================================================================================================================================== 3 failed, 7 passed in 12.89s ===============================================================================================================================================
```

Setting `compile_threads` to 1 will get rid of the test flakiness, but there might be underlying issues from https://github.com/pytorch/pytorch/pull/160765.
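
For reference, the single-threaded workaround mentioned above (not the fix that landed) is just a config flip:

```
import torch._inductor.config as inductor_config

inductor_config.compile_threads = 1  # avoids the flaky async-compile path at the cost of build time
```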

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162472
Approved by: https://github.com/angelayi, https://github.com/Skylion007
2025-09-09 19:39:24 +00:00
723c27ed78 [standalone_compile] binary format write should be atomic (#162432)
We update it to call write_atomic instead of file.write
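
A generic sketch of the pattern `write_atomic` implements (names here are illustrative, not Inductor's internals): write to a temp file in the same directory, then `os.replace` so readers never observe a partially written file.

```
import os
import tempfile


def write_atomic(path: str, data: bytes) -> None:
    d = os.path.dirname(os.path.abspath(path))
    fd, tmp = tempfile.mkstemp(dir=d)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        os.replace(tmp, path)  # atomic rename: readers see either the old or the new file
    except BaseException:
        os.unlink(tmp)
        raise

write_atomic("artifact.bin", b"\x00\x01")
```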

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162432
Approved by: https://github.com/oulgen
2025-09-09 18:43:13 +00:00
bdbe931d58 [build] Add LeakSanitizer option to CMake (#158686)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158686
Approved by: https://github.com/eellison
2025-09-09 18:41:20 +00:00
af60398c3a Update the operator benchmarking, to benchmark using torch.compile (#161394)
This pull request enhances the PyTorch operator benchmarking suite by introducing support for benchmarking with `torch.compile` mode, in addition to the existing Eager and JIT modes. It also adds peak memory measurement (fwd/bwd pass), improves the JSON output format consumed by the dashboard for reporting, and introduces some more CLI options. The new CLI flags introduced are:

- Added `--use-compile` CLI argument and corresponding logic to run benchmarks using `torch.compile`, including mutual exclusivity with `--use-jit`
- Added `--benchmark-name` argument for customizing the benchmark name in output
- Updated default value for `--output-json-for-dashboard` to `benchmark-results.json` for more predictable output file name

Sample command to run a single operator:
`python -m pt.mm_test --use-compile`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161394
Approved by: https://github.com/jbschlosser
2025-09-09 18:17:37 +00:00
82f1eb9b03 Revert "[MPS] mps sparse mul op implementation (#162349)"
This reverts commit 3ea686804925f1291de57ffdb3394da0b46deb54.

Reverted https://github.com/pytorch/pytorch/pull/162349 on behalf of https://github.com/malfet due to Fails trunk tests, with uint8 sum ([comment](https://github.com/pytorch/pytorch/pull/162349#issuecomment-3271783442))
2025-09-09 18:14:16 +00:00
4b2d297eec python fastpath for DTensor detach(), confirm that aliasing DTensorSpec is ok (#160580)
My goal right now is to try to make the "vanilla" AccumulateGrad path for DTensor (that just calls detach) fast. I'm doing this in two steps:

(1) [this PR]: hardcode aten.detach in DTensor to re-use the input tensor's DTensorSpec, instead of running "real" sharding prop.

(2) [assuming success of 1]: move the detach() call into C++, try adding a DTensor dispatch key, and avoid dispatching back to python entirely (except for some code that probably needs to allocate a pyobject for the output DTensor, from C++)

I'm pushing this PR first to confirm that I don't break anything with my detach fastpath. I did some manual local testing to confirm that for normal usages of detach, the input and output DTensor have equal DTensorSpec objects. Technically, we previously would allocate a fresh DTensorSpec, and with this change we are just re-using the input tensor's DTensorSpec. So I'm mostly hoping that DTensorSpecs don't generally get mutated

This by itself does seem to speed up `alias` by quite a bit (roughly 2.5x speedup, from ~336us -> 133us):

**aten.detach(plain_tensor)**
```
<torch.utils.benchmark.utils.common.Measurement object at 0x7f8da2921790>
_ = x.detach()
  4.80 us
  1 measurement, 100000 runs , 1 thread
```

**aten.detach(DTensor) [before this PR]**
```
<torch.utils.benchmark.utils.common.Measurement object at 0x7f47cd68e750>
_ = x_dt.detach()
  336.40 us
  1 measurement, 1000 runs , 1 thread
```

**aten.detach(DTensor) [after this PR]**
```
<torch.utils.benchmark.utils.common.Measurement object at 0x7f0a34c05520>
_ = x_dt.detach()
  Median: 133.45 us
  2 measurements, 1000 runs per measurement, 1 thread
```

benchmark script:
```
import torch
import torch.distributed as dist
from torch.distributed.tensor import DeviceMesh, DTensor, Partial, Replicate, Shard
from torch.testing._internal.distributed.fake_pg import FakeStore
import torch.utils.benchmark as benchmark

fake_store = FakeStore()
dist.init_process_group("fake", store=fake_store, rank=0, world_size=2)

mesh = torch.distributed.device_mesh.init_device_mesh('cuda', (2,))
x = torch.randn(4, 4, requires_grad=True)
x_dt = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)

t0 = benchmark.Timer(
    stmt='_ = x_dt.detach()',
    globals={'x_dt': x_dt},
)
print(t0.blocked_autorange())

dist.destroy_process_group()
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160580
Approved by: https://github.com/ezyang
2025-09-09 18:04:56 +00:00
0ec723acd0 Update docs for quantile to be clearer for nearest (#162423)
Correct the rounding scheme for nearest in quantile.
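
A small example of what "nearest" means here (values chosen so the fractional index is 1.2):

```
import torch

x = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(torch.quantile(x, 0.4, interpolation="linear"))   # 2.2: interpolates between x[1] and x[2]
print(torch.quantile(x, 0.4, interpolation="nearest"))  # 2.0: index 1.2 rounds to the nearest index, 1
```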

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162423
Approved by: https://github.com/soulitzer
2025-09-09 18:04:12 +00:00
e1be887870 [PP] Add spacing to visualizer (#160474)
When visualizing the schedules using `_PipelineScheduleExecution`, we don't provide any spacing between dependencies, so when visualizing `DualPipeV` it looks like this:

<img width="3168" height="486" alt="image" src="https://github.com/user-attachments/assets/d2c881ad-4ee0-46b6-ac03-13e5600b5a55" />

While it has the correct order of operations, it does not show the dependencies correctly. As shown in the original implementation, it should look something like this:

<img width="3542" height="384" alt="image" src="https://github.com/user-attachments/assets/c930fa98-848e-4951-a58b-c81f41092d14" />

This allows an option to add spacing to the visualizer, so it is easier to see dependencies. After change:

<img width="3633" height="486" alt="image" src="https://github.com/user-attachments/assets/7708367e-bdb4-46e8-a7c4-f19e18047f59" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160474
Approved by: https://github.com/fegin
2025-09-09 17:52:52 +00:00
d91eecc9a5 [inductor][template heuristics] don't take layout to generate choices (#162238)
# why

- unnecessary as we only ever need to know the dtype and maybe the
  device
- we already take in the kernel inputs which have the device
- enable us to specify the layout after finding all the configs
  but before generating the ChoiceCallers

# what

- replace all calls in template_heuristics that used to take Layout
  with now just taking out_dtype

# testing

ci

Differential Revision: [D81820115](https://our.internmc.facebook.com/intern/diff/D81820115)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162238
Approved by: https://github.com/eellison
ghstack dependencies: #161347, #161348, #161349
2025-09-09 17:17:04 +00:00
24a4dae85b [inductor] V.choices.get_mm_configs override point (#161349)
# why

- enable us to override the default configs, or fall back to them
  through subclassing InductorChoices

# what

- override (private) function
- default implementation takes the kernel template choice (ktc)
  generator for every template and just executes the generator
- future overrides can decide to replace those generators, or filter
  out choices

- the 2nd expensive step (maybe_append_choices, choice_or_none) is
  handled outside this function, in the main V.choices.get_mm_configs
  this means that any overriding benefits from not generating expensive
  templates that aren't going to be used

# testing

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520570](https://our.internmc.facebook.com/intern/diff/D81520570)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161349
Approved by: https://github.com/eellison
ghstack dependencies: #161347, #161348
2025-09-09 17:17:04 +00:00
d3c4cf838e [inductor][ez] V.choices.get_mm_configs returns list of ChoiceCallers (#161348)
# why

- every callsite just executes the generator on the spot
- previous pr adds the ability to add an override before expensive
  generators are executed, so we don't need this generator anymore

# what

- rather than yielding the ChoiceCaller, just return the list of all
  valid ChoiceCallers

# testing

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520574](https://our.internmc.facebook.com/intern/diff/D81520574)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161348
Approved by: https://github.com/eellison
ghstack dependencies: #161347
2025-09-09 17:16:57 +00:00
b1e99c8c7a [inductor] add kernel template choice (ktc) (#161347)
# why

- gather everything up to make choices, without running
  potentially expensive generators
- enables overrides where we toss the entire list of configs
  from inductor, without having to enumerate it (expensive)

# what

- add a holding class that just gets all the components necessary
  to generate a ChoiceCaller
- use that class to generate ChoiceCallers
- this does not (yet) add the override function, but just prepares
  the scene

```
python3 -bb -m pytest test/inductor/test_max_autotune.py -v
```

Differential Revision: [D81520569](https://our.internmc.facebook.com/intern/diff/D81520569)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161347
Approved by: https://github.com/eellison
2025-09-09 17:16:50 +00:00
5eb35d2ab8 [CUDA][float8][TF32] Disable tf32 for vs. emulated rowwise comparison (#162387)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162387
Approved by: https://github.com/Skylion007
2025-09-09 17:04:06 +00:00
f03d635dc6 [ROCm][CI] skip test_max_autotune until resolved (#162496)
many tests taking >30 min and causing timeouts

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162496
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-09 16:34:01 +00:00
1f0b01d4b6 [ROCm] OffsetCalc Unroll Optimization (#161700)
Our compiler is generating inefficient code for the offsetCalc in certain situations.
The root cause for this still needs to be identified. For now, specialized unrolling based on 'dims' notably helps perf.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161700
Approved by: https://github.com/jeffdaily
2025-09-09 16:11:48 +00:00
c0142f5c06 [ROCm] Enabling several UTs (#161715)
All these UTs are working as is, just removing the skip
- test_p2p_ipc
- test_repros.py: working, added fp8 support
- test_activation_checkpointing.py
- test_content_store.py
- test_cuda_multigpu.py
- test_compute_comm_reordering.py
- test_segment_reductions.py
- test_dataloader.py
- test_math_ops.py
- test_loop_ordering.py
- test_control_flow.py
- distributed_test.py
- test_mem_tracker.py
- test_fsdp_optim_state.py
- test_fully_shard_mixed_precision.py: skipped for < ROCm7.0
- test_aot_inductor_custom_ops.py
- test_c10d_ops_nccl.py
- test_eager_transforms.py
- test_sparse_csr.py
- test_inductor_collectives.py
- test_fake_tensor.py
- test_cupy_as_tensor.py
- test_cuda.py: enable UTs that are working
- test_matmul_cuda.py: enable UTs that are working

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161715
Approved by: https://github.com/msaroufim

Co-authored-by: Mark Saroufim <marksaroufim@fb.com>
2025-09-09 15:49:21 +00:00
3ea6868049 [MPS] mps sparse mul op implementation (#162349)
Implements mps sparse mul operation as well as enables other operations such as:
1. copy_
2. div
3. sum
4. floor
5. power
6. sub
7. floor_divide

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162349
Approved by: https://github.com/pearu, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-09 15:45:37 +00:00
be3b8d2ec9 [ROCm][CI] update fbgemm nightly benchmark hash (#162385)
fbgemm_gpu was failing to clone due to missing submodule commit.
```
+ pushd fbgemm/fbgemm_gpu
~/pytorch/fbgemm/fbgemm_gpu ~/pytorch
+ git checkout 7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8 --recurse-submodules
fatal: failed to unpack tree object b1281b8b08d973a7064f864f47eeb30f3e2596e9
error: Submodule 'external/composable_kernel' could not be updated.
error: Cannot update submodule:
	external/composable_kernel
```
Log File
[inductor-periodic · pytorch/pytorch@5babb4d](https://github.com/pytorch/pytorch/actions/runs/17536630806/job/49802458834)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162385
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-09 15:44:39 +00:00
5ccf3ca3ec Revert "Use same NVSHMEM version across CUDA builds (#162206)"
This reverts commit 0d9c95cd7ee299e2e8c09df26d395be8775b506b.

Reverted https://github.com/pytorch/pytorch/pull/162206 on behalf of https://github.com/malfet due to Broke lint, see 4dd73e659a/1 ([comment](https://github.com/pytorch/pytorch/pull/162206#issuecomment-3271040521))
2025-09-09 14:40:45 +00:00
e38e953432 CUDA 13.0 Windows Nvidia Driver Update to 580.88 (#162425)
Related to https://github.com/pytorch/pytorch/issues/162333
https://github.com/pytorch/pytorch/issues/159779

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162425
Approved by: https://github.com/tinglvv, https://github.com/malfet
2025-09-09 14:40:34 +00:00
4dd73e659a Revert "fix torch.sparse.log_softmax on CPU (#161959)"
This reverts commit 002e59440afe8711019e68df500f5e18b9a43f3c.

Reverted https://github.com/pytorch/pytorch/pull/161959 on behalf of https://github.com/davidberard98 due to test failure: test_sparse.py::TestSparseMPS::test_log_softmax_float_mps_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/17573794461/job/49915138287) [HUD commit link](002e59440a) ([comment](https://github.com/pytorch/pytorch/pull/161959#issuecomment-3270509418))
2025-09-09 12:33:25 +00:00
0d9c95cd7e Use same NVSHMEM version across CUDA builds (#162206)
#161321 bumped NVSHMEM version to 3.3.24 for CUDA 13, leaving CUDA 12 with 3.3.20.
This PR bumps the NVSHMEM version to 3.3.24 for CUDA 12 as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162206
Approved by: https://github.com/tinglvv, https://github.com/Skylion007
2025-09-09 08:52:27 +00:00
dcc42e95f4 Fix missing moves in initJITBindings (#162428)
Per @Skylion007 on #162219

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162428
Approved by: https://github.com/Skylion007
2025-09-09 08:47:33 +00:00
002e59440a fix torch.sparse.log_softmax on CPU (#161959)
Fix https://github.com/pytorch/pytorch/issues/152293.

**Example:**
```
import torch
from torch.sparse import log_softmax as sparse_log_softmax

def test_bug():
    a = torch.rand(4, 3)
    b = a - 10000000.0
    b_sparse = b.to_sparse()

    cpu_out_sparse = sparse_log_softmax(b_sparse, dim=1).to_dense()
    print('cpu_out_sparse =', cpu_out_sparse)

    b_sparse_double = b.double().to_sparse()
    cpu_out_sparse_double = sparse_log_softmax(b_sparse_double, dim=1).to_dense()
    print('cpu_out_sparse_double =', cpu_out_sparse_double)

if __name__ == '__main__':
    test_bug()
```

**Output:**

- before
```
cpu_out_sparse = tensor([[-2., -1., -2.],
        [-1., -1., -1.],
        [-1., -2., -2.],
        [-1., -1., -2.]])
cpu_out_sparse_double = tensor([[-1.5514, -0.5514, -1.5514],
        [-1.0986, -1.0986, -1.0986],
        [-0.5514, -1.5514, -1.5514],
        [-0.8620, -0.8620, -1.8620]], dtype=torch.float64)
```

- after
```
cpu_out_sparse = tensor([[-0.8620, -1.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986],
        [-1.8620, -0.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986]])
cpu_out_sparse_double = tensor([[-0.8620, -1.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986],
        [-1.8620, -0.8620, -0.8620],
        [-1.0986, -1.0986, -1.0986]], dtype=torch.float64)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161959
Approved by: https://github.com/Skylion007
2025-09-09 06:25:16 +00:00
660 changed files with 7264 additions and 2814 deletions

View File

@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
@ -42,9 +42,6 @@ else
echo "Bundling CUDA libraries with wheel for aarch64."
else
echo "Using nvidia libs from pypi for aarch64."
# Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
# Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
export USE_NVIDIA_PYPI_LIBS=1
fi

View File

@ -138,6 +138,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
os.system(f"rm {wheel_path}")
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
@ -211,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
]
# CUDA version-specific libraries
if "130" in desired_cuda:
if "13" in desired_cuda:
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
@ -221,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
@ -237,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
else:
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
@ -275,14 +280,7 @@ def complete_wheel(folder: str) -> str:
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = wheel_name.replace(
"linux_aarch64", "manylinux_2_28_aarch64"
)
print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
os.rename(
f"/{folder}/dist/{wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(

View File

@ -56,9 +56,13 @@ ENV INSTALLED_VISION ${VISION}
# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh

View File

@ -0,0 +1 @@
7fe50dc3da2069d6645d9deb8c017a876472a977

View File

@ -1 +1 @@
fccfc522864cf8bc172abe0cd58ae5581e2d44b9
70cbcaca84471df49e81ddc56873c9241b671f8d

View File

@ -2,6 +2,11 @@
set -ex
# for pip_install function
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"
ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}
@ -113,6 +118,8 @@ EOF
rm -rf HIP clr
fi
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@ -176,6 +183,8 @@ install_centos() {
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
# Cleanup
yum clean all
rm -rf /var/cache/yum

View File

@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION}
# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh

View File

@ -258,11 +258,19 @@ function install_torchrec_and_fbgemm() {
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
# until the fbgemm_commit includes the tbb patch
patch <<'EOF'
--- a/FbgemmGpu.cmake
+++ b/FbgemmGpu.cmake
@@ -184,5 +184,6 @@ gpu_cpp_library(
fbgemm_gpu_tbe_cache
fbgemm_gpu_tbe_optimizers
fbgemm_gpu_tbe_utils
+ tbb
DESTINATION
fbgemm_gpu)
EOF
python setup.py bdist_wheel --build-variant=rocm
popd
# Save the wheel before cleaning up

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to

View File

@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:
def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
if not torch.cuda.is_available() or target_os == "windows":
print("Windows platform or CUDA is not available, skipping NVSHMEM test")
return
# Check if NVSHMEM is compiled in current build
@ -396,7 +396,9 @@ def smoke_test_nvshmem() -> None:
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
if torch.__version__ < "2.9":
from torch.torch_version import TorchVersion
if TorchVersion(torch.__version__) < (2, 9):
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build

View File

@ -1,9 +1,9 @@
set WIN_DRIVER_VN=528.89
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe
set WIN_DRIVER_VN=580.88
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe
if errorlevel 1 exit /b 1
start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot
start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot
if errorlevel 1 exit /b 1
del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL
del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL

View File

@ -189,8 +189,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -1 +1 @@
27fc2493d383354a008106f22f3be232badee9a1
fa5142928ee157aa65137c4ecff2fe9b1a9e0648

View File

@ -1 +1 @@
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
08ae0af1395c8d8471f4025deb6af9aef90b342f

View File

@ -1 +1,2 @@
e10fef08838612b4560e9c72e5cb1414a5edfa13
f510715882304796a96e33028b4f6de1b026c2c7

View File

@ -43,55 +43,55 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"12.6": (
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
),
"12.8": (
"nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "

.github/scripts/prepare_vllm_wheels.sh (vendored executable file, 91 additions)
View File

@ -0,0 +1,91 @@
#!/usr/bin/env bash
set -eux
torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
nightly=$(echo ${torch_version} | cut -d'.' -f4)
# Copied from .ci/manywheel/build_common.sh
make_wheel_record() {
fpath=$1
if echo $fpath | grep RECORD >/dev/null 2>&1; then
echo "$fpath,,"
else
fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
fsize=$(ls -nl $fpath | awk '{print $5}')
echo "$fpath,sha256=$fhash,$fsize"
fi
}
change_wheel_version() {
local package=$1
local wheel=$2
local f_version=$3
local t_version=$4
# Extract the wheel
${PYTHON_EXECUTABLE} -mwheel unpack $wheel
mv "${package}-${f_version}" "${package}-${t_version}"
# Change the version from f_version to t_version in the dist-info dir
pushd "${package}-${t_version}"
mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info"
pushd "${package}-${t_version}.dist-info"
sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD
# Update the version in METADATA and its SHA256 hash
sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA
# then add PyTorch nightly dependency of vLLM
if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then
sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA
fi
sed -i '/METADATA,sha256/d' RECORD
popd
make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD"
popd
# Repack the wheel
${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
# Clean up
rm -rf "${package}-${t_version}"
}
repackage_wheel() {
local package=$1
pushd $package
local orig_wheel=$(find . -name *${package//-/_}*)
local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
local version=""
if [[ "${package}" == vllm ]]; then
# Copied from vllm/.buildkite/scripts/upload-wheels.sh
version=1.0.0
else
version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3)
fi
local nightly_version=$version.$nightly
# Use nightly version
change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version
# Clean up
rm "${orig_wheel}"
auditwheel repair --plat $PLATFORM *.whl \
--exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
local repair_wheel=$(find wheelhouse -name *${PLATFORM}*)
local repair_wheel=$(basename ${repair_wheel})
popd
cp ${package}/wheelhouse/${repair_wheel} .
rm -rf $package
}
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package
done
popd
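
For reference, make_wheel_record above emits wheel RECORD entries of the form "<path>,sha256=<urlsafe-base64 digest without padding>,<size in bytes>". A hedged Python equivalent of that single helper, given only as an illustration and not part of the script:

import base64
import hashlib
import os

def record_entry(path: str) -> str:
    # The RECORD file itself is listed with empty hash and size fields.
    if "RECORD" in path:
        return f"{path},,"
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    # urlsafe base64 with '=' padding stripped, matching the sed substitutions above
    fhash = base64.urlsafe_b64encode(digest).rstrip(b"=").decode()
    return f"{path},sha256={fhash},{os.path.getsize(path)}"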


@ -59,20 +59,6 @@ jobs:
run: |
set -eux
# Keep PyTorch nightly wheel here so that we can install it later during
# vLLM build process
mkdir -p "${RUNNER_TEMP}/artifacts/"
container_name=$(docker run \
--tty \
--detach \
-e PLATFORM \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${MANYLINUX_IMAGE}"
)
# Determine python executable for given version (copied from build-triton-wheel)
case $PY_VERS in
3.10)
@ -102,6 +88,21 @@ jobs:
;;
esac
# Keep PyTorch nightly wheel here so that we can install it later during
# vLLM build process
mkdir -p "${RUNNER_TEMP}/artifacts/"
container_name=$(docker run \
--tty \
--detach \
-e PLATFORM \
-e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${MANYLINUX_IMAGE}"
)
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
@ -113,7 +114,6 @@ jobs:
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
# Save this for later
echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV"
echo "container_name=${container_name}" >> "$GITHUB_ENV"
- name: Build vLLM wheel
@ -131,36 +131,7 @@ jobs:
set -eux
# Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh
docker exec -t "${container_name}" bash -c "
set -eux
nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4)
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
pushd \$package
auditwheel repair --plat \$PLATFORM *.whl \
--exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*)
repair_wheel=\$(basename \${repair_wheel})
popd
cp \${package}/wheelhouse/\${repair_wheel} .
version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
if [[ \$package == vllm ]]; then
new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly}
else
major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3)
new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly}
fi
mv -- \$repair_wheel \$new_wheel
rm -rf \$package
done
popd
"
docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh
docker exec -t "${container_name}" chown -R 1000:1000 /artifacts
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
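
The inline renaming removed in this step (now delegated to prepare_vllm_wheels.sh) boils down to substituting a nightly-suffixed version into each repaired wheel's filename. A rough Python sketch of that substitution; the version and filename values below are made-up examples, not data from the PR:

torch_version = "2.9.0.dev20250911"    # hypothetical nightly torch version
nightly = torch_version.split(".")[3]  # "dev20250911", same as `cut -d'.' -f4`

repair_wheel = "xformers-0.0.32.post1-cp312-cp312-manylinux_2_28_aarch64.whl"  # example name
version = "0.0.32.post1"
# keep only the first three version components, like `tr '.+' '.' | cut -d'.' -f1-3`
major_version = ".".join(version.replace("+", ".").split(".")[:3])  # "0.0.32"
# vllm itself is special-cased to a fixed base version of 1.0.0 instead of major_version

new_wheel = repair_wheel.replace(version, f"{major_version}.{nightly}")
# -> "xformers-0.0.32.dev20250911-cp312-cp312-manylinux_2_28_aarch64.whl"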


@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -335,7 +335,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -538,7 +538,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -584,7 +584,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -741,7 +741,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -787,7 +787,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -833,7 +833,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -944,7 +944,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -990,7 +990,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1036,7 +1036,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1147,7 +1147,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1193,7 +1193,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1239,7 +1239,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1350,7 +1350,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1396,7 +1396,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1442,7 +1442,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

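A note on the PYTORCH_EXTRA_INSTALL_REQUIREMENTS hunks above (the same pattern repeats in the hunks below): each value is a "|"-separated list of PEP 508 requirement strings, and the new variant drops the "and platform_machine == 'x86_64'" marker while bumping nvidia-nvshmem-cu12 from 3.3.20 to 3.3.24, so the NVIDIA userspace wheels are selected on any Linux machine instead of only on x86_64. A minimal Python sketch of how pip evaluates such markers, assuming the packaging library is available and using an aarch64 Linux environment chosen purely for illustration:

# Sketch only: evaluates the PEP 508 markers from the hunks above against an
# assumed aarch64 Linux environment (not taken from the workflow itself).
from packaging.requirements import Requirement

entries = (
    "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)
env = {"platform_system": "Linux", "platform_machine": "aarch64"}

for spec in entries.split(" | "):
    req = Requirement(spec)
    installs = req.marker is None or req.marker.evaluate(env)
    print(req.name, "->", installs)   # both entries apply on aarch64 Linux

old = Requirement(
    "nvidia-nvshmem-cu12==3.3.20; "
    "platform_system == 'Linux' and platform_machine == 'x86_64'"
)
print(old.marker.evaluate(env))       # False: skipped on aarch64 before the change

With the old marker the dependency was skipped on aarch64 builds, which is the practical effect the change above undoes.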

@@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing


@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@@ -1443,7 +1443,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@@ -1903,7 +1903,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@@ -1969,7 +1969,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@@ -2035,7 +2035,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@@ -2495,7 +2495,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@@ -2561,7 +2561,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@@ -2627,7 +2627,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@@ -3087,7 +3087,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@@ -3153,7 +3153,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@@ -3219,7 +3219,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@@ -3679,7 +3679,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@@ -3745,7 +3745,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@@ -3811,7 +3811,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing

View File

@ -43,6 +43,11 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@ -102,7 +107,7 @@ jobs:
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -116,10 +121,9 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -54,7 +54,7 @@ jobs:
- get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
build-environment: linux-jammy-py3.10-gcc11
docker-image: ${{ needs.docs-build.outputs.docker-image }}
push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
run-doxygen: true

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -234,6 +233,7 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on"
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_LSAN "Use Leak Sanitizer" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
option(USE_XPU "Use XPU" ON)
@ -262,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -431,10 +431,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@ -889,9 +890,9 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
set(USE_FBGEMM_GENAI off)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a")
message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100")
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()

View File

@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 |
| 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
| 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
| 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |

View File

@ -16,6 +16,8 @@ However, if you believe you have found a security vulnerability in PyTorch, we e
Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml), as it is still likely a valid issue within the framework.
Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
https://www.facebook.com/whitehat

View File

@ -265,6 +265,14 @@ IF(USE_FBGEMM_GENAI)
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX})
# PyTorch is not built for 10.0a in CI, due to lack of portability,
# so we need to explicitly build these files for 10.0a.
foreach(cu_file ${fbgemm_genai_native_cuda_cu})
_BUILD_FOR_ADDITIONAL_ARCHS(
"${cu_file}"
"100a")
endforeach()
file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp
"${FBGEMM_GENAI_SRCS}/common/*.cpp"
)

View File

@ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
"resize_ called on tensor with symbolic shape")
TORCH_CHECK(
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
"number of dimensions must be sparse_dim (",
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
size.size(),
", sparse_dim = ",
sparse_dim,
") + dense_dim (",
dense_dim,
"), but got ",
size.size());
", dense_dim = ",
dense_dim);
if (nnz() > 0) {
[[maybe_unused]] auto constexpr alt_options_msg =
"You could try the following options:\n\
@ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
"resize_and_clear_ called on tensor with symbolic shape")
TORCH_CHECK(
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
"number of dimensions must be sparse_dim (",
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
size.size(),
", sparse_dim = ",
sparse_dim,
") + dense_dim (",
dense_dim,
"), but got ",
size.size());
", dense_dim = ",
dense_dim);
set_sizes_and_strides(size, std::vector<int64_t>(size.size()));
sparse_dim_ = sparse_dim;
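The reworded check states the COO invariant directly: the tensor's size must have exactly sparse_dim + dense_dim entries. A minimal sketch (illustrative only, not part of this diff) that satisfies it, assuming the ATen C++ API:

#include <ATen/ATen.h>
// sparse_dim = 2 and dense_dim = 1, so size must have 3 entries: {4, 4, 3}.
at::Tensor make_hybrid_coo() {
  auto indices = at::tensor({0, 1, 2, 3}, at::kLong).view({2, 2}); // [sparse_dim, nnz]
  auto values  = at::rand({2, 3});                                 // [nnz, dense extents...]
  return at::sparse_coo_tensor(indices, values, {4, 4, 3});
}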

View File

@ -644,6 +644,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
void * beta_ptr = &fbeta;
#ifdef USE_ROCM
int flag = 0;
rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
rocblas_datatype d_type = c_type;
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
#endif
@ -652,8 +654,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
hipOperationToRocOperation(opb), (int)m, (int)n, (int)k,
(void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea,
b, rocblas_datatype_f16_r, (int)ldb, strideb,
(void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec,
c, rocblas_datatype_f16_r, (int)ldc, stridec,
(void*)beta_ptr, c, c_type, (int)ldc, stridec,
c, d_type, (int)ldc, stridec,
(int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard,
0, flag)));
#else
@ -1096,6 +1098,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
GEMM_CHECK_ARGVALUES(at::Half);
#ifdef USE_ROCM
int flag = 0;
rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
rocblas_datatype d_type = c_type;
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
#endif
@ -1115,10 +1119,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
ldb,
beta_ptr,
c,
rocblas_datatype_f16_r,
c_type,
ldc,
c,
rocblas_datatype_f16_r,
d_type,
ldc,
rocblas_datatype_f32_r,
rocblas_gemm_algo_standard,

View File

@ -45,6 +45,24 @@ struct OffsetCalculator {
C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
offset_type offsets;
#if defined(USE_ROCM)
if ((dims > 0) && (dims <= 2)) {
auto divmod = sizes_[0].divmod(linear_idx);
#pragma unroll
for (int arg = 0; arg < NARGS; arg++)
offsets[arg] = divmod.mod * strides_[0][arg];
if (dims >= 2) {
divmod = sizes_[1].divmod(divmod.div);
#pragma unroll
for (int arg = 0; arg < NARGS; arg++)
offsets[arg] += divmod.mod * strides_[1][arg];
}
// [...]
return offsets;
}
#endif
#pragma unroll
for (int arg = 0; arg < NARGS; arg++) {
offsets[arg] = 0;

View File

@ -457,24 +457,9 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kBFloat16,
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
void gemm(
@ -493,24 +478,9 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kHalf,
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}
void gemm(

View File

@ -624,7 +624,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
if (backend == BatchNormBackend::Miopen) {
return std::tuple_cat(
at::miopen_batch_norm(
input.contiguous(), weight.contiguous(), bias.contiguous(),
input.contiguous(input.suggest_memory_format()),
weight.contiguous(),
bias.contiguous(),
running_mean.defined() ? running_mean.contiguous() : running_mean,
running_var.defined() ? running_var.contiguous() : running_var,
training, momentum, eps),

View File

@ -1080,16 +1080,6 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals
#endif
}
static bool _grouped_mm_allowed_device() {
#ifdef USE_ROCM
return false;
#else
auto dprops = at::cuda::getCurrentDeviceProperties();
// CUDA capability 8.0 and greater
return dprops->major >= 8;
#endif
}
#ifdef USE_ROCM
static bool _scaled_mm_is_fnuz() {
return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
@ -1786,14 +1776,19 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype) {
#ifndef USE_ROCM
_grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
bool a_b_and_out_are_bf16 = (
mat_a.dtype() == at::kBFloat16 &&
mat_b.dtype() == at::kBFloat16 &&
out_dtype.value_or(at::kBFloat16) == at::kBFloat16
);
#ifndef USE_ROCM
bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16;
#else
// _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
// the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
bool use_fast_path = false;
#endif
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
if (use_fast_path) {
@ -1803,9 +1798,6 @@ std::optional<c10::ScalarType> out_dtype) {
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
}
return out;
#else
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
#endif
}
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {

View File

@ -7,6 +7,7 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/miopen_batch_norm_native.h>
#include <ATen/ops/miopen_batch_norm_backward_native.h>
#endif
@ -102,7 +103,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
mode = miopenBNSpatial;
}
auto output_t = at::empty(input->sizes(), input->options());
auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
TensorArg output{ output_t, "output", 0 };
auto handle = getMiopenHandle();
@ -170,20 +171,15 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
const std::optional<Tensor>& save_var_t_opt,
double epsilon) {
// See [Note: hacky wrapper removal for optional tensor]
const Tensor& running_mean =
running_mean_opt.value_or(Tensor());
const Tensor& running_var =
running_var_opt.value_or(Tensor());
const Tensor& save_mean_t =
save_mean_t_opt.value_or(Tensor());
const Tensor& save_var_t =
save_var_t_opt.value_or(Tensor());
const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
TensorArg input{ input_t, "input", 1 },
grad_output{ grad_output_t, "grad_output", 2 },
weight{ weight_t, "weight", 3 },
save_mean{ save_mean_t, "save_mean", 4 },
save_var{ save_var_t, "save_var", 5 };
auto grad_output_contig =
grad_output_t.contiguous(input_t.suggest_memory_format());
TensorArg input{input_t, "input", 1},
grad_output{grad_output_contig, "grad_output", 2},
weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
save_var{save_var_t, "save_var", 5};
CheckedFrom c = "miopen_batch_norm_backward";
checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
@ -195,7 +191,11 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
}
checkAllSameType(c, {input, grad_output});
checkAllSameType(c, {weight, save_mean, save_var});
checkAllContiguous(c, {input, grad_output, save_mean, save_var});
// TODO: is weight required to be contiguous?
checkAllContiguous(c, {save_mean, save_var});
// TODO: TensorArg check should start handling memory format
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
checkDimRange(c, input, 2, 6 /* exclusive */);
checkSameSize(c, input, grad_output);
auto num_features = input->size(1);
@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
mode = miopenBNSpatial;
}
auto grad_input_t = at::empty(input->sizes(), input->options());
auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
auto grad_bias_t = at::empty(weight->sizes(), weight->options());
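A minimal sketch (not from the patch) of what suggest_memory_format() buys here: allocating the output with the input's suggested format keeps a channels-last input channels-last instead of silently producing a default-contiguous result.

#include <ATen/ATen.h>
auto x = at::rand({2, 8, 4, 4}).contiguous(at::MemoryFormat::ChannelsLast);
auto y = at::empty_like(x, x.options(), x.suggest_memory_format());
// y preserves the channels-last layout of x.
TORCH_INTERNAL_ASSERT(y.is_contiguous(at::MemoryFormat::ChannelsLast));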

View File

@ -1798,7 +1798,7 @@
device_guard: False
dispatch:
MkldnnCPU: copy_mkldnn_
SparseCPU, SparseCUDA: copy_sparse_wrapper_
SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
CompositeExplicitAutograd: copy_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
@ -2160,7 +2160,7 @@
variants: function, method
structured_delegate: div.out
dispatch:
SparseCPU, SparseCUDA: div_sparse
SparseCPU, SparseCUDA, SparseMPS: div_sparse
ZeroTensor: div_zerotensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
tags: [core, pointwise]
@ -2170,7 +2170,7 @@
variants: method
structured_delegate: div.out
dispatch:
SparseCPU, SparseCUDA: div_sparse_
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
tags: pointwise
- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -2179,7 +2179,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: div_out
SparseCPU, SparseCUDA: div_out_sparse_zerodim
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
tags: pointwise
- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@ -2187,7 +2187,7 @@
variants: function, method
structured_delegate: div.out_mode
dispatch:
SparseCPU, SparseCUDA: div_sparse
SparseCPU, SparseCUDA, SparseMPS: div_sparse
tags: [core, pointwise]
- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
@ -2195,7 +2195,7 @@
variants: method
structured_delegate: div.out_mode
dispatch:
SparseCPU, SparseCUDA: div_sparse_
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
tags: pointwise
- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
@ -2204,7 +2204,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: div_out_mode
SparseCPU, SparseCUDA: div_out_sparse_zerodim
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor
@ -2768,20 +2768,20 @@
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
SparseCPU, SparseCUDA: floor_divide_sparse
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CPU, CUDA, MPS: floor_divide_
SparseCPU, SparseCUDA: floor_divide_sparse_
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: floor_divide_out
SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
@ -4273,7 +4273,7 @@
structured_delegate: mul.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: mul_sparse
SparseCPU, SparseCUDA, SparseMPS: mul_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
MkldnnCPU: mkldnn_mul
ZeroTensor: mul_zerotensor
@ -4285,7 +4285,7 @@
structured_delegate: mul.out
variants: method
dispatch:
SparseCPU, SparseCUDA: mul_sparse_
SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
MkldnnCPU: mkldnn_mul_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
@ -4299,6 +4299,7 @@
CPU, CUDA, MPS, MTIA: mul_out
SparseCPU: mul_out_sparse_cpu
SparseCUDA: mul_out_sparse_cuda
SparseMPS: mul_out_sparse_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
MkldnnCPU: mkldnn_mul_out
tags: pointwise
@ -5848,7 +5849,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: sum
SparseCPU, SparseCUDA, SparseMeta: sum_coo
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
@ -5859,7 +5860,7 @@
variants: function, method
dispatch:
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA: sum_sparse_coo
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
tags: core
@ -6975,7 +6976,7 @@
CPU, CUDA: sub_out
MPS: sub_out_mps
MTIA: sub_out_mtia
SparseCPU, SparseCUDA: sub_out_sparse
SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
tags: pointwise
- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@ -6983,7 +6984,7 @@
variants: function, method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA: sub_sparse
SparseCPU, SparseCUDA, SparseMPS: sub_sparse
ZeroTensor: sub_zerotensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
tags: [core, pointwise]
@ -6993,7 +6994,7 @@
variants: method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA: sub_sparse_
SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor
@ -10342,7 +10343,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: pow_Tensor_Scalar_out
SparseCPU, SparseCUDA: pow_out_sparse_scalar
SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
MPS: pow_tensor_scalar_out_mps
tags: pointwise
@ -10351,7 +10352,7 @@
structured_delegate: pow.Tensor_Scalar_out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: pow_sparse_scalar
SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar
tags: [core, pointwise]
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)

View File

@ -2,6 +2,7 @@
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/sparse/ParamUtils.h>
#include <ATen/native/SparseTensorUtils.h>
@ -295,6 +296,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
to exp functions as well as reuse of softmax implementation for
log_softmax.
*/
using accscalar_t = at::acc_type<scalar_t, false>;
auto sparse_dim = input.sparse_dim();
auto indices = input._indices().contiguous();
auto values = input._values().contiguous();
@ -340,14 +342,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
continue;
/* Prepare scratch space */
std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
std::vector<scalar_t> exp_sums_row(nvalues, 0);
std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
std::vector<accscalar_t> exp_sums_row(nvalues, 0);
/* Compute mx */
for (int64_t i : pool_indices) {
auto values_row = values_accessor[i];
for (const auto j : c10::irange(nvalues)) {
mx_row[j] = std::max(mx_row[j], values_row[j]);
mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
}
}
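The accscalar_t change accumulates the running max and exp sums in the accumulation type rather than the value type, so half/bfloat16 inputs are reduced in float. A short sketch of what at::acc_type resolves to on CPU for these dtypes (illustrative, compile-time only):

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <type_traits>
static_assert(std::is_same_v<at::acc_type<at::Half, /*is_cuda=*/false>, float>);
static_assert(std::is_same_v<at::acc_type<at::BFloat16, /*is_cuda=*/false>, float>);
static_assert(std::is_same_v<at::acc_type<double, /*is_cuda=*/false>, double>);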

View File

@ -391,13 +391,13 @@ void _validate_sparse_coo_tensor_args(
int64_t sparse_dim = indices.size(0);
int64_t dense_dim = values.dim() - 1;
TORCH_CHECK(
static_cast<int64_t>(size.size()) == sparse_dim + dense_dim,
"number of dimensions must be sparse_dim (",
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
size.size(),
", sparse_dim = ",
sparse_dim,
") + dense_dim (",
dense_dim,
"), but got ",
size.size());
", dense_dim = ",
dense_dim);
if (check_pinning) {
TORCH_CHECK(

View File

@ -10,6 +10,7 @@
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/mul_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#include <ATen/ops/result_type.h>
@ -20,10 +21,265 @@
namespace at::native {
using namespace at::sparse;
using namespace mps;
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Mul_metallib.h>
#endif
Tensor& add_out_dense_sparse_mps(
static SparseTensor& mul_out_dense_sparse_mps(
const Tensor& dense,
const Tensor& sparse,
SparseTensor& out) {
TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());
const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
TORCH_CHECK(dense.is_mps() || scalar_like,
"mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());
const int64_t nnz = sparse._nnz();
out.resize_as_(sparse);
auto commonDtype = at::result_type(dense, sparse);
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
auto indices = sparse._indices().contiguous();
auto values = sparse._values().to(commonDtype).contiguous();
if (nnz == 0) {
auto empty_vals = values.narrow(0, 0, 0);
alias_into_sparse(out,
indices.narrow(1, 0, 0),
(out.scalar_type() == commonDtype) ? empty_vals
: empty_vals.to(out.scalar_type()));
out._coalesced_(sparse.is_coalesced());
return out;
}
if (scalar_like) {
auto scalar = dense;
if (dense.numel() == 1 && dense.dim() > 0) {
scalar = dense.view({});
}
scalar = scalar.to(values.options());
auto out_vals = values.mul(scalar);
if (out.scalar_type() != commonDtype) {
out_vals = out_vals.to(out.scalar_type());
}
alias_into_sparse(out, indices, out_vals);
out._coalesced_(sparse.is_coalesced());
return out;
}
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
"mul(dense, sparse): sizes must match exactly (no broadcasting): ",
dense.sizes(), " vs ", sparse.sizes());
const int64_t ndim_i = sparse.sparse_dim();
const int64_t ndim = dense.dim();
TORCH_CHECK(
ndim_i <= ndim,
"mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);
// Prepare shapes
int64_t view_rows = 1, view_cols = 1;
for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);
auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
auto out_vals = at::empty_like(values, values.options());
const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
const uint32_t u_nnz = static_cast<uint32_t>(nnz);
const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
auto computeEncoder = stream->commandEncoder();
[computeEncoder setComputePipelineState:pso];
const uint32_t gridWidth = u_view_cols;
const uint32_t gridDepth = u_nnz;
MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);
const uint32_t maxThreadsPerGroup = pso.maxTotalThreadsPerThreadgroup;
const uint32_t tew = pso.threadExecutionWidth;
uint32_t tgWidth = std::min(gridWidth, tew);
MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);
mtl_setArgs(
computeEncoder,
dense_mps,
values,
out_vals,
indices,
sparse.sizes(),
std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
);
[computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
}
});
Tensor final_vals = out_vals;
if (out.scalar_type() != commonDtype) {
final_vals = final_vals.to(out.scalar_type());
}
alias_into_sparse(out, indices, final_vals);
out._coalesced_(sparse.is_coalesced());
return out;
}
SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());
// Dense x sparse fallback (keep dense first)
if (!t_.is_sparse() || !src_.is_sparse()) {
const Tensor& dense = t_.is_sparse() ? src_ : t_;
const Tensor& sparse = t_.is_sparse() ? t_ : src_;
return mul_out_dense_sparse_mps(dense, sparse, r_);
}
TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
"mul(sparse, sparse): must have same sparse_dim, got ",
t_.sparse_dim(), " vs ", src_.sparse_dim());
TORCH_CHECK(t_.sizes().equals(src_.sizes()),
"mul(sparse, sparse): sizes must match exactly (no broadcasting).");
// Coalesce and early-exit on structurally empty operands
auto lhs = t_.coalesce();
auto rhs = src_.coalesce();
const int64_t lhs_nnz = lhs._nnz();
const int64_t rhs_nnz = rhs._nnz();
if (!lhs_nnz || !rhs_nnz) {
r_.resize_as_(lhs);
return r_.zero_();
}
// dtype checks and promotion
auto commonDtype = at::result_type(lhs, rhs);
TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", r_.scalar_type());
const int64_t ndim_i = lhs.sparse_dim();
// ndim_i == 0, at most one structural entry
if (ndim_i == 0) {
r_.resize_as_(lhs);
const bool has = (lhs_nnz && rhs_nnz);
auto out_indices = lhs._indices().narrow(1, 0, has ? 1 : 0);
Tensor lhs_vals = lhs._values().to(commonDtype);
Tensor rhs_vals = rhs._values().to(commonDtype);
lhs_vals = lhs_vals.narrow(0, 0, has ? 1 : 0);
rhs_vals = rhs_vals.narrow(0, 0, has ? 1 : 0);
Tensor out_values = lhs_vals.mul(rhs_vals);
if (r_.scalar_type() != commonDtype) {
out_values = out_values.to(r_.scalar_type());
}
alias_into_sparse(r_, out_indices, out_values);
r_._coalesced_(true);
return r_;
}
// General path, intersect keys, then gather + multiply on GPU
const auto device = r_.device();
auto stream = getCurrentMPSStream();
auto lhs_indices = lhs._indices();
auto rhs_indices = rhs._indices();
auto lhs_values = lhs._values().to(commonDtype);
auto rhs_values = rhs._values().to(commonDtype);
// Flatten sparse indices to keys
auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());
// Intersect sorted keys (search the shorter in the longer)
const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;
auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
auto counter = at::zeros({1}, at::device(device).dtype(kInt));
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
static_cast<uint32_t>(lenB), A_is_lhs);
mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
}
});
const uint32_t M = counter.item<int32_t>(); // number of structural matches
r_.resize_as_(lhs);
auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
auto lhs_match = outA_idx.narrow(0, 0, M);
auto rhs_match = outB_idx.narrow(0, 0, M);
auto out_val_sizes = lhs_values.sizes().vec();
out_val_sizes[0] = static_cast<int64_t>(M);
auto out_values = at::empty(out_val_sizes, lhs_values.options());
const uint32_t cols = static_cast<uint32_t>(
lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc(
"fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
uint32_t tgW = std::min(cols, tew);
MTLSize grid = MTLSizeMake(cols, 1, M);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
lhs_values, rhs_values,
lhs_match, rhs_match,
lhs_indices, out_indices,
out_values,
std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
std::array<uint32_t, 2>{M, cols});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (r_.scalar_type() != commonDtype) {
out_values = out_values.to(r_.scalar_type());
}
alias_into_sparse(r_, out_indices, out_values);
r_._coalesced_(true);
return r_;
}
static Tensor& add_out_dense_sparse_mps(
Tensor& out,
const Tensor& dense,
const SparseTensor& sparse,
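Outside the diff, here is a plain-C++ sketch (hypothetical helper names, no Metal) of the two steps the sparse-sparse path above performs on device: flatten each coalesced COO index column into a scalar key, then intersect the two sorted key lists by binary-searching the shorter list's keys in the longer one.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// indices is [ndim_i][nnz]; sizes holds the sparse-dim extents (row-major flattening).
std::vector<int64_t> flatten_keys(const std::vector<std::vector<int64_t>>& indices,
                                  const std::vector<int64_t>& sizes, int64_t nnz) {
  std::vector<int64_t> keys(nnz);
  for (int64_t i = 0; i < nnz; ++i) {
    int64_t key = 0;
    for (size_t d = 0; d < sizes.size(); ++d) {
      key = key * sizes[d] + indices[d][i];
    }
    keys[i] = key;
  }
  return keys;
}

// Keys from coalesced tensors are sorted, so each key of the shorter list A is
// binary-searched in the longer list B; matches give the value rows to multiply.
std::vector<std::pair<int64_t, int64_t>> intersect_keys(const std::vector<int64_t>& A,
                                                        const std::vector<int64_t>& B) {
  std::vector<std::pair<int64_t, int64_t>> matches;
  for (size_t i = 0; i < A.size(); ++i) {
    auto it = std::lower_bound(B.begin(), B.end(), A[i]);
    if (it != B.end() && *it == A[i]) {
      matches.emplace_back(static_cast<int64_t>(i), it - B.begin());
    }
  }
  return matches;
}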

View File

@ -0,0 +1,150 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
using namespace metal;
template <typename T>
kernel void dense_sparse_mul_kernel(
device const T* dense [[buffer(0)]],
device const T* values [[buffer(1)]],
device T* out_values [[buffer(2)]],
device const long* indices [[buffer(3)]],
device const long* sizes [[buffer(4)]],
constant uint3& sparse_params [[buffer(5)]],
uint3 gid [[thread_position_in_grid]])
{
uint col = gid.x;
uint i = gid.z;
uint nnz = sparse_params.x;
uint ndim_i = sparse_params.y;
uint view_cols = sparse_params.z;
long key = 0;
for (uint d = 0; d < ndim_i; ++d) {
long idx_d = indices[(ulong)d * (ulong)nnz + (ulong)i];
const auto sz_d = sizes[d];
key = key * sz_d + idx_d;
}
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
const auto a = static_cast<float>(values[val_idx]);
const auto b = static_cast<float>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(a * b);
}
kernel void intersect_binary_search(
device const long* keysA [[buffer(0)]],
device const long* keysB [[buffer(1)]],
device long* outA_idx [[buffer(2)]],
device long* outB_idx [[buffer(3)]],
device atomic_uint* counter [[buffer(4)]],
constant uint& lenB [[buffer(5)]],
constant bool& A_is_lhs [[buffer(6)]],
uint3 tid_in_grid [[thread_position_in_grid]])
{
uint gid = tid_in_grid.x;
long key = keysA[gid];
// lower_bound in B
uint lo = 0;
uint hi = lenB;
while (lo < hi) {
uint mid = (lo + hi) >> 1;
long v = keysB[mid];
if (v < key) lo = mid + 1;
else hi = mid;
}
if (lo < lenB && keysB[lo] == key) {
uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
if (A_is_lhs) {
outA_idx[pos] = (long)gid;
outB_idx[pos] = (long)lo;
} else {
outA_idx[pos] = (long)lo;
outB_idx[pos] = (long)gid;
}
}
}
template <typename T>
kernel void fused_gather_mul_kernel(
device const T* lhs_vals [[buffer(0)]],
device const T* rhs_vals [[buffer(1)]],
device const long* lhs_sel [[buffer(2)]],
device const long* rhs_sel [[buffer(3)]],
device const long* lhs_indices [[buffer(4)]],
device long* out_indices [[buffer(5)]],
device T* out_vals [[buffer(6)]],
constant uint2& dims_input [[buffer(7)]],
constant uint2& dims_output [[buffer(8)]],
uint3 gid [[thread_position_in_grid]])
{
const uint col = gid.x;
const uint k = gid.z;
const uint n_dim_i = dims_input.x;
const uint L = dims_input.y;
const uint M = dims_output.x;
const uint view_cols = dims_output.y;
const long iL = lhs_sel[k];
const long iR = rhs_sel[k];
if (col < view_cols) {
const ulong offL = (ulong)iL * (ulong)view_cols + (ulong)col;
const ulong offR = (ulong)iR * (ulong)view_cols + (ulong)col;
const ulong offO = (ulong)k * (ulong)view_cols + (ulong)col;
const float a = (float)lhs_vals[offL];
const float b = (float)rhs_vals[offR];
out_vals[offO] = (T)(a * b);
}
// One thread per match copies the indices column
if (col == 0) {
const ulong uL = (ulong)L;
const ulong uM = (ulong)M;
const ulong src_col = (ulong)iL; // gather from lhs
for (uint d = 0; d < n_dim_i; ++d) {
const long v = lhs_indices[(ulong)d * uL + src_col];
out_indices[(ulong)d * uM + (ulong)k] = v;
}
}
}
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
dense_sparse_mul_kernel<DTYPE>( \
device const DTYPE* dense [[buffer(0)]], \
device const DTYPE* values [[buffer(1)]], \
device DTYPE* out_values [[buffer(2)]], \
device const long* indices [[buffer(3)]], \
device const long* sizes [[buffer(4)]], \
constant uint3& sparse_params [[buffer(5)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_DENSE_SPARSE_MUL(float);
INSTANTIATE_DENSE_SPARSE_MUL(half);
INSTANTIATE_DENSE_SPARSE_MUL(bfloat);
#define INSTANTIATE_FUSED_GATHER_MUL(DTYPE) \
template [[host_name("fused_gather_mul_kernel_" #DTYPE)]] kernel void \
fused_gather_mul_kernel<DTYPE>( \
device const DTYPE* lhs_vals [[buffer(0)]], \
device const DTYPE* rhs_vals [[buffer(1)]], \
device const long* lhs_sel [[buffer(2)]], \
device const long* rhs_sel [[buffer(3)]], \
device const long* lhs_indices [[buffer(4)]], \
device long* out_indices [[buffer(5)]], \
device DTYPE* out_vals [[buffer(6)]], \
constant uint2& dims_input [[buffer(7)]], \
constant uint2& dims_output [[buffer(8)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);

View File

@ -4,6 +4,7 @@ import csv
import functools
import json
import os
import platform
import timeit
from collections import namedtuple
from dataclasses import asdict, dataclass
@ -191,6 +192,11 @@ class BenchmarkRunner:
self.predefined_minimum_secs = 1
self.max_iters = 1e6
self.use_jit = args.use_jit
self.use_compile = args.use_compile
if self.use_jit and self.use_compile:
raise ValueError(
"use_jit and use_compile are mutually exclusive, please specify one."
)
self.num_runs = args.num_runs
self.print_per_iter = False
self.output_csv = args.output_csv
@ -222,7 +228,7 @@ class BenchmarkRunner:
if self.args.operators:
print(f"# {self.args.operators}")
def _print_perf_result(self, reported_run_time_us, test_case):
def _print_perf_result(self, results, test_case):
if self.args.report_aibench:
# Output for AIBench
# Print out per iteration execution time instead of avg time
@ -236,12 +242,14 @@ class BenchmarkRunner:
"type": test_name,
"metric": "latency",
"unit": "us",
"value": str(reported_run_time_us[run]),
"value": str(results["reported_run_time_us"[run]]),
}
)
)
else:
print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}")
print(
f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}"
)
print(
f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}"
)
@ -250,25 +258,33 @@ class BenchmarkRunner:
if self.num_runs > 1:
for run in range(self.num_runs):
print(
f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}"
f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}"
)
print()
else:
print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n")
print(
f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}"
)
print(f"Peak Memory (KB) : {results['peak_memory']}\n")
def _perf_result_to_dict(self, reported_run_time_us, test_case):
def _perf_result_to_dict(self, results, test_case):
"""This function is the parallel of _print_perf_result, which instead of
writing information to terminal, returns a dictionary.
"""
if self.args.report_aibench:
return {}
out = {
"test_name": test_case.test_config.test_name,
"input_config": test_case.test_config.input_config,
"mode": "JIT" if self.use_jit else "Eager",
"runtime": (
"JIT" if self.use_jit else "Compile" if self.use_compile else "Eager"
),
"run": "Backward" if test_case.test_config.run_backward else "Forward",
"latency": round(reported_run_time_us[0], 3),
"latency": round(results["reported_run_time_us"][0], 3),
"latency unit": "us",
"peak memory": results["peak_memory"],
"memory unit": "KB",
}
# parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary
@ -330,6 +346,8 @@ class BenchmarkRunner:
func = test_case.run_forward
if self.use_jit:
func = test_case.run_jit_forward
if self.use_compile:
func = test_case.run_compile_forward
forward_time = timeit.timeit(
functools.partial(func, iters, print_per_iter, cuda_sync), number=1
)
@ -346,7 +364,7 @@ class BenchmarkRunner:
)
return backward_time
def _measure_time(self, launch_test, test_case, iters, print_per_iter):
def _measure_metrics(self, launch_test, test_case, iters, print_per_iter):
"""
This function executes the operator for <iters> iterations and then looks at the time.
If the timing is not significant, the number of iterations is increased before rerunning.
@ -354,8 +372,20 @@ class BenchmarkRunner:
"""
curr_test_total_time = 0
time_trace = []
peak_memory = 0
sample_input = next(iter(test_case.op_bench.inputs.values()))
device = sample_input.device
device_module = torch.get_device_module(device.type)
# TODO: add support for cpu memory measurement
while True:
if hasattr(device_module, "reset_peak_memory_stats"):
device_module.reset_peak_memory_stats(device)
run_time_sec = launch_test(test_case, iters, print_per_iter)
if hasattr(device_module, "synchronize"):
device_module.synchronize(device)
# Memory measurement process
if hasattr(device_module, "max_memory_allocated"):
peak_memory = device_module.max_memory_allocated(device)
curr_test_total_time += run_time_sec
# Analyze time after each run to decide if the result is stable
results_are_significant = self._iteration_result_is_significant(
@ -369,7 +399,13 @@ class BenchmarkRunner:
time_trace.append(report_run_time)
# Print out the time spent in each epoch in ms
if self.args.report_aibench:
mode = "JIT" if self.use_jit else "Eager"
mode = (
"JIT"
if self.use_jit
else "Compile"
if self.use_compile
else "Eager"
)
test_name = "_".join(
[test_case.framework, test_case.test_config.test_name, mode]
)
@ -381,7 +417,7 @@ class BenchmarkRunner:
"metric": "latency",
"unit": "ms",
"value": str(report_run_time / 1e3),
}
},
)
)
if results_are_significant:
@ -391,7 +427,7 @@ class BenchmarkRunner:
# iteration count, and run the benchmark again...
iters = self._predict_num_iter_needed(iters)
reported_run_time_us = np.percentile(np.array(time_trace), 50)
return reported_run_time_us
return reported_run_time_us, peak_memory / 1024
def _check_keep(self, test_flag, cmd_flag):
return cmd_flag is None or test_flag == cmd_flag
@ -478,6 +514,7 @@ class BenchmarkRunner:
self,
perf_list,
output_file,
benchmark_name="PyTorch operator benchmark",
):
"""
Write the result into JSON format, so that it can be uploaded to the benchmark database
@ -495,8 +532,10 @@ class BenchmarkRunner:
input_config = perf_item.get("input_config", "")
run_type = perf_item.get("run")
latency = perf_item.get("latency", 0)
dtype = "float32" # default
peak_memory = perf_item.get("peak memory", 0)
device = perf_item.get("device", "unknown")
dtype = perf_item.get("dtype", "torch.float").split(".")[1]
runtime = perf_item.get("runtime", None)
# Extract mode based on run_type
mode = None
@ -505,6 +544,22 @@ class BenchmarkRunner:
elif run_type == "Backward":
mode = "training"
# Extract use_compile from it
if runtime == "Compile":
use_compile = True
elif runtime == "Eager":
use_compile = False
else:
use_compile = None
device_arch = (
torch.cuda.get_device_name(0)
if device == "cuda"
else platform.processor()
if device == "cpu"
else "unknown"
)
# Create the record
@dataclass
class BenchmarkInfo:
@ -532,12 +587,18 @@ class BenchmarkRunner:
model: ModelInfo
metric: MetricInfo
record = BenchmarkRecord(
# Add record for latency
record_latency = BenchmarkRecord(
benchmark=BenchmarkInfo(
name="PyTorch operator benchmark",
name=benchmark_name,
mode=mode,
dtype=dtype,
extra_info={"input_config": input_config},
extra_info={
"input_config": input_config,
"device": device,
"arch": device_arch,
"use_compile": use_compile,
},
),
model=ModelInfo(
name=test_name, type="micro-benchmark", origins=["pytorch"]
@ -549,8 +610,17 @@ class BenchmarkRunner:
target_value=None,
),
)
records.append(asdict(record_latency))
records.append(asdict(record))
# Add record for peak memory
record_memory = copy.deepcopy(record_latency)
record_memory.metric = MetricInfo(
name="peak memory",
unit="KB",
benchmark_values=[peak_memory],
target_value=None,
)
records.append(asdict(record_memory))
# Write all records to the output file
with open(output_file, "w", encoding="utf-8") as f:
@ -566,6 +636,7 @@ class BenchmarkRunner:
"tag",
"run_backward",
"Execution Time",
"Peak Memory (KB)",
]
if self.args.output_json or self.args.output_json_for_dashboard:
@ -603,13 +674,16 @@ class BenchmarkRunner:
test_case, self.args.warmup_iterations, print_per_iter=False
)
# Actual Execution
reported_time = [
self._measure_time(
results = [
self._measure_metrics(
launch_func, test_case, self.iters, self.print_per_iter
)
for _ in range(self.num_runs)
]
self._print_perf_result(reported_time, test_case)
result_dict = dict()
result_dict["reported_run_time_us"] = [r[0] for r in results]
result_dict["peak_memory"] = results[0][1]
self._print_perf_result(results=result_dict, test_case=test_case)
# output results to csv
self._output_csv(
@ -625,16 +699,17 @@ class BenchmarkRunner:
),
test_case.test_config.tag,
test_case.test_config.run_backward,
reported_time[0],
result_dict["reported_run_time_us"][0],
result_dict["peak_memory"],
],
)
if self.args.output_json or self.args.output_json_for_dashboard:
perf_list.append(
self._perf_result_to_dict(reported_time, test_case)
)
perf_list.append(self._perf_result_to_dict(result_dict, test_case))
if self.args.output_json_for_dashboard:
self._output_json(perf_list, self.args.output_json_for_dashboard)
self._output_json(
perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name
)
if self.args.output_json:
with open(self.args.output_json, "w") as f:

View File

@ -4,6 +4,15 @@ import time
import torch
# Import the C++ extension to register the _consume operator
try:
import benchmark_cpp_extension # noqa: F401
except ImportError as err:
# If the extension isn't built, the script must raise an error
raise ImportError(
"Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ."
) from err
"""PyTorch performance microbenchmarks.
This module contains PyTorch-specific functionalities for performance
@ -71,6 +80,16 @@ class TorchBenchmarkBase(torch.nn.Module):
for _ in range(iters):
torch.ops.operator_benchmark._consume(self.forward_impl())
def forward_impl_eager(self):
# This is to supply the inputs to the forward function which
# will be called in both the eager and compile mode of local runs
return self.forward(*self.get_inputs())
def forward_consume_eager(self, iters: int):
# Eager version of forward_consume without decorators (compilation handled by torch.compile)
for _ in range(iters):
torch.ops.operator_benchmark._consume(self.forward_impl_eager())
def module_name(self):
"""this is used to label the operator being benchmarked"""
if self.user_given_name:
@ -117,18 +136,32 @@ class PyTorchOperatorTestCase:
self.framework = "PyTorch"
self.time_series = []
self._jit_forward_graph = None
self._compile_forward_graph = None
def _generate_jit_forward_graph(self):
"""generate a graph for the forward function via scripting"""
scripted_op_bench = torch.jit.script(self.op_bench)
return scripted_op_bench.forward_consume
def _generate_compile_forward_graph(self):
"""generate a compiled graph for the forward function via torch.compile"""
compiled_forward_consume = torch.compile(
self.op_bench.forward_consume_eager, backend="inductor"
)
return compiled_forward_consume
def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
"""Run the forward path of an op with JIT mode"""
if self._jit_forward_graph is None:
self._jit_forward_graph = self._generate_jit_forward_graph()
self._jit_forward_graph(num_runs)
def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
"""Run the forward path of an op with compile mode"""
if self._compile_forward_graph is None:
self._compile_forward_graph = self._generate_compile_forward_graph()
self._compile_forward_graph(num_runs)
def _print_per_iter(self):
# print last 50 values
length = min(len(self.time_series), 50)
@ -150,14 +183,14 @@ class PyTorchOperatorTestCase:
if print_per_iter:
for _ in range(num_runs):
start_time = time.time()
self.output = self.op_bench.forward_impl()
self.output = self.op_bench.forward_impl_eager()
if cuda_sync:
torch.cuda.synchronize(torch.cuda.current_device())
end_time = time.time()
self.time_series.append((end_time - start_time) * 1e3)
else:
for _ in range(num_runs):
self.output = self.op_bench.forward_impl()
self.output = self.op_bench.forward_impl_eager()
if cuda_sync:
torch.cuda.synchronize(torch.cuda.current_device())

View File

@ -62,6 +62,13 @@ def parse_args():
default=None,
)
parser.add_argument(
"--benchmark-name",
"--benchmark_name",
help="Name of the benchmark to store results to",
default="PyTorch operator benchmark",
)
parser.add_argument(
"--list-tests",
"--list_tests",
@ -135,6 +142,16 @@ def parse_args():
help="Run operators with PyTorch JIT mode",
)
parser.add_argument(
"--use-compile",
"--use_compile",
type=benchmark_utils.str2bool,
nargs="?",
const=True,
default=False,
help="Run operators with PyTorch Compile mode",
)
parser.add_argument(
"--forward-only",
"--forward_only",
@ -162,7 +179,7 @@ def parse_args():
"--output-json-for-dashboard",
"--output_json_for_dashboard",
help="Save results in JSON format for display on the OSS dashboard",
default="False",
default="benchmark-results.json",
)
args, _ = parser.parse_known_args()

View File

@ -948,7 +948,6 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),
@ -2034,7 +2033,6 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),

View File

@ -78,7 +78,7 @@ int device_count_impl(bool fail_if_no_driver) {
"would like to use GPUs, turn off ASAN.");
break;
#endif // C10_ASAN_ENABLED
#if _WIN32 && CUDA_VERSION >= 13000
#if defined(_WIN32) && CUDA_VERSION >= 13000
// Workaround for CUDA-13.0 error handling on Windows, see
// https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585
case cudaErrorNotSupported:

View File

@ -196,20 +196,25 @@ TTarget* assign_ptr_(TTarget* rhs) {
}
}
// Increment needs to be acquire-release to make use_count() and
// unique() reliable.
// The only requirement for refcount increment is that it happens-before
// decrement, so no additional memory ordering is needed.
inline uint32_t atomic_refcount_increment(std::atomic<uint32_t>& refcount) {
return refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
return refcount.fetch_add(1, std::memory_order_relaxed) + 1;
}
// weak_use_count() is only used for testing, so we don't need it to
// be reliable. Relaxed should be fine.
inline uint32_t atomic_weakcount_increment(std::atomic<uint32_t>& weakcount) {
return weakcount.fetch_add(1, std::memory_order_relaxed) + 1;
}
// Both decrements need to be acquire-release for correctness. See
// e.g. std::shared_ptr implementation.
// The requirement is that all modifications to the managed object happen-before
// invocation of the managed object destructor, and that allocation of the
// managed object storage happens-before deallocation of the storage.
//
// To get this ordering, all non-final decrements must synchronize-with the
// final decrement. So all non-final decrements have to store-release while the
// final decrement has to load-acquire, either directly or with the help of
// fences. But it's easiest just to have all decrements be acq-rel. And it turns
// out, on modern architectures and chips, it's also fastest.
inline uint32_t atomic_refcount_decrement(std::atomic<uint32_t>& refcount) {
return refcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
}
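A standalone sketch (not PyTorch code) of the ordering contract the comments above describe: copies bump the count with relaxed ordering, while every decrement uses acq_rel so the final decrement observes all prior writes to the object before the destructor runs.

#include <atomic>

struct Counted {
  std::atomic<uint32_t> refcount{1};
  int payload = 0;
};

void retain(Counted* p) {
  // An increment only needs to happen-before the matching decrement.
  p->refcount.fetch_add(1, std::memory_order_relaxed);
}

void release(Counted* p) {
  // acq_rel: non-final decrements publish their writes (release); the final
  // decrement observes them (acquire) before deleting the object.
  if (p->refcount.fetch_sub(1, std::memory_order_acq_rel) == 1) {
    delete p;
  }
}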
@ -332,7 +337,7 @@ class intrusive_ptr final {
intrusive_ptr() noexcept
: intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
intrusive_ptr(std::nullptr_t) noexcept
/* implicit */ intrusive_ptr(std::nullptr_t) noexcept
: intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
// This constructor will not increase the ref counter for you.
@ -445,14 +450,14 @@ class intrusive_ptr final {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->refcount_.load(std::memory_order_acquire);
return target_->refcount_.load(std::memory_order_relaxed);
}
uint32_t weak_use_count() const noexcept {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->weakcount_.load(std::memory_order_acquire);
return target_->weakcount_.load(std::memory_order_relaxed);
}
bool unique() const noexcept {
@ -851,14 +856,14 @@ class weak_intrusive_ptr final {
return 0;
}
return target_->refcount_.load(
std::memory_order_acquire); // refcount, not weakcount!
std::memory_order_relaxed); // refcount, not weakcount!
}
uint32_t weak_use_count() const noexcept {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->weakcount_.load(std::memory_order_acquire);
return target_->weakcount_.load(std::memory_order_relaxed);
}
bool expired() const noexcept {
@ -866,18 +871,22 @@ class weak_intrusive_ptr final {
}
intrusive_ptr<TTarget, NullType> lock() const noexcept {
if (expired()) {
if (target_ == NullType::singleton()) {
return intrusive_ptr<TTarget, NullType>();
} else {
auto refcount = target_->refcount_.load(std::memory_order_seq_cst);
auto refcount = target_->refcount_.load(std::memory_order_relaxed);
do {
if (refcount == 0) {
// Object already destructed, no strong references left anymore.
// Return nullptr.
return intrusive_ptr<TTarget, NullType>();
}
} while (
!target_->refcount_.compare_exchange_weak(refcount, refcount + 1));
} while (!target_->refcount_.compare_exchange_weak(
refcount,
refcount + 1,
std::memory_order_acquire,
std::memory_order_relaxed));
return intrusive_ptr<TTarget, NullType>(
target_, raw::DontIncreaseRefcount{});
}
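For the weak-to-strong upgrade in lock(), the pattern is an increment-only-if-nonzero CAS loop. A hedged standalone sketch (hypothetical try_retain helper, mirroring the acquire-on-success / relaxed-on-failure orderings above):

#include <atomic>

bool try_retain(std::atomic<uint32_t>& refcount) {
  uint32_t count = refcount.load(std::memory_order_relaxed);
  do {
    if (count == 0) {
      return false; // object already destructed; a weak ref cannot revive it
    }
  } while (!refcount.compare_exchange_weak(
      count, count + 1, std::memory_order_acquire, std::memory_order_relaxed));
  return true;
}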

View File

@ -540,16 +540,23 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
endif()
endif()
if(USE_CUDA OR USE_ROCM)
append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS)
endif()
if(USE_CUDA)
# eventually do rocm
append_filelist("libtorch_nativert_cuda_sources" Caffe2_GPU_SRCS)
endif()
if(USE_CUDA)
list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
@ -566,6 +573,7 @@ if(USE_CUDA)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
@ -591,6 +599,7 @@ if(USE_CUDA)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
@ -622,10 +631,12 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
@ -1345,11 +1356,13 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
endif()
@ -1453,40 +1466,46 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()
if(USE_GLOO AND USE_C10D_GLOO)
# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
endif()
if(NOT INTERN_BUILD_MOBILE)
@ -1830,6 +1849,12 @@ if(BUILD_TEST)
target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::undefined)
endif()
endif()
if(USE_LSAN AND TARGET Sanitizer::leak)
target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::leak)
endif()
if(USE_TSAN AND TARGET Sanitizer::thread)
target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::thread)
endif()
else()
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)

View File

@ -108,24 +108,32 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_BUILD_MOBILE)
enable_ubsan()
endif()
if(USE_ASAN OR USE_TSAN)
if(USE_ASAN OR USE_LSAN OR USE_TSAN)
find_package(Sanitizer REQUIRED)
if(USE_ASAN)
if(TARGET Sanitizer::address)
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::address)
else()
message(WARNING "Not ASAN found. Suppress this warning with -DUSE_ASAN=OFF.")
message(WARNING "ASAN not found. Suppress this warning with -DUSE_ASAN=OFF.")
caffe2_update_option(USE_ASAN OFF)
endif()
if(TARGET Sanitizer::undefined)
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::undefined)
endif()
endif()
if(USE_LSAN)
if(TARGET Sanitizer::leak)
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::leak)
else()
message(WARNING "LSAN not found. Suppress this warning with -DUSE_LSAN=OFF.")
caffe2_update_option(USE_LSAN OFF)
endif()
endif()
if(USE_TSAN)
if(TARGET Sanitizer::thread)
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::thread)
else()
message(WARNING "Not TSAN found. Suppress this warning with -DUSE_TSAN=OFF.")
message(WARNING "TSAN not found. Suppress this warning with -DUSE_TSAN=OFF.")
caffe2_update_option(USE_TSAN OFF)
endif()
endif()
@ -1126,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()
if(USE_TENSORPIPE)
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

View File

@ -66,6 +66,7 @@ function(caffe2_print_configuration_summary)
message(STATUS " LAPACK : ${LAPACK_INFO}")
endif()
message(STATUS " USE_ASAN : ${USE_ASAN}")
message(STATUS " USE_LSAN : ${USE_LSAN}")
message(STATUS " USE_TSAN : ${USE_TSAN}")
message(STATUS " USE_CPP_CODE_COVERAGE : ${USE_CPP_CODE_COVERAGE}")
message(STATUS " USE_CUDA : ${USE_CUDA}")
@ -191,11 +192,13 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()

View File

@ -2,6 +2,10 @@
Since PyTorch 2.1, the community has made significant progress in streamlining the process of integrating new accelerators into the PyTorch ecosystem. These improvements include, but are not limited to: refinements to the `PrivateUse1` Dispatch Key, the introduction and enhancement of core subsystem extension mechanisms, and the device-agnostic refactoring of key modules (e.g., `torch.accelerator`, `memory management`). Taken together, these advances provide the foundation for a **robust**, **flexible**, and **developer-friendly** pathway for accelerator integration.
```{note}
This guide is a work in progress. For more details, please refer to the [roadmap](https://github.com/pytorch/pytorch/issues/158917).
```
## Why Does This Matter?
This integration pathway offers several major benefits:
@ -10,16 +14,6 @@ This integration pathway offers several major benefits:
* **Future-proofing**: This is the default integration path for all future PyTorch features, meaning that as new modules and features are added, they will automatically support scaling to new accelerators if this path is followed.
* **Autonomy**: Vendors maintain full control over their accelerator integration timelines, enabling fast iteration cycles and reducing reliance on upstream coordination.
## About This Document
This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerator in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation.
The goal is to help developers:
* Understand the full scope of accelerator integration;
* Follow best practices to quickly launch new accelerators;
* Avoid common pitfalls through clear, targeted examples.
## Target Audience
This document is intended for:
@ -27,20 +21,22 @@ This document is intended for:
* **Accelerator Developers** who are integrating accelerators into PyTorch;
* **Advanced PyTorch Users** interested in the inner workings of key modules;
## Quick Overview
## About This Document
This document outlines the key processes and practical scenarios involved in integrating new devices into PyTorch, providing developers with a comprehensive and detailed guide for bringing up new backends. The discussion is structured around four major axes:
This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerators in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation, and this series is structured around four major axes:
* **Runtime**: Covers core components such as Event, Stream, Memory, Generator, Guard, Hooks, as well as the supporting C++ scaffolding.
* **Operators**: Covers the minimum necessary set of operators, including forward and backward operators, fallback operators, fallthroughs, STUBs, etc., in both C++ and Python implementations.
* **Python Frontend**: Focuses on Python bindings for modules and device-agnostic APIs.
* **High-level Modules**: Explores integration with major subsystems such as `AMP`, `Compiler`, `ONNX`, `Distributed`, and so on.
Next, we will officially embark on the integration journey for a new PyTorch accelerator.
The goal is to help developers:
```{note}
This guide is a work in progress. For more details, please refer to the [roadmap](https://github.com/pytorch/pytorch/issues/158917).
```
* Understand the full scope of accelerator integration;
* Follow best practices to quickly launch new accelerators;
* Avoid common pitfalls through clear, targeted examples.
Next, we will delve into each chapter of this guide. Each chapter focuses on a key aspect of integration, providing detailed explanations and illustrative examples. Since some chapters build upon previous ones, readers are encouraged to follow the sequence to achieve a more coherent understanding.
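As a concrete anchor for the "Python Frontend" axis above, a minimal, hedged sketch of how a PrivateUse1-based backend is typically surfaced to Python is shown below. The backend name `openreg` and the stub module are illustrative placeholders, not the torch_openreg implementation itself:
```python
# Hedged sketch: expose a PrivateUse1 backend to the Python frontend.
# "openreg" and the stub module below are placeholders for illustration.
import types

import torch

# Give the PrivateUse1 dispatch key a vendor-specific name.
torch.utils.rename_privateuse1_backend("openreg")

# Register a device module so torch.openreg.* and device-agnostic APIs resolve.
openreg = types.ModuleType("torch.openreg")
openreg.is_available = lambda: True   # placeholder runtime query
openreg.device_count = lambda: 1      # placeholder device enumeration
torch._register_device_module("openreg", openreg)

# Generate convenience methods such as Tensor.openreg() and Module.openreg().
torch.utils.generate_methods_for_privateuse1_backend()
```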
```{toctree}
:glob:

View File

@ -3333,6 +3333,13 @@ def coverage_post_process(app, exception):
if not isinstance(app.builder, CoverageBuilder):
return
if not torch.distributed.is_available():
raise RuntimeError(
"The coverage tool cannot run with a version "
"of PyTorch that was built with USE_DISTRIBUTED=0 "
"as this module's API changes."
)
# These are all the modules that have "automodule" in an rst file
# These modules are the ones for which coverage is checked
# Here, we make sure that no module is missing from that list

View File

@ -5,7 +5,7 @@
# Tensor Parallelism - torch.distributed.tensor.parallel
Tensor Parallelism (TP) is built on top of the PyTorch DistributedTensor
(DTensor)[https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md]
([DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md))
and provides different parallelism styles: Colwise, Rowwise, and Sequence Parallelism.
:::{warning}

View File

@ -102,6 +102,7 @@ also be interested in reading our [development wiki](https://github.com/pytorch/
onnx_export
onnx_ops
onnx_verification
onnx_testing
```
### Deprecated APIs

View File

@ -0,0 +1,9 @@
# torch.onnx.testing
```{eval-rst}
.. automodule:: torch.onnx.testing
```
```{eval-rst}
.. autofunction:: torch.onnx.testing.assert_onnx_program
```
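The autofunction entry above is terse; a hedged usage sketch follows. It assumes a dynamo-based `torch.onnx.export` call that returns an `ONNXProgram`, and the small `MLP` module is only for illustration:
```python
# Hedged sketch: exercise torch.onnx.testing.assert_onnx_program on a toy model.
import torch


class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 2)

    def forward(self, x):
        return torch.relu(self.fc(x))


model = MLP().eval()
example_inputs = (torch.randn(3, 4),)

# dynamo=True returns an ONNXProgram rather than writing a file to disk.
onnx_program = torch.onnx.export(model, example_inputs, dynamo=True)

# Re-runs the exported program and compares its outputs against eager execution.
torch.onnx.testing.assert_onnx_program(onnx_program)
```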

View File

@ -1,4 +1,4 @@
if(NOT WIN32)
if(USE_DISTRIBUTED AND NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp

View File

@ -40,26 +40,30 @@ set(NATIVERT_TEST_SRCS
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
${TORCH_ROOT}/torch/nativert/kernels/KernelHandlerRegistry.cpp
${TORCH_ROOT}/torch/nativert/kernels/TritonKernel.cpp
${TORCH_ROOT}/torch/nativert/executor/triton/CpuTritonKernelManager.cpp
${TORCH_ROOT}/torch/nativert/kernels/TritonKernel.cpp
${TORCH_ROOT}/torch/nativert/executor/DelegateExecutor.cpp
)
if(USE_CUDA)
list(APPEND NATIVERT_TEST_SRCS ${TORCH_ROOT}/torch/nativert/executor/triton/CudaTritonKernelManager.cpp)
endif(MSVC)
endif()
add_executable(test_nativert
${TORCH_ROOT}/test/cpp/common/main.cpp
${NATIVERT_TEST_SRCS}
)
if(MSVC)
target_compile_definitions(test_nativert PRIVATE NATIVERT_MSVC_TEST)
endif()
# TODO temporary until we can delete the old gtest polyfills.
target_compile_definitions(test_nativert PRIVATE USE_GTEST)
set(NATIVERT_TEST_DEPENDENCIES torch gtest_main)
target_link_libraries(test_nativert PRIVATE ${CMAKE_DL_LIBS})
target_link_libraries(test_nativert PRIVATE ${NATIVERT_TEST_DEPENDENCIES})
target_link_libraries(test_nativert PRIVATE fmt::fmt-header-only)
target_include_directories(test_nativert PRIVATE ${ATen_CPU_INCLUDE})

View File

@ -6,9 +6,20 @@ using namespace ::testing;
using namespace torch::nativert;
TEST(TritonKernelManagerRegistrationTests, TestRegister) {
#ifndef USE_CUDA
EXPECT_TRUE(create_cuda_triton_kernel_manager == nullptr);
EXPECT_TRUE(TritonKernelManagerRegistry()->Has(at::kCPU));
#ifdef USE_CUDA
#ifdef USE_ROCM
EXPECT_TRUE(TritonKernelManagerRegistry()->Has(at::kHIP));
EXPECT_FALSE(TritonKernelManagerRegistry()->Has(at::kCUDA));
#else
EXPECT_FALSE(create_cuda_triton_kernel_manager == nullptr);
EXPECT_TRUE(TritonKernelManagerRegistry()->Has(at::kCUDA));
EXPECT_FALSE(TritonKernelManagerRegistry()->Has(at::kHIP));
#endif // USE_ROCM
#else
EXPECT_FALSE(TritonKernelManagerRegistry()->Has(at::kCUDA));
EXPECT_FALSE(TritonKernelManagerRegistry()->Has(at::kHIP));
#endif // USE_CUDA
}

View File

@ -28,7 +28,11 @@ from torch.testing._internal.common_fsdp import (
patch_reduce_scatter,
reduce_scatter_with_assert,
)
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TEST_HPU
from torch.testing._internal.common_utils import (
run_tests,
skipIfRocmVersionLessThan,
TEST_HPU,
)
device_type = torch.device(get_devtype())
@ -86,7 +90,7 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
use_shard_placement_fn_vals.append(True)
return use_shard_placement_fn_vals
@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skipIfRocmVersionLessThan((7, 0))
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
def test_compute_dtype(self):
@ -166,7 +170,7 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
self.assertEqual(fsdp_loss, ref_loss)
check_sharded_parity(self, ref_model, model)
@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skipIfRocmVersionLessThan((7, 0))
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
def test_reduce_dtype(self):

View File

@ -7,7 +7,6 @@ import torch.nn as nn
from torch.distributed._tools.mem_tracker import MemTracker
from torch.testing._internal.common_utils import (
run_tests,
skipIfRocm,
skipIfTorchDynamo,
TEST_CUDA,
TEST_XPU,
@ -34,7 +33,6 @@ class TestMemTracker(TestCase):
@unittest.skipIf(
not TEST_CUDA and not TEST_XPU, "Neither CUDA nor XPU is available"
)
@skipIfRocm()
def test_accelerator_tracker_equivalence(
self,
):

View File

@ -0,0 +1,331 @@
#!/usr/bin/env python3
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import os
import signal
from unittest.mock import MagicMock, patch
from torch.distributed.elastic.multiprocessing.api import (
_terminate_process_handler,
PContext,
SignalException,
)
from torch.testing._internal.common_utils import run_tests, TestCase
class SignalHandlingTest(TestCase):
def setUp(self):
# Save original environment variable if it exists
self.original_signals_env = os.environ.get(
"TORCHELASTIC_SIGNALS_TO_HANDLE", None
)
def tearDown(self):
# Restore original environment variable
if self.original_signals_env is not None:
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = self.original_signals_env
elif "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
def test_terminate_process_handler(self):
"""Test that the terminate process handler raises SignalException with the correct signal."""
signum = signal.SIGTERM
with self.assertRaises(SignalException) as cm:
_terminate_process_handler(signum, None)
self.assertEqual(cm.exception.sigval, signal.SIGTERM)
# The signal is represented as a number in the string representation
self.assertIn(f"Process {os.getpid()} got signal: {signum}", str(cm.exception))
@patch("torch.distributed.elastic.multiprocessing.api.threading")
@patch("torch.distributed.elastic.multiprocessing.api.signal")
@patch("torch.distributed.elastic.multiprocessing.api.logger")
def test_start_registers_default_signals(
self, mock_logger, mock_signal, mock_threading
):
"""Test that the start method registers the default signals."""
# Setup
mock_threading.current_thread.return_value = (
mock_threading.main_thread.return_value
)
mock_pcontext = MagicMock(spec=PContext)
# Mock the _stdout_tail and _stderr_tail attributes
mock_pcontext._stdout_tail = MagicMock()
mock_pcontext._stderr_tail = MagicMock()
# Remove environment variable if it exists to test default behavior
if "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
# Call the start method
PContext.start(mock_pcontext)
# Verify that the signal handler was registered for the default signals
expected_signals = ["SIGTERM", "SIGINT", "SIGHUP", "SIGQUIT"]
# Count the number of calls to signal.signal
signal_calls = 0
for call in mock_signal.signal.call_args_list:
args, _ = call
sig, handler = args
signal_calls += 1
# Verify the handler is our _terminate_process_handler
self.assertEqual(handler, _terminate_process_handler)
# Verify we registered the expected number of signals
self.assertEqual(signal_calls, len(expected_signals))
# Verify _start was called
mock_pcontext._start.assert_called_once()
# Verify _stdout_tail.start() and _stderr_tail.start() were called
mock_pcontext._stdout_tail.start.assert_called_once()
mock_pcontext._stderr_tail.start.assert_called_once()
@patch("torch.distributed.elastic.multiprocessing.api.threading")
@patch("torch.distributed.elastic.multiprocessing.api.signal")
@patch("torch.distributed.elastic.multiprocessing.api.logger")
def test_start_registers_custom_signals(
self, mock_logger, mock_signal, mock_threading
):
"""Test that the start method registers custom signals from the environment variable."""
# Setup
mock_threading.current_thread.return_value = (
mock_threading.main_thread.return_value
)
mock_pcontext = MagicMock(spec=PContext)
# Mock the _stdout_tail and _stderr_tail attributes
mock_pcontext._stdout_tail = MagicMock()
mock_pcontext._stderr_tail = MagicMock()
# Set custom signals in the environment variable
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,SIGUSR1,SIGUSR2"
# Call the start method
PContext.start(mock_pcontext)
# Verify that the signal handler was registered for the custom signals
expected_signals = ["SIGTERM", "SIGUSR1", "SIGUSR2"]
# Count the number of calls to signal.signal
signal_calls = 0
for call in mock_signal.signal.call_args_list:
args, _ = call
sig, handler = args
signal_calls += 1
# Verify the handler is our _terminate_process_handler
self.assertEqual(handler, _terminate_process_handler)
# Verify we registered the expected number of signals
self.assertEqual(signal_calls, len(expected_signals))
# Verify _start was called
mock_pcontext._start.assert_called_once()
@patch("torch.distributed.elastic.multiprocessing.api.threading")
@patch("torch.distributed.elastic.multiprocessing.api.signal")
@patch("torch.distributed.elastic.multiprocessing.api.logger")
def test_start_handles_invalid_signals(
self, mock_logger, mock_signal, mock_threading
):
"""Test that the start method handles invalid signals gracefully."""
# Setup
mock_threading.current_thread.return_value = (
mock_threading.main_thread.return_value
)
mock_pcontext = MagicMock(spec=PContext)
# Mock the _stdout_tail and _stderr_tail attributes
mock_pcontext._stdout_tail = MagicMock()
mock_pcontext._stderr_tail = MagicMock()
# Set invalid signals in the environment variable
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,INVALID_SIGNAL"
# Mock the signal module to not have the INVALID_SIGNAL attribute
# but have SIGTERM
mock_signal.SIGTERM = signal.SIGTERM
# Remove INVALID_SIGNAL attribute if it exists
if hasattr(mock_signal, "INVALID_SIGNAL"):
delattr(mock_signal, "INVALID_SIGNAL")
# Call the start method
PContext.start(mock_pcontext)
# Verify that the warning was logged for the invalid signal
# The exact message may vary, so let's check if warning was called with INVALID_SIGNAL
warning_calls = [
call
for call in mock_logger.warning.call_args_list
if "INVALID_SIGNAL" in str(call)
]
self.assertTrue(len(warning_calls) > 0, "Expected warning about INVALID_SIGNAL")
# Verify _start was called
mock_pcontext._start.assert_called_once()
@patch("torch.distributed.elastic.multiprocessing.api.threading")
@patch("torch.distributed.elastic.multiprocessing.api.signal")
@patch("torch.distributed.elastic.multiprocessing.api.logger")
def test_start_handles_windows_signals(
self, mock_logger, mock_signal, mock_threading
):
"""Test that the start method handles Windows-specific signal behavior."""
# Setup
mock_threading.current_thread.return_value = (
mock_threading.main_thread.return_value
)
mock_pcontext = MagicMock(spec=PContext)
# Mock the _stdout_tail and _stderr_tail attributes
mock_pcontext._stdout_tail = MagicMock()
mock_pcontext._stderr_tail = MagicMock()
# Set signals including ones not supported on Windows
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,SIGHUP,SIGUSR1"
# Mock signal attributes
mock_signal.SIGTERM = signal.SIGTERM
mock_signal.SIGHUP = signal.SIGHUP
mock_signal.SIGUSR1 = signal.SIGUSR1
# Mock IS_WINDOWS to be True
with patch("torch.distributed.elastic.multiprocessing.api.IS_WINDOWS", True):
# Mock signal.signal to raise RuntimeError for Windows-unsupported signals
def signal_side_effect(sig, handler):
if sig in [signal.SIGHUP, signal.SIGUSR1]:
raise RuntimeError("Signal not supported on Windows")
mock_signal.signal.side_effect = signal_side_effect
# Call the start method
PContext.start(mock_pcontext)
# Verify that the info was logged for the unsupported signals
# Check if any info calls contain the expected messages
info_calls = [str(call) for call in mock_logger.info.call_args_list]
sighup_logged = any(
"SIGHUP" in call and "Windows" in call for call in info_calls
)
sigusr1_logged = any(
"SIGUSR1" in call and "Windows" in call for call in info_calls
)
self.assertTrue(
sighup_logged,
f"Expected SIGHUP Windows message in info calls: {info_calls}",
)
self.assertTrue(
sigusr1_logged,
f"Expected SIGUSR1 Windows message in info calls: {info_calls}",
)
# Verify _start was called
mock_pcontext._start.assert_called_once()
@patch("torch.distributed.elastic.multiprocessing.api.threading")
@patch("torch.distributed.elastic.multiprocessing.api.logger")
def test_start_not_main_thread(self, mock_logger, mock_threading):
"""Test that the start method warns when not called from the main thread."""
# Setup
mock_threading.current_thread.return_value = MagicMock() # Not the main thread
mock_threading.main_thread.return_value = MagicMock()
mock_pcontext = MagicMock(spec=PContext)
# Mock the _stdout_tail and _stderr_tail attributes
mock_pcontext._stdout_tail = MagicMock()
mock_pcontext._stderr_tail = MagicMock()
# Call the start method
PContext.start(mock_pcontext)
# Verify that the warning was logged
mock_logger.warning.assert_called_with(
"Failed to register signal handlers since torchelastic is running on a child thread. "
"This could lead to orphaned worker processes if the torchrun is terminated."
)
# Verify _start was called
mock_pcontext._start.assert_called_once()
@patch("torch.distributed.elastic.multiprocessing.api.threading")
@patch("torch.distributed.elastic.multiprocessing.api.signal")
@patch("torch.distributed.elastic.multiprocessing.api.logger")
def test_start_supports_sigusr1_and_sigusr2(
self, mock_logger, mock_signal, mock_threading
):
"""Test that the start method properly supports SIGUSR1 and SIGUSR2 signals."""
# Setup
mock_threading.current_thread.return_value = (
mock_threading.main_thread.return_value
)
mock_pcontext = MagicMock(spec=PContext)
# Mock the _stdout_tail and _stderr_tail attributes
mock_pcontext._stdout_tail = MagicMock()
mock_pcontext._stderr_tail = MagicMock()
# Set environment variable to include SIGUSR1 and SIGUSR2
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGUSR1,SIGUSR2"
# Mock signal attributes to have SIGUSR1 and SIGUSR2
mock_signal.SIGUSR1 = signal.SIGUSR1
mock_signal.SIGUSR2 = signal.SIGUSR2
# Call the start method
PContext.start(mock_pcontext)
# Verify that signal.signal was called for both SIGUSR1 and SIGUSR2
signal_calls = mock_signal.signal.call_args_list
registered_signals = [
call[0][0] for call in signal_calls
] # Extract the signal from each call
# Verify both SIGUSR1 and SIGUSR2 were registered
self.assertIn(
signal.SIGUSR1, registered_signals, "SIGUSR1 should be registered"
)
self.assertIn(
signal.SIGUSR2, registered_signals, "SIGUSR2 should be registered"
)
# Verify the correct handler was registered for both signals
for call in signal_calls:
sig, handler = call[0]
if sig in [signal.SIGUSR1, signal.SIGUSR2]:
self.assertEqual(
handler,
_terminate_process_handler,
f"Signal {sig} should use _terminate_process_handler",
)
# Verify that info messages were logged for successful registration
info_calls = [str(call) for call in mock_logger.info.call_args_list]
sigusr1_logged = any(
"SIGUSR1" in call and "Registered signal handler" in call
for call in info_calls
)
sigusr2_logged = any(
"SIGUSR2" in call and "Registered signal handler" in call
for call in info_calls
)
self.assertTrue(
sigusr1_logged,
f"Expected SIGUSR1 registration message in info calls: {info_calls}",
)
self.assertTrue(
sigusr2_logged,
f"Expected SIGUSR2 registration message in info calls: {info_calls}",
)
# Verify _start was called
mock_pcontext._start.assert_called_once()
# Verify _stdout_tail.start() and _stderr_tail.start() were called
mock_pcontext._stdout_tail.start.assert_called_once()
mock_pcontext._stderr_tail.start.assert_called_once()
if __name__ == "__main__":
run_tests()

View File

@ -116,7 +116,6 @@ class DistributedUtilTest(TestCase):
timeout=1,
)
@skipIfRocm
def test_create_store_timeout_on_worker(self):
with self.assertRaises(DistNetworkError):
# use any available port (port 0) since timeout is expected

View File

@ -38,7 +38,6 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
skipIfRocm,
TEST_WITH_DEV_DBG_ASAN,
)
@ -514,7 +513,6 @@ class TestFSDPOptimState(FSDPTest):
continue
self.assertEqual(full_osd_value, ref_osd_pg[name])
@skipIfRocm
@skip_if_lt_x_gpu(2)
@parametrize("state_dict_type", STATE_DICT_TYPES)
@parametrize("use_multiple_param_groups", [False, True])

View File

@ -0,0 +1,100 @@
#!/usr/bin/env python3
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import os
from unittest.mock import MagicMock, patch
from torch.distributed.launcher.api import launch_agent, LaunchConfig
from torch.testing._internal.common_utils import run_tests, TestCase
class LauncherApiTest(TestCase):
def setUp(self):
# Save original environment variable if it exists
self.original_signals_env = os.environ.get(
"TORCHELASTIC_SIGNALS_TO_HANDLE", None
)
def tearDown(self):
# Restore original environment variable
if self.original_signals_env is not None:
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = self.original_signals_env
elif "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
@patch("torch.distributed.launcher.api.LocalElasticAgent")
@patch("torch.distributed.launcher.api.rdzv_registry.get_rendezvous_handler")
def test_launch_agent_sets_signals_env_var(self, mock_get_handler, mock_agent):
"""Test that launch_agent sets the TORCHELASTIC_SIGNALS_TO_HANDLE environment variable."""
# Setup
config = LaunchConfig(
min_nodes=1,
max_nodes=1,
nproc_per_node=1,
signals_to_handle="SIGTERM,SIGUSR1,SIGUSR2",
)
entrypoint = "dummy_script.py"
args = []
# Make sure the environment variable doesn't exist before the test
if "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
# Mock agent.run() to return a MagicMock
mock_agent_instance = MagicMock()
mock_agent_instance.run.return_value = MagicMock(
is_failed=lambda: False, return_values={}
)
mock_agent.return_value = mock_agent_instance
# Call launch_agent
launch_agent(config, entrypoint, args)
# Verify that the environment variable was set correctly
self.assertEqual(
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"], "SIGTERM,SIGUSR1,SIGUSR2"
)
@patch("torch.distributed.launcher.api.LocalElasticAgent")
@patch("torch.distributed.launcher.api.rdzv_registry.get_rendezvous_handler")
def test_launch_agent_default_signals(self, mock_get_handler, mock_agent):
"""Test that launch_agent uses the default signals if not specified."""
# Setup
config = LaunchConfig(
min_nodes=1,
max_nodes=1,
nproc_per_node=1,
# Not specifying signals_to_handle, should use default
)
entrypoint = "dummy_script.py"
args = []
# Make sure the environment variable doesn't exist before the test
if "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
# Mock agent.run() to return a MagicMock
mock_agent_instance = MagicMock()
mock_agent_instance.run.return_value = MagicMock(
is_failed=lambda: False, return_values={}
)
mock_agent.return_value = mock_agent_instance
# Call launch_agent
launch_agent(config, entrypoint, args)
# Verify that the environment variable was set to the default value
self.assertEqual(
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"],
"SIGTERM,SIGINT,SIGHUP,SIGQUIT",
)
if __name__ == "__main__":
run_tests()
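For readers of these tests, a hedged usage sketch of the new knob follows; it assumes the `signals_to_handle` field on `LaunchConfig` introduced by this change and a caller-provided `train.py` entrypoint. Equivalently, `torchrun` users can export `TORCHELASTIC_SIGNALS_TO_HANDLE` directly; the default remains `SIGTERM,SIGINT,SIGHUP,SIGQUIT`:
```python
# Hedged sketch: opt worker processes into SIGUSR1/SIGUSR2 handling.
from torch.distributed.launcher.api import elastic_launch, LaunchConfig

config = LaunchConfig(
    min_nodes=1,
    max_nodes=1,
    nproc_per_node=2,
    # Extra signals on top of the default SIGTERM,SIGINT,SIGHUP,SIGQUIT.
    signals_to_handle="SIGTERM,SIGINT,SIGHUP,SIGQUIT,SIGUSR1,SIGUSR2",
)

# launch_agent exports TORCHELASTIC_SIGNALS_TO_HANDLE before starting workers,
# and PContext.start registers _terminate_process_handler for each named signal.
elastic_launch(config, entrypoint="train.py")()
```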

View File

@ -1,7 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import os
import pathlib
import tempfile
import unittest
@ -33,7 +32,6 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
)
from torch.testing._internal.logging_utils import LoggingTestCase
c10d_functional = torch.ops.c10d_functional
@ -1012,36 +1010,5 @@ class TestDTensorPlacementTypes(DTensorTestBase):
assert_array_equal(expected_is_tensor_empty, is_tensor_empty)
class DTensorLogTest(LoggingTestCase):
def test_dtensor_log(self):
if not torch.distributed.is_available() or not torch.cuda.is_available():
return
env = dict(os.environ)
env["TORCH_LOGS"] = "+dtensor"
env["RANK"] = "0"
env["WORLD_SIZE"] = "1"
env["MASTER_PORT"] = "12345"
env["MASTER_ADDR"] = "localhost"
_, stderr = self.run_process_no_exception(
"""\
import logging
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Shard
mesh = init_device_mesh("cuda", (1,), mesh_dim_names=("dp",))
placements = [Shard(0)]
tensor = torch.randn(12, 8, 8)
dtensor = distribute_tensor(tensor, mesh, placements)
dtensor.max()
""",
env=env,
)
self.assertIn("_dispatch.py", stderr.decode("utf-8"))
self.assertIn("redistribute=False", stderr.decode("utf-8"))
if __name__ == "__main__":
run_tests()

View File

@ -183,7 +183,7 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
)
torch.utils._pytree.register_constant(DeviceMesh)
ep = torch.export.export_for_training(
ep = torch.export.export(
Foo(), (torch.randn(4, 4, dtype=torch.float64),), strict=False
)
self.assertExpectedInline(

View File

@ -1,41 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.placement_types import Shard
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed.fake_pg import FakeStore
class TestFakeDTensor(TestCase):
def test_fake_dtensor_operations(self):
# Use FakeTensorMode to handle CUDA tensors without actual CUDA
fake_mode = FakeTensorMode()
world_size = 4
fake_store = FakeStore()
torch.distributed.init_process_group(
"fake", store=fake_store, rank=0, world_size=world_size
)
device_mesh = torch.distributed.device_mesh.init_device_mesh(
"cuda",
(2, world_size // 2),
)
# Create fake CUDA tensor using FakeTensorMode
with fake_mode:
x = torch.randn(1, 1, device="cuda")
x = DTensor.from_local(x, device_mesh, [Shard(0), Shard(1)])
# Test basic DTensor operations
self.assertIsInstance(x, DTensor)
# Test sum operation
r = x.sum(1)
self.assertIsInstance(r, DTensor)
if __name__ == "__main__":
run_tests()

View File

@ -24,7 +24,7 @@ from torch.distributed.tensor.parallel import (
RowwiseParallel,
SequenceParallel,
)
from torch.testing._internal.common_utils import run_tests, skipIfRocm
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
skip_unless_torch_gpu,
@ -695,7 +695,6 @@ class DistMathOpsTest(DTensorTestBase):
self.assertEqual(grad1_norm.device_mesh, mesh_y)
@with_comms
@skipIfRocm
def test_foreach_add_different_mesh(self):
mesh_shape = (2, self.world_size // 2)
mesh_2d = init_device_mesh(

View File

@ -1,6 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import itertools
import torch
from torch.distributed.tensor import (
DeviceMesh,
@ -93,6 +95,19 @@ class DistTensorOpsTest(DTensorTestBase):
dst_tensor.copy_(src_tensor)
self.assertEqual(dst_dtensor.full_tensor(), dst_tensor)
# as a pointwise op, need to keep Partial placements without redistribute
src_tensor = torch.randn((64, 1))
dst_tensor = torch.zeros(16, 32, 64, 128)
src_specs = [[Partial()]]
dst_specs = [[Partial()]]
for dst_spec, src_spec in zip(dst_specs, src_specs):
src_dtensor = DTensor.from_local(src_tensor, device_mesh, src_spec)
dst_dtensor = DTensor.from_local(dst_tensor, device_mesh, dst_spec)
dst_dtensor.copy_(src_dtensor)
dst_tensor.copy_(src_tensor)
self.assertEqual(dst_dtensor.placements, (Partial(),))
self.assertEqual(dst_dtensor._local_tensor, dst_tensor)
@with_comms
def test_contiguous(self):
device_mesh = self.build_device_mesh()
@ -776,6 +791,36 @@ class DistTensorOpsTest(DTensorTestBase):
dim=split_dim,
)
@with_comms
def test_unbind(self):
device_mesh = self.build_device_mesh()
shard_dims = [0, 1]
unbind_dims = [0, 1]
local_tensor = torch.randn(4, 8, requires_grad=True)
for shard_dim, unbind_dim in itertools.product(shard_dims, unbind_dims):
dist_tensor = distribute_tensor(
local_tensor, device_mesh, (Shard(shard_dim),)
)
if shard_dim == unbind_dim:
with self.assertRaisesRegex(
RuntimeError, "Sharding propagation failed"
):
dist_tensor.unbind(dim=unbind_dim)
else:
unbinded_dist_tensors = dist_tensor.unbind(dim=unbind_dim)
new_shard_dim = shard_dim if shard_dim < unbind_dim else shard_dim - 1
self.assertTrue(
all(
elem.placements[0].is_shard(dim=new_shard_dim)
for elem in unbinded_dist_tensors
)
)
for x, y in zip(
unbinded_dist_tensors, local_tensor.unbind(dim=unbind_dim)
):
self.assertEqual(x.full_tensor(), y)
if __name__ == "__main__":
run_tests()

View File

@ -43,6 +43,7 @@ from torch.testing._internal.common_utils import (
retry_on_connect_failures,
run_tests,
TEST_WITH_DEV_DBG_ASAN,
TEST_XPU,
TestCase,
)
from torch.utils.checkpoint import checkpoint
@ -63,6 +64,8 @@ else:
torch.backends.cuda.matmul.allow_tf32 = False
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def gpus_for_rank(world_size):
"""Multigpu tests are designed to simulate the multi nodes with multi
@ -70,8 +73,9 @@ def gpus_for_rank(world_size):
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
device_count = torch.accelerator.device_count()
visible_devices = list(range(device_count))
gpus_per_process = device_count // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
@ -401,7 +405,7 @@ class CommonDistributedDataParallelTest:
gradient_as_bucket_view=gradient_as_bucket_view,
)
input = torch.randn(global_batch_size, 2).cuda(devices[0])
input = torch.randn(global_batch_size, 2).to(devices[0])
target = torch.randn(global_batch_size, 4)
return model, ddp_model, input, target
@ -435,10 +439,10 @@ class CommonDistributedDataParallelTest:
allow_none_grads=False,
):
# to reproduce the same training results
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
torch.manual_seed(31415)
model = copy.deepcopy(input_model).cuda()
ddp_model = copy.deepcopy(input_model).cuda()
model = copy.deepcopy(input_model).to(device_type)
ddp_model = copy.deepcopy(input_model).to(device_type)
ddp_model = nn.parallel.DistributedDataParallel(
ddp_model,
bucket_cap_mb=1,
@ -554,8 +558,8 @@ class CommonDistributedDataParallelTest:
def _prepare_dummy_data(self):
ddp_bs = 16
bs = ddp_bs * self.world_size
input = torch.rand((bs, 20), device="cuda", requires_grad=True)
target = torch.randn((bs, 20), device="cuda")
input = torch.rand((bs, 20), device=device_type, requires_grad=True)
target = torch.randn((bs, 20), device=device_type)
offset = self.rank * ddp_bs
ddp_input = input[offset : offset + ddp_bs]
ddp_target = target[offset : offset + ddp_bs]
@ -715,7 +719,7 @@ class CommonDistributedDataParallelTest:
Test that checkpointing with weight sharing works.
"""
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
for use_bucket_view, static_graph in product((False, True), (False, True)):
torch.manual_seed(31415)
l1 = nn.Linear(20, 20)
@ -738,7 +742,7 @@ class CommonDistributedDataParallelTest:
same layer twice and having weights shared across layers.
"""
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
for use_bucket_view in (True, False):
self._test_ddp_checkpointing(
self.CheckpointTwiceModuleWeightSharing(),
@ -1162,7 +1166,7 @@ class AbstractCommTest:
# Verify sequence numbers are appropriately incremented
for i in range(10):
t = torch.ones(1, device=torch.cuda.current_device())
t = torch.ones(1, device=device_type)
dist.all_reduce(t, group=process_group)
if not c10d._rank_not_in_group(process_group):
seq_num = self._verify_sequence_number_across_pg(
@ -1193,7 +1197,7 @@ class AbstractCommTest:
self.assertEqual(rank_to_seq_num[0] + 1, rank_to_seq_num[1])
def _test_sequence_num_incremented_default_group(self, backend_name):
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend_name,
@ -1207,7 +1211,7 @@ class AbstractCommTest:
)
def _test_sequence_num_incremented_subgroup(self, backend_name):
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend_name,
@ -1262,8 +1266,8 @@ class AbstractCommTest:
in_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size)))
group = dist.new_group(in_group_ranks)
x = torch.zeros(2, 2).cuda(self.rank)
xs = [torch.zeros(2, 2).cuda(self.rank) for _ in range(len(in_group_ranks))]
x = torch.zeros(2, 2).to(self.rank)
xs = [torch.zeros(2, 2).to(self.rank) for _ in range(len(in_group_ranks))]
if self.rank not in in_group_ranks:
msg = ".*{}.*does not belong to.*"
with self.assertWarnsOnceRegex(UserWarning, msg.format("all_gather")):
@ -1392,7 +1396,7 @@ class AbstractCommTest:
rank=self.rank,
store=store,
)
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# test alltoall_base
tensor = torch.tensor([1, 0, 0, 1], dtype=torch.bool, device=device)
zeros = torch.tensor([0, 0, 0, 0], dtype=torch.bool, device=device)
@ -1574,8 +1578,8 @@ class CommTest(AbstractCommTest, MultiProcessTestCase):
class DummyWork(dist._Work):
def wait(self, timeout=5.0):
if torch.cuda.is_available():
torch.cuda.current_stream().synchronize()
if torch.accelerator.is_available():
torch.accelerator.current_stream().synchronize()
return True
@ -1790,6 +1794,18 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
("cpu:gloo,cuda:nccl", "cpu:gloo,cuda:nccl"),
]
if TEST_XPU:
# Override backend_config_strings_and_expected_values for Intel GPU.
backend_config_strings_and_expected_values[4:10] = [
(dist.Backend.DUMMY, "cpu:dummy,cuda:dummy,xpu:dummy"),
("DUMMY", "cpu:dummy,cuda:dummy,xpu:dummy"),
("dummy", "cpu:dummy,cuda:dummy,xpu:dummy"),
("cpu:dummy,xpu:dummy", "cpu:dummy,xpu:dummy"),
("cpu:dummy,xpu:xccl", "cpu:dummy,xpu:xccl"),
("cpu:gloo,xpu:dummy", "cpu:gloo,xpu:dummy"),
("cpu:gloo,xpu:xccl", "cpu:gloo,xpu:xccl"),
]
for config_str, expected_value in backend_config_strings_and_expected_values:
with self.subTest(config_str):
# ensures these configs strings are valid and no ValueError is raised
@ -1800,6 +1816,8 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
invalid_backend_config_strings = [
"cpu:gloo,cuda:nccl,", # trailing comma
"cpu:gloo,cuda:nccl,cpu:dummy", # duplicate device
"cpu:gloo,xpu:xccl,", # trailing comma
"cpu:gloo,xpu:xccl,cpu:dummy", # duplicate device
]
for config_str in invalid_backend_config_strings:
with self.subTest(config_str):
@ -1814,7 +1832,7 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "6789"
dist.init_process_group(
"cpu:dummy,cuda:dummy", rank=self.rank, world_size=self.world_size
"cpu:dummy,cuda:dummy,xpu:dummy", rank=self.rank, world_size=self.world_size
)
# test all_gather
@ -2053,7 +2071,7 @@ dist.init_process_group(rank=0, world_size=1, store=dist.HashStore())
# correctly dispatched
# TODO: this will be updated in the future to not be backend specific
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# ensure supported devices (cpu, cuda) succeeds during dispatch call
tensor = torch.zeros(2, 2, device=torch.device(device))
# multi tensor collectives
@ -2119,7 +2137,7 @@ dist.init_process_group(rank=0, world_size=1, store=dist.HashStore())
rank=self.rank,
store=store,
)
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# test alltoall_base
input_tensor = torch.ones(2, 2, device=torch.device(device))
output_tensor = torch.zeros(2, 2, device=torch.device(device))
@ -2251,8 +2269,9 @@ class LocalRankTest(MultiProcessTestCase):
if __name__ == "__main__":
assert not torch.cuda._initialized, (
"test_distributed must not have initialized CUDA context on main process"
if device_type != "cpu":
assert not torch.get_device_module()._initialized, (
"test_distributed must not have initialized {device_type} context on main process"
)
run_tests()
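Taken together, the edits in this file replace CUDA-specific calls with the device-agnostic `torch.accelerator` and backend-resolution APIs. A minimal, hedged sketch of that pattern (single rank, reusing the `HashStore` rendezvous already used elsewhere in this file) is:
```python
# Hedged sketch of the device-agnostic setup these tests converge on.
import torch
import torch.distributed as dist

# Resolve the active accelerator (cuda, xpu, ...) or fall back to CPU.
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)  # e.g. gloo/nccl/xccl

dist.init_process_group(backend, rank=0, world_size=1, store=dist.HashStore())
if device_type != "cpu":
    torch.accelerator.set_device_index(0)

t = torch.ones(2, 2, device=device_type)
dist.all_reduce(t)  # numerically a no-op with world_size=1
dist.destroy_process_group()
```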

View File

@ -24,7 +24,7 @@ from torch.distributed._functional_collectives import (
from torch.testing._internal.common_cuda import SM90OrLater
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import ( # type: ignore[attr-defined]
@ -59,7 +59,7 @@ if not dist.is_available():
sys.exit(0)
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
class TestWithNCCL(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
@ -75,13 +75,15 @@ class TestWithNCCL(MultiProcessTestCase):
@property
def device(self) -> torch.device:
return torch.device(f"cuda:{self.rank}")
return torch.device(self.rank)
def _init_process_group(self) -> None:
torch.cuda.set_device(self.device)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
backend = dist.get_default_backend_for_device(self.device.type)
dist.init_process_group(
backend="nccl",
backend=backend,
world_size=self.world_size,
rank=self.rank,
store=store,
@ -273,7 +275,7 @@ class TestWithNCCL(MultiProcessTestCase):
)
# check memory leak
for i in range(1, 10):
mem_usage[i] = torch.cuda.max_memory_allocated()
mem_usage[i] = torch.accelerator.max_memory_allocated()
compiled(arg)
assert mem_usage[9] == mem_usage[8]
@ -370,14 +372,16 @@ class TestWithNCCL(MultiProcessTestCase):
@skip_if_lt_x_gpu(2)
def test_all_to_all_single(self) -> None:
self._init_process_group()
torch.cuda.set_device(self.device)
torch.accelerator.set_device_index(self.rank)
torch.manual_seed(42)
send_sz_matrix = torch.randint(0, 20, (self.world_size, self.world_size))
input_split_sizes = send_sz_matrix[self.rank].tolist()
output_split_sizes = send_sz_matrix[:, self.rank].tolist()
input = torch.full((sum(input_split_sizes),), float(self.rank)).cuda()
input = torch.full((sum(input_split_sizes),), float(self.rank)).to(
self.device.type
)
output = torch.ops._c10d_functional.all_to_all_single(
input,
@ -388,7 +392,7 @@ class TestWithNCCL(MultiProcessTestCase):
output = torch.ops._c10d_functional.wait_tensor(output)
expect = torch.cat(
[
torch.full((sz,), float(rank)).cuda()
torch.full((sz,), float(rank)).to(self.device.type)
for rank, sz in enumerate(output_split_sizes)
]
)
@ -464,7 +468,7 @@ class TestWithNCCL(MultiProcessTestCase):
@fresh_cache()
def test_threading(self):
self._init_process_group()
device = torch.device(f"cuda:{self.rank}")
device = self.device
def func(arg: torch.Tensor) -> torch.Tensor:
buf0 = arg + 42
@ -546,9 +550,9 @@ class TestWithNCCL(MultiProcessTestCase):
return in_grad, w_grad
m, n, k = 128, 256, 64
in_ = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
w = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
out_grad = torch.randn((m, n), device="cuda", dtype=torch.bfloat16)
in_ = torch.randn((m, k), device=self.device.type, dtype=torch.bfloat16)
w = torch.randn((n, k), device=self.device.type, dtype=torch.bfloat16)
out_grad = torch.randn((m, n), device=self.device.type, dtype=torch.bfloat16)
eager_in_grad, eager_w_grad = fp8_rowwise_backward(in_, w, out_grad)
compile_in_grad, compile_w_grad = torch.compile(fp8_rowwise_backward)(
@ -777,7 +781,8 @@ class CompileTest(TestCase):
self.rank = 0
self.world_size = 2
torch.cuda.set_device("cuda:0")
torch.accelerator.set_device_index(0)
self.device = torch.accelerator.current_accelerator()
store = FakeStore()
dist.init_process_group(
@ -803,7 +808,7 @@ class CompileTest(TestCase):
ar1 = funcol.wait_tensor(ar1)
return ar0, ar1
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -836,7 +841,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -851,7 +856,7 @@ class CompileTest(TestCase):
ar1 = [funcol.wait_tensor(out) for out in ar1]
return ar0, ar1
args = [torch.rand(4, 4, device="cuda") for _ in range(2)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(2)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
buf0, buf1, buf2, buf3 = find_buffer_assignments(code)
@ -881,7 +886,7 @@ class CompileTest(TestCase):
# Test aoti
out = AOTIRunnerUtil.run(func, (args,)) # noqa: F841
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -892,7 +897,7 @@ class CompileTest(TestCase):
ar0 = funcol.wait_tensor(ar0)
return ar0
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -917,7 +922,7 @@ class CompileTest(TestCase):
# Expect allocation
return ar0
arg = torch.rand(4, 4, device="cuda").T
arg = torch.rand(4, 4, device=self.device.type).T
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -948,7 +953,7 @@ class CompileTest(TestCase):
buf2 = torch.mm(arg, buf1)
return buf1, buf2
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
buf0, buf1 = find_buffer_assignments(code)
@ -978,7 +983,7 @@ class CompileTest(TestCase):
ag0 = funcol.wait_tensor(ag0)
return ag0
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -995,7 +1000,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1005,7 +1010,7 @@ class CompileTest(TestCase):
ag0 = [funcol.wait_tensor(out) for out in ag0]
return ag0
args = [torch.rand(4, 4, device="cuda") for _ in range(4)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(4)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
(
@ -1029,7 +1034,7 @@ class CompileTest(TestCase):
# Test aoti
out = AOTIRunnerUtil.run(func, (args,)) # noqa: F841
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "This is a GPU test!")
@fresh_cache()
@ -1039,7 +1044,7 @@ class CompileTest(TestCase):
return funcol.wait_tensor(t)
# Test aoti
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -1051,7 +1056,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1061,7 +1066,7 @@ class CompileTest(TestCase):
rs0 = funcol.wait_tensor(rs0)
return rs0
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -1077,7 +1082,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1089,7 +1094,7 @@ class CompileTest(TestCase):
rs0 = [funcol.wait_tensor(out) for out in rs0]
return rs0
args = [torch.rand(4, 4, device="cuda") for _ in range(4)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(4)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
(
@ -1113,7 +1118,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (args,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1142,7 +1147,9 @@ class CompileTest(TestCase):
input_split_sizes = send_sz_matrix[self.rank]
output_split_sizes = send_sz_matrix[:, self.rank].contiguous()
input = torch.full((input_split_sizes.sum().item(),), float(self.rank)).cuda()
input = torch.full((input_split_sizes.sum().item(),), float(self.rank)).to(
self.device.type
)
with torch._dynamo.config.patch(
dynamic_shapes=True,
@ -1176,7 +1183,7 @@ class CompileTest(TestCase):
br1 = funcol.wait_tensor(br1)
return br0, br1
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -1199,7 +1206,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1214,7 +1221,7 @@ class CompileTest(TestCase):
ar1 = funcol.wait_tensor(ar1)
return ar0, ar1
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func, fullgraph=True)
code = run_and_get_triton_code(compiled, arg)

View File

@ -1087,6 +1087,62 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
dist.destroy_process_group()
@requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit")
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
def test_comm_split_group_mixed_backend(self):
# Test `ncclCommSplit` for smaller subgroups of the world when
# we've passed a specific device_id to init_process_group.
store = c10d.FileStore(self.file_name, self.world_size)
device = torch.device(f"cuda:{self.rank}")
# pg = self._create_process_group_nccl(store, self.opts(), device_id=device)
# create nccl processgroup with opts
c10d.init_process_group(
"cpu:gloo,cuda:nccl",
world_size=self.world_size,
rank=self.rank,
store=store,
pg_options=self.opts(),
device_id=device,
)
pg = c10d.distributed_c10d._get_default_group()
backend = pg._get_backend(torch.device(device))
cuda_tensor = torch.full((1,), self.rank).cuda(device)
cpu_tensor = torch.full((1,), self.rank)
# Create subgroup between ranks 0, 1
subg_ranks = [0, 1]
ng1 = c10d.split_group(pg, [subg_ranks])
backend1 = ng1._get_backend(torch.device(device))
# check basic options are the same between parent and child
self.assertEqual(backend.options._timeout, backend1.options._timeout)
self.assertEqual(
backend.options.is_high_priority_stream,
backend1.options.is_high_priority_stream,
)
self.assertEqual(ng1.group_desc, "default_pg:split:0")
# comm split happens eagerly since device_id is passed to init_process_group.
self.assertEqual(backend.comm_split_count(), 1)
# dist.get_process_group_ranks returns the global ranks in the subgroup.
self.assertEqual(
dist.get_process_group_ranks(ng1),
subg_ranks if self.rank in subg_ranks else [],
)
# dist.get_rank(ng1) returns this rank's index within ng1 if it is part of ng1; otherwise, -1
if dist.get_rank(ng1) >= 0:
dist.broadcast(cuda_tensor, dist.get_global_rank(ng1, 0), group=ng1)
self.assertEqual(cuda_tensor, torch.full((1,), 0))
dist.broadcast(cpu_tensor, dist.get_global_rank(ng1, 0), group=ng1)
self.assertEqual(cpu_tensor, torch.full((1,), 0))
ng2 = c10d.split_group(pg, [subg_ranks])
self.assertEqual(ng2.group_desc, "default_pg:split:1")
self.assertEqual(backend.comm_split_count(), 2)
dist.destroy_process_group()
@requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit")
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
def test_non_blocking_init(self):

View File

@ -33,7 +33,6 @@ from torch.testing._internal.common_distributed import (
from torch.testing._internal.common_utils import (
run_tests,
skip_but_pass_in_sandcastle_if,
skipIfRocm,
TEST_WITH_DEV_DBG_ASAN,
)
@ -319,7 +318,6 @@ class ProcessGroupNCCLOpTest(MultiProcContinuousTest):
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
@skipIfRocm()
def test_nccl_watchdog_cudagraph(self):
# test that the watchdog does not crash graphs with disallowed event query
pg = self.pg

View File

@ -29,7 +29,6 @@ from torch.testing._internal.common_distributed import (
requires_accelerator_dist_backend,
)
from torch.testing._internal.common_fsdp import get_devtype
from torch.testing._internal.common_utils import skipIfRocm
from torch.testing._internal.inductor_utils import HAS_GPU
@ -368,7 +367,6 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase):
self.assertTrue(same(out, correct))
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@skipIfRocm
# TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
@patch.object(torch._inductor.config, "compile_threads", 1)
@patch.object(

View File

@ -8,11 +8,7 @@ from dataclasses import dataclass
import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
run_tests,
skipIfRocm,
)
from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests
# So that tests are written in device-agnostic way
@ -63,7 +59,6 @@ class CupyAsTensorTest(MultiProcContinuousTest):
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
@skipIfRocm
def test_cupy_as_tensor(self) -> None:
"""
Test that torch.as_tensor works for cupy array interface

View File

@ -1,6 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import os
import unittest
import torch
import torch.distributed as dist
@ -26,7 +27,7 @@ from torch.distributed.tensor._collective_utils import (
)
from torch.distributed.tensor.placement_types import _Partial, Shard
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
@ -35,6 +36,10 @@ from torch.testing._internal.distributed.fake_pg import FakeProcessGroup, FakeSt
from torch.utils._typing_utils import not_none
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
device_count = torch.accelerator.device_count()
def _set_env_var(addr="localhost", port="25364", world_size=1, rank=0, local_rank=-1):
os.environ["MASTER_ADDR"] = addr
os.environ["MASTER_PORT"] = port
@ -44,6 +49,7 @@ def _set_env_var(addr="localhost", port="25364", world_size=1, rank=0, local_ran
os.environ["LOCAL_RANK"] = f"{local_rank}"
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend.")
class DeviceMeshTestGlooBackend(DTensorTestBase):
@property
def backend(self):
@ -73,14 +79,16 @@ class DeviceMeshSetDeviceTest(DTensorTestBase):
# Set the device on each process before DeviceMesh constructor,
# and device to be different than the default world rank
torch.cuda.set_device((self.rank + 2) % self.world_size)
torch.accelerator.set_device_index((self.rank + 2) % self.world_size)
_set_env_var(world_size=self.world_size, rank=self.rank)
DeviceMesh(self.device_type, mesh_tensor)
self.assertTrue(is_initialized())
# check that the device is set to the correct device
# and respect the previous set_device calls
self.assertEqual(torch.cuda.current_device(), (self.rank + 2) % self.world_size)
self.assertEqual(
torch.accelerator.current_device_idx(), (self.rank + 2) % self.world_size
)
self.destroy_pg()
@skip_if_lt_x_gpu(4)
@ -101,7 +109,7 @@ class DeviceMeshSetDeviceTest(DTensorTestBase):
# check that the device is set to the correct device
# and respect the LOCAL_RANK env var
self.assertEqual(torch.cuda.current_device(), local_rank)
self.assertEqual(torch.accelerator.current_device_idx(), local_rank)
self.destroy_pg()
@skip_if_lt_x_gpu(4)
@ -120,7 +128,7 @@ class DeviceMeshSetDeviceTest(DTensorTestBase):
self.assertTrue(is_initialized())
# check that the device is set to the correct device
self.assertEqual(torch.cuda.current_device(), self.rank)
self.assertEqual(torch.accelerator.current_device_idx(), self.rank)
self.destroy_pg()
@ -222,7 +230,7 @@ class DeviceMeshTest(DTensorTestBase):
@with_comms
def test_device_mesh_2d(self):
mesh_tensor = torch.arange(4).reshape(2, 2)
# construct a cuda device mesh
# construct a device mesh for self.device_type
mesh = DeviceMesh(self.device_type, mesh_tensor)
# check all dim groups
@ -246,19 +254,21 @@ class DeviceMeshTest(DTensorTestBase):
@with_comms
def test_device_mesh_init_backend(self):
mesh = DeviceMesh(self.device_type, [1], _init_backend=False)
mesh = DeviceMesh(
self.device_type, torch.arange(10), _init_backend=False, _rank=5
)
with self.assertRaisesRegex(RuntimeError, "process groups not initialized!"):
mesh.get_group()
# coordinates should always be populated when init_backend is False, as whenever
# we call init_backend we should make sure the default pg has already been created
mesh.get_coordinate()
self.assertEqual(mesh.get_coordinate(), [5])
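A sketch of the behavior asserted here, assuming it runs under the same @with_comms setup as the test; "cpu" is illustrative, and _init_backend/_rank are underscore-prefixed private arguments that may change between releases.
import torch
from torch.distributed.device_mesh import DeviceMesh

mesh = DeviceMesh("cpu", torch.arange(10), _init_backend=False, _rank=5)
print(mesh.get_coordinate())  # [5] -- populated without creating any process groups
mesh.get_group()              # raises RuntimeError: process groups not initialized!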
@unittest.skipIf(not torch.accelerator.is_available(), "No accelerator available!")
def test_fake_pg_device_mesh(self):
fake_store = FakeStore()
init_process_group("fake", store=fake_store, rank=0, world_size=self.world_size)
device_type = "cuda" if torch.cuda.is_available() else "cpu"
mesh = DeviceMesh(device_type, torch.arange(self.world_size))
local_tensor = torch.randn(2, 8)
@ -298,7 +308,7 @@ class DeviceMeshTest(DTensorTestBase):
regex = r"Invalid mesh \[\[0, 1\], \[2, 3\]\] for ProcessGroup with ranks \[0, 1, 2, 3\]"
with self.assertRaisesRegex(ValueError, regex):
DeviceMesh.from_group(
global_pg, "cuda", invalid_mesh, mesh_dim_names=("dim0", "dim1")
global_pg, device_type, invalid_mesh, mesh_dim_names=("dim0", "dim1")
)
device_mesh = init_device_mesh(self.device_type, (2, 2))
@ -318,12 +328,11 @@ class DeviceMeshTest(DTensorTestBase):
# test init_device_mesh with an invalid device type that contains a GPU index
mesh_shape = (2, self.world_size // 2)
init_device_mesh(
"cuda:0", mesh_shape=mesh_shape, mesh_dim_names=("dp", "tp")
f"{device_type}:0", mesh_shape=mesh_shape, mesh_dim_names=("dp", "tp")
)
@with_comms
def test_set_mesh_dim_group_options(self):
device_type = "cuda" if torch.cuda.is_available() else "cpu"
_mesh_resources._set_mesh_dim_group_options(1, "fake", None)
mesh_tensor = torch.arange(4).reshape(2, 2)
@ -339,7 +348,7 @@ class DeviceMeshTestNDim(DTensorTestBase):
@with_comms
def test_device_mesh_nd(self):
# construct a cuda device mesh
# construct a device mesh for self.device_type
mesh_tensor = torch.arange(8).reshape(2, 2, 2)
mesh = DeviceMesh(self.device_type, mesh_tensor)
@ -708,7 +717,9 @@ class TestDeviceMeshGetItem(DTensorTestBase):
with self.assertRaisesRegex(KeyError, "Invalid mesh_dim_name"):
mesh_dim_names = ("DP", "TP")
mesh = init_device_mesh(
self.device_type, (2, 4), mesh_dim_names=mesh_dim_names
self.device_type,
(2, 4),
mesh_dim_names=mesh_dim_names,
)
mesh[child_mesh_dim_name]
@ -823,6 +834,15 @@ class TestDeviceMeshGetItem(DTensorTestBase):
):
mesh_3d["cp", "dp"]
@with_comms
def test_flatten_mesh_1d(self):
mesh_shape = (4,)
mesh_dim_names = ("default",)
mesh_1d = init_device_mesh(
self.device_type, mesh_shape, mesh_dim_names=mesh_dim_names
)
mesh_1d._flatten()
@with_comms
def test_flatten_mesh_3d(self):
mesh_shape = (2, 2, 2)
@ -831,6 +851,13 @@ class TestDeviceMeshGetItem(DTensorTestBase):
self.device_type, mesh_shape, mesh_dim_names=mesh_dim_names
)
# Test flatten into an existing mesh_dim_name inside the mesh
with self.assertRaisesRegex(
RuntimeError,
"already exists for submesh of the DeviceMesh",
):
mesh_3d._flatten("dp")
# Test flatten contiguous dims
dp_cp_mesh = mesh_3d["dp", "cp"]
flattened_dp_cp_mesh = dp_cp_mesh._flatten()
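A sketch of the flatten semantics being checked, assuming the same @with_comms 8-rank world as the test; _flatten is a private helper, and the device type and dim names here are illustrative.
from torch.distributed.device_mesh import init_device_mesh

mesh_3d = init_device_mesh("cpu", (2, 2, 2), mesh_dim_names=("dp", "cp", "tp"))
mesh_3d["dp", "cp"]._flatten()  # flatten two contiguous dims into a single mesh dim
mesh_3d._flatten("dp")          # RuntimeError: "dp" already exists for submesh of the DeviceMesh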
@ -920,7 +947,9 @@ class TestMeshEnv(DTensorTestBase):
@with_comms
def test_get_root_mesh(self):
mesh_3d = init_device_mesh(
self.device_type, (2, 2, 2), mesh_dim_names=("dp", "cp", "tp")
self.device_type,
(2, 2, 2),
mesh_dim_names=("dp", "cp", "tp"),
)
dp_cp_mesh = mesh_3d["dp", "cp"]
@ -968,7 +997,9 @@ class TestMeshEnv(DTensorTestBase):
@with_comms
def test_get_all_submeshes(self):
mesh_2d = init_device_mesh(
self.device_type, (2, 4), mesh_dim_names=("replicate", "shard")
self.device_type,
(2, 4),
mesh_dim_names=("replicate", "shard"),
)
all_submeshes = _mesh_resources._get_all_submeshes(mesh_2d, "replicate")
self.assertEqual(len(all_submeshes), 4)

View File

@ -43,11 +43,12 @@ from torch.testing._internal.common_distributed import (
DynamoDistributedMultiProcTestCase,
DynamoDistributedSingleProcTestCase,
import_transformers_or_skip,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import requires_cuda
from torch.testing._internal.common_utils import skipIfXpu
from torch.testing._internal.inductor_utils import HAS_GPU
from torch.testing._internal.triton_utils import requires_cuda_and_triton
def reset_rng_state():
@ -270,7 +271,15 @@ def get_hf_bert(rank):
except ImportError as e:
raise unittest.SkipTest("Unable to import transformers") from e
batch_size, max_length, config, device = 4, 512, BertConfig(), f"cuda:{rank}"
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
)
batch_size, max_length, config, device = (
4,
512,
BertConfig(),
f"{device_type}:{rank}",
)
model = AutoModelForMaskedLM.from_config(config).to(device)
input_ids = torch.randint(0, config.vocab_size, (batch_size, max_length)).to(device)
decoder_ids = torch.randint(0, config.vocab_size, (batch_size, max_length)).to(
@ -550,8 +559,8 @@ class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
# Are these tests failing? Check and see if TestFakeDistributedSingleProc has a
# single process version; if it's just a problem in the Dynamo distributed
# optimizer, you should be able to repro it single process!
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
class TestMultiProc(DynamoDistributedMultiProcTestCase):
"""
Note: MultiProcTestCase spawns processes per test and is slow.
@ -559,12 +568,16 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
sparingly for integration tests.
"""
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
)
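Spelled out, the device-type detection idiom these tests now rely on (a sketch; which accelerator, if any, is reported depends on the build and hardware):
import torch

acc = torch.accelerator.current_accelerator()  # e.g. device(type='cuda'), device(type='xpu'), or None
device_type = acc.type if acc is not None else "cpu"
x = torch.ones(4, 4, device=f"{device_type}:0")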
@skip_if_lt_x_gpu(2)
@config.patch(optimize_ddp=False, enable_compiler_collectives=True)
def test_ddp_baseline_aot_eager_multiprocess(self):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
self.assertFalse(config.optimize_ddp)
m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_model(f"{self.device_type}:{self.rank}")
m = DDP(m, device_ids=[self.rank])
m = torch.compile(m, backend="aot_eager")
outputs = m(inputs)
@ -632,7 +645,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
self.assertFalse(config.optimize_ddp)
model = MyModel().to(device="cuda")
model = MyModel().to(device=self.device_type)
# Activation checkpointing for Linear layers.
non_reentrant_wrapper = functools.partial(
@ -647,7 +660,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
)
model = DDP(model)
x = torch.randn(10, 64).cuda()
x = torch.randn(10, 64).to(self.device_type)
correct_outputs = model(x)
opt_model = torch.compile(model)
@ -659,14 +672,14 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
def test_fsdp_aot_eager(self):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
# Test with basic FSDP wrapping (outer wrap around whole model)
m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_model(f"{self.device_type}:{self.rank}")
fsdp_m = FSDP(m, use_orig_params=True)
fsdp_m = torch.compile(fsdp_m, backend="aot_eager")
outputs = fsdp_m(inputs)
self.assertTrue(same(correct_outputs, outputs))
# Test with recursive wrapping, nested FSDP around each Linear
m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_model(f"{self.device_type}:{self.rank}")
fsdp_m = FSDP(
m,
auto_wrap_policy=functools.partial(
@ -680,6 +693,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
@skip_if_lt_x_gpu(2)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@requires_cuda_and_triton
def test_ddp_optimizer_cudagraph(self):
class Net(nn.Module):
def __init__(self):
@ -730,7 +744,9 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
from torch._dynamo.utils import counters
counters.clear()
m, inputs, correct_outputs = get_mutating_model(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_mutating_model(
f"{self.device_type}:{self.rank}"
)
fsdp_m = FSDP(m, use_orig_params=True)
fsdp_m = torch.compile(fsdp_m, backend="eager", fullgraph=False)
outputs = fsdp_m(inputs)
@ -748,7 +764,9 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
from torch._dynamo.utils import counters
counters.clear()
m, inputs, correct_outputs = get_forced_getattr_module(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_forced_getattr_module(
f"{self.device_type}:{self.rank}"
)
fsdp_m = FSDP(m, use_orig_params=True)
fsdp_m = torch.compile(fsdp_m, backend="eager", fullgraph=False)
outputs = fsdp_m(inputs)
@ -762,7 +780,9 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
from torch._dynamo.utils import counters
counters.clear()
m, inputs, correct_outputs = get_forced_getattr_module(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_forced_getattr_module(
f"{self.device_type}:{self.rank}"
)
fsdp_m = FSDP(m, use_orig_params=True)
fsdp_m = torch.compile(fsdp_m, backend="eager", fullgraph=False)
outputs = fsdp_m(inputs)
@ -774,14 +794,14 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
def test_fsdp_inductor(self):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
# Test with basic FSDP wrapping (outer wrap around whole model)
m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_model(f"{self.device_type}:{self.rank}")
fsdp_m = FSDP(m, use_orig_params=True)
fsdp_m = torch.compile(fsdp_m, backend="inductor")
outputs = fsdp_m(inputs)
self.assertTrue(same(correct_outputs, outputs))
# Test with recursive wrapping, nested FSDP around each Linear
m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
m, inputs, correct_outputs = get_model(f"{self.device_type}:{self.rank}")
fsdp_m = FSDP(
m,
auto_wrap_policy=functools.partial(
@ -799,7 +819,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
def test_fsdp_activation_checkpointing(self):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
model, inputs = get_toy_model_for_activation_checkpointing(
f"cuda:{self.rank}"
f"{self.device_type}:{self.rank}"
)
is_inner = lambda module: isinstance(module, ToyInnerModel) # noqa: E731
wrap_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=is_inner)
@ -961,7 +981,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
torch._dynamo.utils.clear_compilation_metrics()
# TODO: This should be possible to do inside the function, but
device = f"cuda:{self.rank}"
device = f"{self.device_type}:{self.rank}"
@torch.compile()
def f(x, y):
@ -1181,7 +1201,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
pg = dist.distributed_c10d._get_default_group()
device = f"cuda:{self.rank}"
device = f"{self.device_type}:{self.rank}"
@torch.compile(fullgraph=True)
def f(x):
@ -1196,6 +1216,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
pg = dist.distributed_c10d.GroupMember.NON_GROUP_MEMBER
self.assertEqual(f(x), x + 1)
@skipIfXpu # ProcessGroupXCCL doesn't support _set_default_timeout yet.
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@patch.object(torch._inductor.config, "fx_graph_cache", False)
@patch.object(torch._inductor.config, "fx_graph_remote_cache", False)
@ -1205,7 +1226,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
torch._dynamo.utils.clear_compilation_metrics()
device = f"cuda:{self.rank}"
device = f"{self.device_type}:{self.rank}"
pg = dist.distributed_c10d._get_default_group()
@ -1238,7 +1259,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
w = pg.allreduce(x)
w.wait()
torch.cuda.synchronize(device)
torch.accelerator.synchronize(device)
metrics = torch._dynamo.utils.get_compilation_metrics()
# Number of compiles same on all nodes
@ -1247,6 +1268,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
for r in res[1:]:
self.assertEqual(res[0], r)
@skipIfXpu # ProcessGroupXCCL doesn't support _set_default_timeout yet.
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@patch.object(torch._inductor.config, "fx_graph_cache", True)
@patch.object(torch._inductor.config, "fx_graph_remote_cache", False)
@ -1258,7 +1280,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
with fresh_cache(), _dynamo_dist_per_rank_init(self.rank, self.world_size):
torch._dynamo.utils.clear_compilation_metrics()
device = f"cuda:{self.rank}"
device = f"{self.device_type}:{self.rank}"
pg = dist.distributed_c10d._get_default_group()
@ -1281,7 +1303,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
w = pg.allreduce(x)
w.wait()
torch.cuda.synchronize(device)
torch.accelerator.synchronize(device)
torch._dynamo.reset()
if self.rank == 0:
@ -1298,11 +1320,11 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
w = pg.allreduce(x)
w.wait()
torch.cuda.synchronize(device)
torch.accelerator.synchronize(device)
@requires_nccl()
@requires_cuda
@requires_accelerator_dist_backend(["nccl", "xccl"])
@unittest.skipUnless(torch.accelerator.is_available(), "Requires accelerator")
class TestSingleProc(DynamoDistributedSingleProcTestCase):
"""
Test harness initializes dist process group.
@ -1311,6 +1333,10 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
Use TestMultiProc for things that really need to run on multiple nodes
"""
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
)
def get_model(
self, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5, ctx_manager=None
):
@ -1428,6 +1454,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
self.assertEqual(len(break_reasons), 4)
self.assertTrue(all("DDPOptimizer" in r.reason for r in break_reasons))
@skipIfXpu # XPU device doesn't support flex_attention yet.
@patch.object(config, "optimize_ddp", True)
def test_compiled_flex_attention_full_model_ddp(self):
class Model(torch.nn.Module):
@ -1474,16 +1501,16 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
S = 512
D = 64
device = "cuda"
model = Model(S, H, D)
model.to(device)
model.to(self.device_type)
model = torch.compile(model)
model = DDP(model, device_ids=self.device_ids)
hidden_states = torch.randn(B, S, H * D).to(device)
hidden_states = torch.randn(B, S, H * D).to(self.device_type)
model(hidden_states)
torch.cuda.synchronize()
torch.accelerator.synchronize()
@skipIfXpu # XPU device doesn't support flex_attention yet.
@patch.object(config, "optimize_ddp", True)
def test_compiled_flex_attention_local_ddp(self):
class Model(torch.nn.Module):
@ -1530,15 +1557,14 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
S = 512
D = 64
device = "cuda"
model = Model(S, H, D)
model.to(device)
model.to(self.device_type)
model = torch.compile(model)
model = DDP(model, device_ids=self.device_ids)
hidden_states = torch.randn(B, S, H * D).to(device)
hidden_states = torch.randn(B, S, H * D).to(self.device_type)
model(hidden_states)
torch.cuda.synchronize()
torch.accelerator.synchronize()
@patch.object(config, "optimize_ddp", True)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@ -1787,9 +1813,9 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
a = torch.cos(a)
return a
mod = MockModule().cuda()
mod = MockModule().to(self.device_type)
mod = DDP(mod, bucket_cap_mb=1)
x = torch.randn(N, N, device="cuda", requires_grad=True)
x = torch.randn(N, N, device=self.device_type, requires_grad=True)
args = (x,)
backend = "aot_eager"
@ -1799,7 +1825,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
def test_fsdp_orig_params_assert(self):
# Test with basic FSDP wrapping (outer wrap around whole model)
m, inputs, _ = get_model(f"cuda:{self.rank}")
m, inputs, _ = get_model(f"{self.device_type}:{self.rank}")
fsdp_m = FSDP(m, use_orig_params=False)
# Test is that this function call does not throw an exception.
fsdp_m = torch.compile(fsdp_m)
@ -1845,7 +1871,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
return out
device = f"cuda:{self.rank}"
device = f"{self.device_type}:{self.rank}"
m = ToyModel(
in_feat=10,
hidden_feat=5000,
@ -1892,7 +1918,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
torch._dynamo.reset()
device = f"cuda:{self.rank}"
device = f"{self.device_type}:{self.rank}"
m = ToyModel(
in_feat=10,
hidden_feat=5000,
@ -1933,9 +1959,14 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
class DuplicateModule(nn.Module):
def __init__(self) -> None:
super().__init__()
self._param = torch.randn((3,), device="cuda")
device_type = (
acc.type
if (acc := torch.accelerator.current_accelerator())
else "cpu"
)
self._param = torch.randn((3,), device=device_type)
self._buf = torch.nn.Buffer(
torch.randn((3,), requires_grad=False, device="cuda")
torch.randn((3,), requires_grad=False, device=device_type)
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
@ -1948,7 +1979,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
model = DuplicateModule()
fsdp_model = FSDP(copy.deepcopy(model), use_orig_params=True)
fsdp_model = torch.compile(fsdp_model, backend="aot_eager")
inp = torch.randn((2, 3), device="cuda")
inp = torch.randn((2, 3), device=self.device_type)
local_out = model(inp)
fsdp_out = fsdp_model(inp)
self.assertEqual(local_out, fsdp_out)
@ -1965,8 +1996,13 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
class BufModule(nn.Module):
def __init__(self) -> None:
super().__init__()
device_type = (
acc.type
if (acc := torch.accelerator.current_accelerator())
else "cpu"
)
self._buf = nn.Buffer(
torch.randn((3,), requires_grad=False, device="cuda")
torch.randn((3,), requires_grad=False, device=device_type)
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
@ -1975,7 +2011,12 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self._param = nn.Parameter(torch.randn((1,), device="cuda"))
device_type = (
acc.type
if (acc := torch.accelerator.current_accelerator())
else "cpu"
)
self._param = nn.Parameter(torch.randn((1,), device=device_type))
self._buf_module = BufModule()
# Share the buffer, meaning same tensor but different source
self._buf = self._buf_module._buf
@ -1992,7 +2033,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
fsdp_model = FSDP(Model(), use_orig_params=True)
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
fsdp_model = torch.compile(fsdp_model, backend=cnt)
inp = torch.randn((2, 3), device="cuda")
inp = torch.randn((2, 3), device=self.device_type)
for _ in range(15):
fsdp_model(inp)
# Check for no recompiles (if there were incorrect de-dup guards, then
@ -2011,7 +2052,12 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
super().__init__()
self._use_self = use_self
torch.manual_seed(42) # force `_param` to be deterministic
self._param = nn.Parameter(torch.randn((3,), device="cuda"))
device_type = (
acc.type
if (acc := torch.accelerator.current_accelerator())
else "cpu"
)
self._param = nn.Parameter(torch.randn((3,), device=device_type))
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self._use_self:
@ -2026,7 +2072,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
return x + y
model = ModuleWithStaticMethod(False)
x = torch.randn((2, 3), device="cuda")
x = torch.randn((2, 3), device=self.device_type)
ref_out = model(x)
test_outs: list[torch.Tensor] = []

View File

@ -10,6 +10,7 @@ import torch
import torch._dynamo
import torch._dynamo.logging
import torch._dynamo.test_case
import torch.distributed as c10d
# for some reason importing functional collectives after dynamo breaks collectives handling!
import torch.distributed._functional_collectives as _functional_collectives
@ -37,14 +38,16 @@ from torch.testing._internal.common_distributed import (
DynamoDistributedMultiProcTestCase,
DynamoDistributedSingleProcTestCase,
MultiProcessTestCase,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
requires_cuda,
skipIfRocm,
skipIfXpu,
TEST_XPU,
xfailIf,
)
from torch.testing._internal.inductor_utils import HAS_GPU
from torch.utils._python_dispatch import TorchDispatchMode
@ -57,13 +60,15 @@ def _tolist_with_constrain_as_size(tensor):
return lst
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@instantiate_parametrized_tests
class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
"""
Run correctness checks in multi-proc runner, mark with minimum # GPUs to run under
"""
device = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def get_world_trs(self):
return {
"tag": "",
@ -100,8 +105,11 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
example,
**self.get_world_trs(),
)
t = torch.randn(4, 4, device="cuda")
inputs = (t if self.rank == 0 else torch.zeros(4, 4, device="cuda"), 0)
t = torch.randn(4, 4, device=self.device)
inputs = (
t if self.rank == 0 else torch.zeros(4, 4, device=self.device),
0,
)
eager_out = example(*inputs)
self.assertTrue(same(t, eager_out))
@ -135,7 +143,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
matmul_cat_col,
**self.get_world_trs(),
)
inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 6
inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 6
eager_out = matmul_cat_col(*inputs)
compiled_matmul_cat_col = compile(matmul_cat_col, inputs)
@ -177,7 +185,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
for nelem in [1024, 2048, 4096]:
# CI (Tesla T4) does not support bfloat16 compilation natively,
# using float
x = torch.randn(nelem, device="cuda", dtype=torch.float)
x = torch.randn(nelem, device=self.device, dtype=torch.float)
golden_out = eager_func(x)
for _ in range(3):
@ -215,8 +223,8 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
eager_func,
**self.get_world_trs(),
)
eager_inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 4
inductor_inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 2
eager_inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 4
inductor_inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 2
eager_out = inductor_func(eager_func(*eager_inputs), *inductor_inputs)
compiled_inductor_func = compile(
@ -254,8 +262,8 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
inductor_func,
**self.get_world_trs(),
)
inductor_inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 4
eager_inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 2
inductor_inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 4
eager_inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 2
eager_out = eager_func(inductor_func(*inductor_inputs), *eager_inputs)
compiled_inductor_func = compile(inductor_func, inductor_inputs)
@ -266,7 +274,9 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1728
@skipIfRocm
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1728
def test_eager_async_allreduce_inductor_wait(self):
import torch.distributed as dist
from torch._inductor.utils import run_and_get_code
@ -289,7 +299,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
return y * y
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
x = torch.ones(12800, 12800, device="cuda") + self.rank
x = torch.ones(12800, 12800, device=self.device) + self.rank
self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0)
# NOTE: We run for 10 iterations each, to ensure that the GPU execution is way behind CPU
@ -360,7 +370,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
return (e,)
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
inputs = torch.ones(4, 4, device="cuda") + self.rank
inputs = torch.ones(4, 4, device=self.device) + self.rank
compiled = torch.compile(func)
out = compiled(inputs, **self.get_world_trs())
correct = func(inputs, **self.get_world_trs())
@ -377,7 +387,8 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
inputs = (
# rank0: [0., 1.], rank1: [2., 3.]
torch.arange(2, dtype=torch.float32, device="cuda") + 2 * self.rank,
torch.arange(2, dtype=torch.float32, device=self.device)
+ 2 * self.rank,
[1, 0],
)
compiled = torch.compile(func)
@ -386,7 +397,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
self.assertTrue(same(out, correct))
# rank0: [2., 3.], rank1: [0., 1.]
expected = torch.arange(2, dtype=torch.float32, device="cuda") + 2 * (
expected = torch.arange(2, dtype=torch.float32, device=self.device) + 2 * (
(self.rank - 1 + self.world_size) % self.world_size
)
self.assertEqual(out, expected)
@ -409,9 +420,9 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
return out
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
model = Model().cuda()
model = Model().to(self.device)
model_compiled = torch.compile(model)
inp = torch.tensor([[2, 1, 3, 0]], dtype=torch.long, device="cuda")
inp = torch.tensor([[2, 1, 3, 0]], dtype=torch.long, device=self.device)
out = model_compiled(inp, self.world_size, **self.get_world_trs())
correct = model(inp, self.world_size, **self.get_world_trs())
self.assertTrue(same(out, correct))
@ -426,7 +437,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
func_compiled = torch.compile(func)
inp = torch.tensor(self.rank, dtype=torch.long, device="cuda")
inp = torch.tensor(self.rank, dtype=torch.long, device=self.device)
out = func_compiled(inp, self.world_size)
correct = func(inp, self.world_size)
self.assertTrue(same(out, correct))
@ -448,9 +459,9 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
return out
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
model = Model().cuda()
model = Model().to(self.device)
model_compiled = torch.compile(model)
inp = torch.tensor([[2, 1, 3, 0]], dtype=torch.long, device="cuda")
inp = torch.tensor([[2, 1, 3, 0]], dtype=torch.long, device=self.device)
out = model_compiled(inp, self.world_size, **self.get_world_trs())
correct = model(inp, self.world_size, **self.get_world_trs())
self.assertTrue(same(out, correct))
@ -479,7 +490,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
example,
**self.get_world_trs(),
)
inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 2
inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 2
eager_out = example(*inputs)
compiled_matmul_cat_col = compile(example, inputs)
@ -506,7 +517,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
example,
**self.get_world_trs(),
)
inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 2
inputs = (torch.ones(4, 4, device=self.device) + self.rank,) * 2
eager_out = example(*inputs)
compiled_fn = compile(example, inputs)
@ -560,7 +571,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
dtype=torch.int64,
)
inputs = (
torch.ones(int(row), 5, device="cuda") * (self.rank + 1),
torch.ones(int(row), 5, device=self.device) * (self.rank + 1),
input_split_sizes_tensor,
output_split_sizes_tensor,
)
@ -729,7 +740,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
dtype=torch.int64,
)
inputs = (
torch.ones(int(row), 5, device="cuda", requires_grad=True)
torch.ones(int(row), 5, device=self.device, requires_grad=True)
* (self.rank + 1),
input_split_sizes_tensor,
output_split_sizes_tensor,
@ -792,7 +803,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
inputs = (
torch.ones(self.world_size, self.world_size, device="cuda")
torch.ones(self.world_size, self.world_size, device=self.device)
* (self.rank + 1),
)
trs = self.get_world_trs()
@ -816,8 +827,11 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
@instantiate_parametrized_tests
@requires_nccl()
@requires_cuda
@requires_accelerator_dist_backend(["nccl", "xccl"])
@unittest.skipIf(
not torch.accelerator.is_available(),
"No accelerator is available",
)
class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
"""
Prefer single-proc test runner for basic tests as it is easier to work with.
@ -840,7 +854,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
ar = torch.ops.c10d_functional.wait_tensor(ar)
return ar
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
compiled = torch.compile(func)
out = compiled(inputs, **self.get_world_trs())
@ -875,7 +889,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
other = torch.ones_like(inp) + 22
return ar, other
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
@ -908,7 +922,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
other = torch.ones_like(inp) + 22
return ar, y, other
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
@ -949,7 +963,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
ar = _functional_collectives.all_reduce(inp, "sum", "0")
return ar
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
counter = CompileCounter()
compiled = torch.compile(func, backend=counter)
out = compiled(inputs)
@ -960,12 +974,13 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
self.assertEqual(counter.op_count, 2)
self.assertTrue(same(out, correct))
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_trace_all_gather_tensor(self):
def func(inp):
ar = _functional_collectives.all_gather_tensor(inp, 0, "0")
return ar
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
counter = CompileCounter()
compiled = torch.compile(func, backend=counter)
out = compiled(inputs)
@ -976,6 +991,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
self.assertEqual(counter.op_count, 2)
self.assertTrue(same(out, correct))
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_trace_all_gather_tensor_pg(self):
def func(inp, *, pg):
ar = _functional_collectives.all_gather_tensor(inp, 0, pg)
@ -992,6 +1008,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
self.assertEqual(counter.op_count, 2)
self.assertTrue(same(out, correct))
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_rewrite_dist_all_gather(self):
def func(inp, out, *, pg):
torch.distributed.all_gather_into_tensor(
@ -1017,6 +1034,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
assert counter.op_count == 3
assert same(outputs, correct_outputs)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_rewrite_dist_all_gather_list(self):
def func(inp, out, *, pg):
torch.distributed.all_gather(
@ -1039,6 +1057,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
assert counter.frame_count == 1
assert same(outputs, correct_outputs)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_rewrite_dist_all_gather_args_match(self):
# Duplicated most of the structure from test_dynamo_rewrite_dist_all_gather
# except uses kwargs to ensure rewrite has matching arg names
@ -1067,6 +1086,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
assert counter.op_count == 3
assert same(outputs, correct_outputs)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_rewrite_dist_reduce_scatter(self):
def func(inp, out, *, pg):
torch.distributed.reduce_scatter_tensor(
@ -1234,6 +1254,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
input = torch.ones(2, device=self.device)
compiled(input)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_support_collective_op_with_async_op_False(self):
def func(inp, out, *, pg):
# user explicitly set the attribute `async_op` to False,
@ -1293,12 +1314,13 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
assert counter.op_count == 1
assert same(outputs, correct_outputs)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_trace_reduce_scatter_tensor(self):
def func(inp):
ar = _functional_collectives.reduce_scatter_tensor(inp, "sum", 0, "0")
return ar
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
counter = CompileCounter()
compiled = torch.compile(func, backend=counter)
out = compiled(inputs)
@ -1309,6 +1331,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
self.assertEqual(counter.op_count, 2)
self.assertTrue(same(out, correct))
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
def test_dynamo_trace_allgather_coalesced(self):
def func(inp, *, tag, ranks, group_size):
ar = torch.ops.c10d_functional.all_gather_into_tensor_coalesced(
@ -1316,7 +1339,10 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
)
return ar
inputs = [torch.ones(4, 4, device="cuda"), torch.ones(6, 6, device="cuda")]
inputs = [
torch.ones(4, 4, device=self.device),
torch.ones(6, 6, device=self.device),
]
counter = CompileCounter()
compiled = torch.compile(func, backend=counter)
out = compiled(inputs, **self.get_world_trs())
@ -1336,7 +1362,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
ar = _functional_collectives.all_reduce(inp, "sum", "0")
return ar
input = torch.ones(4, 4, device="cuda", requires_grad=True)
input = torch.ones(4, 4, device=self.device, requires_grad=True)
compiled = torch.compile(
func, backend="aot_eager"
) # inductor bug with single-op allreduce graph
@ -1354,6 +1380,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
out = torch.ops.c10d_functional.all_reduce(x, "sum", **self.get_world_trs())
self.assertEqual(x.size(), out.size())
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False})
def test_inductor_all_gather_coalesced(self):
@ -1373,7 +1400,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
other = torch.ones_like(inp) + 22
return ar0, y, other, ar1
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
@ -1400,6 +1427,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
correct = func(inputs, **self.get_world_trs())
assert same(out, correct), f"{out} vs {correct}"
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False})
def test_inductor_reduce_scatter_coalesced(self):
@ -1419,7 +1447,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
other = torch.ones_like(inp) + 22
return ar0, y, other, ar1
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
@ -1446,6 +1474,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
correct = func(inputs, **self.get_world_trs())
assert same(out, correct), f"{out} vs {correct}"
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
def test_reorder_peak_memory(self):
"""
@ -1467,7 +1496,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
other = torch.ones_like(inp) + 22
return ar0, y, other, ar1
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
# get stats directly from the internal helper without affecting the real pass's signature
node_stats: Optional[dict[BaseSchedulerNode, ReorderInfo]] = None
@ -1639,10 +1668,10 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
return y, ag_0_out, ag_1_out
x = torch.ones(4, 384, device="cuda", dtype=torch.float32)
w = torch.ones(384, 512, device="cuda", dtype=torch.float32)
ag_0 = torch.ones(384, 512, device="cuda", dtype=torch.float32)
ag_1 = torch.ones(384, 512, device="cuda", dtype=torch.float32)
x = torch.ones(4, 384, device=self.device, dtype=torch.float32)
w = torch.ones(384, 512, device=self.device, dtype=torch.float32)
ag_0 = torch.ones(384, 512, device=self.device, dtype=torch.float32)
ag_1 = torch.ones(384, 512, device=self.device, dtype=torch.float32)
inputs = [x, w, ag_0, ag_1]
with torch._inductor.config.patch(
@ -1807,12 +1836,12 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
rs_3_out,
)
x = torch.ones(4, 384, device="cuda", dtype=torch.float32)
w = torch.ones(384, 512, device="cuda", dtype=torch.float32)
ag_0 = torch.ones(1024, 512, device="cuda", dtype=torch.float32)
ag_1 = torch.ones(512, 1024, device="cuda", dtype=torch.float32)
ag_2 = torch.ones(1024, 512, device="cuda", dtype=torch.float32)
ag_3 = torch.ones(512, 1024, device="cuda", dtype=torch.float32)
x = torch.ones(4, 384, device=self.device, dtype=torch.float32)
w = torch.ones(384, 512, device=self.device, dtype=torch.float32)
ag_0 = torch.ones(1024, 512, device=self.device, dtype=torch.float32)
ag_1 = torch.ones(512, 1024, device=self.device, dtype=torch.float32)
ag_2 = torch.ones(1024, 512, device=self.device, dtype=torch.float32)
ag_3 = torch.ones(512, 1024, device=self.device, dtype=torch.float32)
inputs = [x, w, ag_0, ag_1, ag_2, ag_3]
# get stats directly from the internal helper without affecting the real pass's signature
@ -1914,6 +1943,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
node_stat1 = next(it)
self.assertTrue("collective ordering" in node_stat1.limiting_factor)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
def test_reorder_respects_wait_dep(self):
"""
@ -1936,7 +1966,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
# ensure other is not incorrectly aliasing ar's buffer
return ag_1_wait
inputs = torch.ones(4, 4, device="cuda")
inputs = torch.ones(4, 4, device=self.device)
# get stats directly from the internal helper without affecting the real pass's signature
node_stats: Optional[dict[BaseSchedulerNode, ReorderInfo]] = None
@ -1985,7 +2015,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
self.assertEqual(stats.moves, 0)
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
class TestSyncDecisionCrossRanks(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
@ -2001,16 +2031,21 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
@property
def device(self) -> torch.device:
return torch.device(f"cuda:{self.rank}")
device_type = torch.accelerator.current_accelerator().type
return torch.device(f"{device_type}:{self.rank}")
def _init_process_group(self) -> None:
torch._inductor.config.triton.store_cubin = True
torch._inductor.config.debug = True
torch.cuda.set_device(self.device)
torch.get_device_module(self.device).set_device(self.device)
store = torch.distributed.FileStore(self.file_name, self.world_size)
backend = c10d.get_default_backend_for_device(
torch.accelerator.current_accelerator().type
)
torch.distributed.init_process_group(
backend="nccl",
backend=backend,
world_size=self.world_size,
rank=self.rank,
store=store,

View File

@ -7,11 +7,7 @@
import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
run_tests,
skipIfRocm,
)
from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests
# So that tests are written in a device-agnostic way
@ -34,7 +30,6 @@ class P2PIpcTest(MultiProcContinuousTest):
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
@skipIfRocm
def test_p2p_ipc(self) -> None:
"""
Test that cross-process P2P access works, by reducing a tensor,

View File

@ -0,0 +1,90 @@
#!/usr/bin/env python3
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import os
from unittest.mock import MagicMock, patch
import torch.distributed.run as run
from torch.distributed.launcher.api import launch_agent, LaunchConfig
from torch.testing._internal.common_utils import run_tests, TestCase
class RunTest(TestCase):
def setUp(self):
# Save original environment variable if it exists
self.original_signals_env = os.environ.get(
"TORCHELASTIC_SIGNALS_TO_HANDLE", None
)
def tearDown(self):
# Restore original environment variable
if self.original_signals_env is not None:
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = self.original_signals_env
elif "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
def test_signals_to_handle_default(self):
"""Test that the default value for signals_to_handle is correctly set."""
parser = run.get_args_parser()
args = parser.parse_args(["dummy_script.py"])
self.assertEqual(args.signals_to_handle, "SIGTERM,SIGINT,SIGHUP,SIGQUIT")
def test_signals_to_handle_custom(self):
"""Test that a custom value for signals_to_handle is correctly parsed."""
parser = run.get_args_parser()
args = parser.parse_args(
["--signals-to-handle=SIGTERM,SIGUSR1,SIGUSR2", "dummy_script.py"]
)
self.assertEqual(args.signals_to_handle, "SIGTERM,SIGUSR1,SIGUSR2")
def test_config_from_args_signals_to_handle(self):
"""Test that the signals_to_handle argument is correctly passed to LaunchConfig."""
parser = run.get_args_parser()
args = parser.parse_args(
["--signals-to-handle=SIGTERM,SIGUSR1,SIGUSR2", "dummy_script.py"]
)
config, _, _ = run.config_from_args(args)
self.assertEqual(config.signals_to_handle, "SIGTERM,SIGUSR1,SIGUSR2")
@patch("torch.distributed.launcher.api.LocalElasticAgent")
@patch("torch.distributed.launcher.api.rdzv_registry.get_rendezvous_handler")
def test_launch_agent_sets_environment_variable(self, mock_get_handler, mock_agent):
"""Test that launch_agent sets the TORCHELASTIC_SIGNALS_TO_HANDLE environment variable."""
# Setup
config = LaunchConfig(
min_nodes=1,
max_nodes=1,
nproc_per_node=1,
signals_to_handle="SIGTERM,SIGUSR1,SIGUSR2",
)
entrypoint = "dummy_script.py"
args = []
# Make sure the environment variable doesn't exist before the test
if "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
del os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"]
# Mock agent.run() to return a MagicMock
mock_agent_instance = MagicMock()
mock_agent_instance.run.return_value = MagicMock(
is_failed=lambda: False, return_values={}
)
mock_agent.return_value = mock_agent_instance
# Call launch_agent
launch_agent(config, entrypoint, args)
# Verify that the environment variable was set correctly
self.assertEqual(
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"], "SIGTERM,SIGUSR1,SIGUSR2"
)
if __name__ == "__main__":
run_tests()
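For context on what these assertions cover, a sketch of how the new knob is meant to be driven; the elastic agent is mocked in the test above, and train.py is a hypothetical entrypoint.
from torch.distributed.launcher.api import launch_agent, LaunchConfig

config = LaunchConfig(
    min_nodes=1,
    max_nodes=1,
    nproc_per_node=1,
    signals_to_handle="SIGTERM,SIGUSR1,SIGUSR2",
)
launch_agent(config, "train.py", [])
# launch_agent exports the list via TORCHELASTIC_SIGNALS_TO_HANDLE (the variable the
# test asserts on); on the CLI the same knob is --signals-to-handle, defaulting to
# "SIGTERM,SIGINT,SIGHUP,SIGQUIT" per test_signals_to_handle_default above.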

View File

@ -54,6 +54,8 @@ DEFAULT_HOSTNAME = "localhost"
torch.backends.cuda.matmul.allow_tf32 = False
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def gpus_for_rank(world_size):
"""Multigpu tests are designed to simulate the multi nodes with multi
@ -61,8 +63,8 @@ def gpus_for_rank(world_size):
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
visible_devices = list(range(torch.accelerator.device_count()))
gpus_per_process = torch.accelerator.device_count() // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
@ -1174,8 +1176,8 @@ class TestClientProtocol(TestCase):
if __name__ == "__main__":
assert not torch.cuda._initialized, (
"test_distributed must not have initialized CUDA context on main process"
if device_type != "cpu":
assert not torch.get_device_module()._initialized, (
"test_distributed must not have initialized {device_type} context on main process"
)
run_tests()

View File

@ -644,7 +644,7 @@ class SymmMemEmptySetDeviceTest(MultiProcessTestCase):
symm_mem_hdl.barrier()
@runOnRocmArch(MI300_ARCH)
@skipIfRocm
@skip_if_lt_x_gpu(2)
@parametrize("set_device", [True, False])
def test_empty_strided_p2p(self, set_device: bool) -> None:

View File

@ -115,10 +115,13 @@ from torch.testing._internal.common_utils import (
set_default_dtype,
set_rng_seed,
skipIfTorchDynamo,
TEST_XPU,
TestCase,
)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
# load_tests from torch.testing._internal.common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests
@ -1788,18 +1791,21 @@ class TestDistributions(DistributionsTestCase):
).logpmf(sample)
self.assertEqual(log_prob, expected, atol=1e-4, rtol=0)
@unittest.skipIf(not TEST_CUDA, "CUDA not found")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU not found")
def test_zero_excluded_binomial(self):
vals = Binomial(
total_count=torch.tensor(1.0).cuda(), probs=torch.tensor(0.9).cuda()
total_count=torch.tensor(1.0).to(device_type),
probs=torch.tensor(0.9).to(device_type),
).sample(torch.Size((100000000,)))
self.assertTrue((vals >= 0).all())
vals = Binomial(
total_count=torch.tensor(1.0).cuda(), probs=torch.tensor(0.1).cuda()
total_count=torch.tensor(1.0).to(device_type),
probs=torch.tensor(0.1).to(device_type),
).sample(torch.Size((100000000,)))
self.assertTrue((vals < 2).all())
vals = Binomial(
total_count=torch.tensor(1.0).cuda(), probs=torch.tensor(0.5).cuda()
total_count=torch.tensor(1.0).to(device_type),
probs=torch.tensor(0.5).to(device_type),
).sample(torch.Size((10000,)))
# vals should be roughly half zeroes, half ones
assert (vals == 0.0).sum() > 4000
@ -2050,15 +2056,15 @@ class TestDistributions(DistributionsTestCase):
)
torch.set_default_dtype(saved_dtype)
@unittest.skipIf(not TEST_CUDA, "CUDA not found")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU not found")
@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_poisson_gpu_sample(self):
set_rng_seed(1)
for rate in [0.12, 0.9, 4.0]:
self._check_sampler_discrete(
Poisson(torch.tensor([rate]).cuda()),
Poisson(torch.tensor([rate]).to(device_type)),
scipy.stats.poisson(rate),
f"Poisson(lambda={rate}, cuda)",
f"Poisson(lambda={rate}, {device_type})",
failure_rate=1e-3,
)
@ -3490,13 +3496,13 @@ class TestDistributions(DistributionsTestCase):
self._check_log_prob(Gamma(alpha, beta), ref_log_prob)
@unittest.skipIf(not TEST_CUDA, "CUDA not found")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU not found")
@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_gamma_gpu_shape(self):
alpha = torch.randn(2, 3).cuda().exp().requires_grad_()
beta = torch.randn(2, 3).cuda().exp().requires_grad_()
alpha_1d = torch.randn(1).cuda().exp().requires_grad_()
beta_1d = torch.randn(1).cuda().exp().requires_grad_()
alpha = torch.randn(2, 3).to(device_type).exp().requires_grad_()
beta = torch.randn(2, 3).to(device_type).exp().requires_grad_()
alpha_1d = torch.randn(1).to(device_type).exp().requires_grad_()
beta_1d = torch.randn(1).to(device_type).exp().requires_grad_()
self.assertEqual(Gamma(alpha, beta).sample().size(), (2, 3))
self.assertEqual(Gamma(alpha, beta).sample((5,)).size(), (5, 2, 3))
self.assertEqual(Gamma(alpha_1d, beta_1d).sample((1,)).size(), (1, 1))
@ -3527,7 +3533,10 @@ class TestDistributions(DistributionsTestCase):
def test_gamma_gpu_sample(self):
set_rng_seed(0)
for alpha, beta in product([0.1, 1.0, 5.0], [0.1, 1.0, 10.0]):
a, b = torch.tensor([alpha]).cuda(), torch.tensor([beta]).cuda()
a, b = (
torch.tensor([alpha]).to(device_type),
torch.tensor([beta]).to(device_type),
)
self._check_sampler_sampler(
Gamma(a, b),
scipy.stats.gamma(alpha, scale=1.0 / beta),
@ -3973,11 +3982,11 @@ class TestDistributions(DistributionsTestCase):
self.assertEqual(frac_zeros, 0.5, atol=0.05, rtol=0)
self.assertEqual(frac_ones, 0.5, atol=0.05, rtol=0)
@unittest.skipIf(not TEST_CUDA, "CUDA not found")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU not found")
def test_beta_underflow_gpu(self):
set_rng_seed(1)
num_samples = 50000
conc = torch.tensor(1e-2, dtype=torch.float64).cuda()
conc = torch.tensor(1e-2, dtype=torch.float64).to(device_type)
beta_samples = Beta(conc, conc).sample([num_samples])
self.assertEqual((beta_samples == 0).sum(), 0)
self.assertEqual((beta_samples == 1).sum(), 0)

View File

@ -18,7 +18,7 @@ from torch._dynamo.backends.common import aot_autograd
from torch._dynamo.testing import CompileCounterWithBackend
from torch._higher_order_ops.wrap import tag_activation_checkpoint
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import IS_WINDOWS, skipIfHpu, skipIfRocm
from torch.testing._internal.common_utils import IS_WINDOWS, skipIfHpu
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.two_tensor import TwoTensor
@ -1364,7 +1364,6 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
self.assertEqual(out, out_compiled)
self.assertEqual(input.grad, input_compiled.grad)
@skipIfRocm
@requires_cuda_and_triton
def test_autocast_flash_attention(self, device):
def fn(primals_1, primals_2, primals_3):

View File

@ -726,14 +726,14 @@ Call to `torch._dynamo.graph_break()`
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(),
"""\
LOAD_BUILD_CLASS bytecode not supported
Explanation: Dynamo does not support tracing classes that are defined in the compiled region.
Hint: Move the class definition out of the compiled region.
Hint: It may be possible to write Dynamo tracing rules for this code. Please report an issue to PyTorch if you encounter this graph break often and it is causing performance issues.
Attempted to call function marked as skipped
Explanation: Dynamo does not know how to trace the builtin `builtins.__build_class__.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
Hint: If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
Hint: If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
Developer debug context:
Developer debug context: module: builtins, qualname: __build_class__, skip reason: <missing reason>
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0075.html
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0007.html
from user code:
File "test_error_messages.py", line N, in fn

View File

@ -116,6 +116,8 @@ num_guards_executed=0)
const_guard = guards.LAMBDA_GUARD(
root,
functools.partial(equals_match, expected=5),
{},
False,
equals_match_verbose_code_parts(5),
)
self.assertTrue(const_guard(5))
@ -405,10 +407,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(5), ["type(x) == int"])
guard_manager.add_lambda_guard(
functools.partial(ge_match, expected=5),
{},
False,
ge_match_verbose_code_parts(expected=5),
)
guard_manager.add_lambda_guard(
functools.partial(less_match, expected=10),
{},
False,
less_match_verbose_code_parts(expected=10),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 3)
@ -428,10 +434,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getattr_manager("x", "x", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.x),
{},
False,
equals_match_verbose_code_parts(foo.x),
)
guard_manager.getattr_manager("y", "y", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.y),
{},
False,
equals_match_verbose_code_parts(foo.y),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -474,10 +484,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getitem_manager(0, "", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[0]),
{},
False,
equals_match_verbose_code_parts(foo[0]),
)
guard_manager.getitem_manager(1, "", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[1]),
{},
False,
equals_match_verbose_code_parts(foo[1]),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@@ -585,6 +599,8 @@ num_guards_executed=0)
lambda x: isinstance(x, Pair)
and isinstance(x.x, torch.Tensor)
and isinstance(x.y, int),
{},
False,
"global guard fail",
)
@@ -635,6 +651,8 @@ num_guards_executed=0)
)
attr_manager.add_lambda_guard(
lambda x: x == 4,
{},
False,
"Expected value 4",
)
@@ -675,6 +693,8 @@ num_guards_executed=0)
weakref_manager.add_lambda_guard(
lambda x: isinstance(x, torch.Tensor),
{},
False,
"global weakref fail",
)
@@ -694,6 +714,8 @@ num_guards_executed=0)
)
foo_mgr.add_lambda_guard(
lambda x: x == 3,
{},
False,
"Expected value 3",
)
self.assertTrue(guard_manager.check(a))
@@ -779,7 +801,7 @@ num_guards_executed=0)
# Add key-value manager (nothing : {"z" : 3})
self.assertTrue(root.check(f_locals))
dict_mgr.get_key_manager(1, "", nothing, default_mgr_enum).add_lambda_guard(
lambda x: x is nothing, ["x is nothing"]
lambda x: x is nothing, {}, False, ["x is nothing"]
)
self.assertTrue(root.check(f_locals))
value_mgr = dict_mgr.get_value_manager(


@@ -1767,6 +1767,52 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
out = f(MyTuple(a, b))
self.assertTrue(same(a + 1, out))
def test_namedtuple_source_dynamic_attributes(self):
class MyNamedTuple(typing.NamedTuple):
a: torch.Tensor
b: torch.Tensor
class MyNamedTupleSubclass(MyNamedTuple):
pass
@torch.compile(fullgraph=True, backend="eager")
def f(tup):
c = torch.tensor(3.0)
tup.c = c # Add dynamic attribute
return tup
extended_tup = MyNamedTupleSubclass(a=torch.tensor([1.0]), b=torch.tensor(2.0))
result = f(extended_tup)
# Verify the tuple has the expected structure
self.assertEqual(result.a, torch.tensor([1.0]))
self.assertEqual(result.b, torch.tensor(2.0))
self.assertTrue(hasattr(result, "c"))
self.assertEqual(result.c, torch.tensor(3.0))
def test_namedtuple_sourceless_dynamic_attributes(self):
class MyNamedTuple(typing.NamedTuple):
a: torch.Tensor
b: torch.Tensor
class MyNamedTupleSubclass(MyNamedTuple):
pass
@torch.compile(backend="eager")
def f():
# Create namedtuple inside function (sourceless)
tup = MyNamedTupleSubclass(a=torch.tensor([1.0]), b=torch.tensor(2.0))
# Add dynamic attribute
tup.c = torch.tensor(3.0)
return tup
result = f()
# Verify the tuple has the expected structure
self.assertEqual(result.a, torch.tensor([1.0]))
self.assertEqual(result.b, torch.tensor(2.0))
# Verify the dynamic attribute is preserved
self.assertTrue(hasattr(result, "c"))
self.assertEqual(result.c, torch.tensor(3.0))
def test_structseq1(self):
def fn(x, y):
return torch.return_types.max((x, y))
@@ -7161,7 +7207,9 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1
guard_manager = torch._dynamo.guards.RootGuardManager()
guard_manager.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)
def injected(x):
return x + 42
@@ -7186,27 +7234,33 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1
guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(lambda L: isinstance(L["x"], bool), [])
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), {"x": 0}, True, []
)
def injected_bool(x: bool):
return x + 102
guard_manager_int = torch._dynamo.guards.RootGuardManager()
guard_manager_int.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager_int.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)
def injected_int(x: int):
return x + 42
guard_manager_tensor = torch._dynamo.guards.RootGuardManager()
guard_manager_tensor.add_lambda_guard(
lambda L: isinstance(L["x"], torch.Tensor), []
lambda L: isinstance(L["x"], torch.Tensor), {"x": 0}, True, []
)
def injected_tensor(x: torch.Tensor):
return x + 100
guard_manager_str = torch._dynamo.guards.RootGuardManager()
guard_manager_str.add_lambda_guard(lambda L: isinstance(L["x"], str), [])
guard_manager_str.add_lambda_guard(
lambda L: isinstance(L["x"], str), {"x": 0}, True, []
)
def injected_str(x: str):
return x + "1"
@@ -7283,7 +7337,10 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), ["isinstance(L['x'], bool)"]
lambda L: isinstance(L["x"], bool),
{"x": 0},
True,
["isinstance(L['x'], bool)"],
)
def injected_bool(x: bool):
@@ -9547,6 +9604,69 @@ def ___make_guard_fn():
f(torch.randn(9, requires_grad=True), torch.tensor([3, 6]))
@torch._dynamo.config.patch(capture_scalar_outputs=True)
def test_dim_order(self):
@torch.compile(dynamic=False, fullgraph=True, backend="eager")
def f(x):
x = x.permute(3, 0, 2, 1)
return x, x.dim_order()
@torch.compile(dynamic=False, fullgraph=True, backend="eager")
def g(x):
return x.dim_order()
@torch.compile(dynamic=False, fullgraph=True, backend="eager")
def h0(xs, ambiguity_check=False):
u0, u1, u2 = xs.tolist()
torch._check(u2 >= u0)
torch._check(u1 >= u0)
# stride ordering still isn't unique here, should raise
y = torch.empty_strided([4, 4, 4], [u0, u1, u2])
return y.dim_order(ambiguity_check=ambiguity_check)
@torch.compile(dynamic=False, fullgraph=True, backend="eager")
def h1(xs, ambiguity_check=False):
u0, u1, u2 = xs.tolist()
y = torch.empty_strided([4, 4, 4], [u0, u0, u0]) # no ordering
return y.dim_order(ambiguity_check=ambiguity_check)
# check that for functions permuting contiguous input, the original stride is recovered with dim_order.
def test(x):
stride_inp = tuple(x.stride())
f_out, f_order = f(x)
self.assertEqual(stride_inp, tuple(f_out.stride(i) for i in f_order))
# shape: [4, u0, 5, u1]
x0 = torch.randn(4, 1, 5, 2)
torch._dynamo.decorators.mark_unbacked(x0, 1)
torch._dynamo.decorators.mark_unbacked(x0, 3)
test(x0)
# shape: [u0, u1, u2, u3]
x1 = torch.randn(4, 1, 5, 2)
for i in range(x1.ndim):
torch._dynamo.decorators.mark_unbacked(x1, i)
test(x1)
# custom strides (all integers)
x2 = torch.randn(10000)
x2 = x2.as_strided([4, 4, 4, 4], [1, 2, 4, 8])
assert g(x2) == (3, 2, 1, 0)
# custom unbacked strides with no ordering: ambiguity check should raise
xs = torch.tensor([2, 3, 4])
h0(xs)
with self.assertRaisesRegex(
torch._dynamo.exc.TorchRuntimeError,
r"The tensor does not have unique dim order.",
):
h0(xs, ambiguity_check=True)
with self.assertRaisesRegex(
torch._dynamo.exc.TorchRuntimeError,
r"The tensor does not have unique dim order.",
):
h1(xs, ambiguity_check=True)
def test_str_format_assert1(self):
@torch.compile(backend="eager", fullgraph=True)
def fn(img):
@@ -12638,6 +12758,22 @@ fn
self.assertRaises(Unsupported, f, [])
self.assertRaises(Unsupported, f, "1 + j")
def test_compiled_class_graph_break(self):
counter = CompileCounter()
@torch.compile(backend=counter, fullgraph=False)
def f(x):
x += 1
class C:
pass
return x.sin()
x = torch.randn(3)
f(x)
self.assertEqual(counter.frame_count, 2)
class MiscTestsPyTree(torch._inductor.test_case.TestCase):
@parametrize_pytree_module


@@ -60,13 +60,16 @@ from torch.testing._internal.common_cuda import (
SM70OrLater,
TEST_CUDA,
)
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_device_type import (
E4M3_MAX_POS,
e4m3_type,
instantiate_device_type_tests,
)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
serialTest,
skipIfHpu,
skipIfRocm,
skipIfWindows,
TEST_WITH_ROCM,
)
@@ -7500,7 +7503,6 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase):
out = f_compiled(x, s0, s1, s2)
self.assertEqual(out_ref, out)
@skipIfRocm
@unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "requires gpu with fp8 support")
@requires_cuda
def test_partitioner_saves_weights_for_bw(self):
@@ -7512,9 +7514,9 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase):
return a
def scale(t, amax_t):
max_v = torch.finfo(torch.float8_e4m3fn).max
max_v = E4M3_MAX_POS
scale_t = torch.clamp(amax_t.float(), min=1e-12) / max_v
t_fp8 = mul_tiled(t, scale_t.reciprocal()).to(torch.float8_e4m3fn)
t_fp8 = mul_tiled(t, scale_t.reciprocal()).to(e4m3_type)
return t_fp8, scale_t
def matmul(first, amax_first, second_t, amax_second_t, bias):

Some files were not shown because too many files have changed in this diff.