tc

[dtensor] avoid shape recompilations on DTensorSpec (#163820)
Skips DTensorSpec.sizes/strides in metadata guard checks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163820
Approved by: https://github.com/azahed98
2025-10-25 16:14:55 +08:00 · 2025-10-07 13:09:13 -07:00 · 2025-10-03 17:18:18 +00:00 · 2025-10-03 16:31:07 +00:00 · 2025-10-03 16:15:09 +00:00 · 2025-10-03 16:14:32 +00:00
558 changed files with 11113 additions and 4373 deletions
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]:
    return check_output(["ls", "-1", path]).decode().split("\n")


-def build_ArmComputeLibrary() -> None:
-    """
-    Using ArmComputeLibrary for aarch64 PyTorch
-    """
-    print("Building Arm Compute Library")
-    acl_build_flags = [
-        "debug=0",
-        "neon=1",
-        "opencl=0",
-        "os=linux",
-        "openmp=1",
-        "cppthreads=0",
-        "arch=armv8a",
-        "multi_isa=1",
-        "fixed_format_kernels=1",
-        "build=native",
-    ]
-    acl_install_dir = "/acl"
-    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
-    if os.path.isdir(acl_install_dir):
-        shutil.rmtree(acl_install_dir)
-    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
-        check_call(
-            [
-                "git",
-                "clone",
-                "https://github.com/ARM-software/ComputeLibrary.git",
-                "-b",
-                "v25.02",
-                "--depth",
-                "1",
-                "--shallow-submodules",
-            ]
-        )
-
-    check_call(
-        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
-        cwd=acl_checkout_dir,
-    )
-    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
-        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
-
-
 def replace_tag(filename) -> None:
    with open(filename) as f:
        lines = f.readlines()
@ -356,19 +313,13 @@ if __name__ == "__main__":
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

    if enable_mkldnn:
-        build_ArmComputeLibrary()
        print("build pytorch with mkldnn+acl backend")
-        build_vars += (
-            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
-            "ACL_ROOT_DIR=/acl "
-            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
-            "ACL_INCLUDE_DIR=/acl/build "
-            "ACL_LIBRARY=/acl/build "
-        )
+        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
+        build_vars += "ACL_ROOT_DIR=/acl "
        if enable_cuda:
            build_vars += "BLAS=NVPL "
        else:
-            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
+            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
    else:
        print("build pytorch without mkldnn backend")

--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
        )


-def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
-    print("Building OpenBLAS")
-    host.run_cmd(
-        f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"
-    )
-    make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
-    host.run_cmd(
-        f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS"
-    )
-
-
-def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
-    print("Building Arm Compute Library")
-    acl_build_flags = " ".join(
-        [
-            "debug=0",
-            "neon=1",
-            "opencl=0",
-            "os=linux",
-            "openmp=1",
-            "cppthreads=0",
-            "arch=armv8a",
-            "multi_isa=1",
-            "fixed_format_kernels=1",
-            "build=native",
-        ]
-    )
-    host.run_cmd(
-        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
-    )
-
-    host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
-
-
 def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
    host.run_cmd("pip3 install auditwheel")
    host.run_cmd(
@ -700,7 +666,6 @@ def start_build(
    configure_system(
        host, compiler=compiler, use_conda=use_conda, python_version=python_version
    )
-    build_OpenBLAS(host, git_clone_flags)

    if host.using_docker():
        print("Move libgfortant.a into a standard location")
@ -723,6 +688,8 @@ def start_build(
        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
    )

+    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
+
    print("Building PyTorch wheel")
    build_opts = ""
    if pytorch_build_number is not None:
@ -743,16 +710,18 @@ def start_build(
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
    if enable_mkldnn:
-        build_ArmComputeLibrary(host, git_clone_flags)
+        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
        print("build pytorch with mkldnn+acl backend")
        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
+        build_vars += " BLAS=OpenBLAS"
+        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
+        build_vars += " ACL_ROOT_DIR=/acl"
        host.run_cmd(
-            f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && "
-            f"{build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
        )
        print("Repair the wheel")
        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-        ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"
+        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
        host.run_cmd(
            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
        )
@ -908,7 +877,7 @@ def terminate_instances(instance_type: str) -> None:
 def parse_arguments():
    from argparse import ArgumentParser

-    parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")
+    parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
    parser.add_argument("--key-name", type=str)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--build-only", action="store_true")
--- a/.ci/docker/ci_commit_pins/nccl-cu12.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt
@ -1 +1 @@
-v2.28.3-1
+v2.27.5-1
--- a/.ci/docker/ci_commit_pins/nccl-cu13.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt
@ -1 +1 @@
-v2.28.3-1
+v2.27.7-1
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-bbb06c0334a6772b92d24bde54956e675c8c6604
+27664085f804afc83df26f740bb46c365854f2c4
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,16 +1,27 @@
-set -euo pipefail
+#!/bin/bash
+# Script used only in CD pipeline

-readonly version=v25.02
-readonly src_host=https://github.com/ARM-software
-readonly src_repo=ComputeLibrary
+set -eux
+
+ACL_VERSION=${ACL_VERSION:-"v25.02"}
+ACL_INSTALL_DIR="/acl"

 # Clone ACL
-[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
-cd ${src_repo}
-
-git checkout $version
+git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules

+ACL_CHECKOUT_DIR="ComputeLibrary"
 # Build with scons
+pushd $ACL_CHECKOUT_DIR
 scons -j8  Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
  os=linux arch=armv8a build=native multi_isa=1 \
  fixed_format_kernels=1 openmp=1 cppthreads=0
+popd
+
+# Install ACL
+sudo mkdir -p ${ACL_INSTALL_DIR}
+for d in arm_compute include utils support src build
+do
+  sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}
+done
+
+rm -rf $ACL_CHECKOUT_DIR
--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -3,8 +3,10 @@

 set -ex

-cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
+OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}
+
+# Clone OpenBLAS
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules

 OPENBLAS_CHECKOUT_DIR="OpenBLAS"
 OPENBLAS_BUILD_FLAGS="
@ -17,5 +19,7 @@ CFLAGS=-O3
 BUILD_BFLOAT16=1
 "

-make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
-make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
+make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR
+sudo make install -C $OPENBLAS_CHECKOUT_DIR
+
+rm -rf $OPENBLAS_CHECKOUT_DIR
--- a/.ci/docker/common/patch_libstdc.sh
+++ b/.ci/docker/common/patch_libstdc.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+set -xe
+# Script used in Linux x86 and aarch64 CD pipeline
+
+# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols.
+# see: https://github.com/pytorch/pytorch/issues/133437
+LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a)
+# Collect the global text (T) recursive_directory_iterator symbols and weaken them in place.
+nm -g "$LIBNONSHARED" | grep " T " | grep recursive_directory_iterator | cut -c 20-  > weaken-symbols.txt
+objcopy --weaken-symbols weaken-symbols.txt "$LIBNONSHARED" "$LIBNONSHARED"
+# Do not leave the scratch symbol list behind in the image.
+rm -f weaken-symbols.txt
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
    /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
    done;
-
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -62,6 +62,13 @@ ARG OPENBLAS_VERSION
 ADD ./common/install_openblas.sh install_openblas.sh
 RUN bash ./install_openblas.sh && rm install_openblas.sh

+# Install Arm Compute Library (scons is installed with python3.9; ACL_VERSION is passed by build.sh)
+FROM base as arm_compute
+ARG ACL_VERSION
+RUN python3.9 -m pip install scons==4.7.0
+RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
+COPY ./common/install_acl.sh install_acl.sh
+RUN bash ./install_acl.sh && rm install_acl.sh
 FROM base as final

 # remove unnecessary python versions
@ -70,4 +77,7 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
-ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
+COPY --from=arm_compute /acl /acl
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
--- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@ -86,6 +86,15 @@ FROM base as nvpl
 ADD ./common/install_nvpl.sh install_nvpl.sh
 RUN bash ./install_nvpl.sh && rm install_nvpl.sh

+# Install Arm Compute Library (scons is installed with python3.9; ACL_VERSION is passed by build.sh)
+FROM base as arm_compute
+ARG ACL_VERSION
+RUN python3.9 -m pip install scons==4.7.0
+RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
+COPY ./common/install_acl.sh install_acl.sh
+RUN bash ./install_acl.sh && rm install_acl.sh
+FROM base as final
+
 FROM final as cuda_final
 ARG BASE_CUDA_VERSION
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
@ -93,5 +102,9 @@ COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BAS
 COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=nvpl /opt/nvpl/lib/  /usr/local/lib/
 COPY --from=nvpl /opt/nvpl/include/  /usr/local/include/
+COPY --from=arm_compute /acl /acl
 RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
 ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -28,6 +28,7 @@ fi
 MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
 DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
 OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
+ACL_VERSION=${ACL_VERSION:-}

 case ${image} in
    manylinux2_28-builder:cpu)
@ -41,7 +42,6 @@ case ${image} in
        GPU_IMAGE=arm64v8/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
        MANY_LINUX_VERSION="2_28_aarch64"
-        OPENBLAS_VERSION="v0.3.30"
        ;;
    manylinuxs390x-builder:cpu-s390x)
        TARGET=final
@ -119,7 +119,8 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
 DOCKER_BUILDKIT=1 docker build  \
    ${DOCKER_GPU_BUILD_ARG} \
    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
+    --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \
+    --build-arg "ACL_VERSION=${ACL_VERSION:-}" \
    --target "${TARGET}" \
    -t "${tmp_tag}" \
    $@ \
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -52,10 +52,10 @@ flatbuffers==24.12.23
 #Pinned versions: 24.12.23
 #test that import:

-hypothesis==5.35.1
+hypothesis==6.56.4
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
-#Pinned versions: 5.35.1
+#Pinned versions: 6.56.4
 #test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py

 junitparser==2.1.1
@ -98,7 +98,7 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:

-mypy==1.16.0 ; platform_system != "Windows"
+mypy==1.16.0 ; platform_system == "Linux"
 # Pin MyPy version because new errors are likely to appear with each release
-# Skip on Windows as lots of type annotations are POSIX specific
+# Installed on Linux only; skipped on Windows as lots of type annotations are POSIX specific
 #Description: linter
@ -169,7 +169,7 @@ optree==0.13.0

 pillow==11.0.0
 #Description:  Python Imaging Library fork
-#Pinned versions: 10.3.0
+#Pinned versions: 11.0.0
 #test that import:

 protobuf==5.29.5
@ -217,7 +217,7 @@ pytest-subtests==0.13.1
 #Pinned versions:
 #test that import:

-xdoctest==1.1.0
+xdoctest==1.3.0
 #Description: runs doctests in pytest
-#Pinned versions: 1.1.0
+#Pinned versions: 1.3.0
 #test that import:
@ -268,7 +268,7 @@ scipy==1.14.1 ; python_version >= "3.12"
 #test that import:

 # needed by torchgen utils
-typing-extensions>=4.10.0
+typing-extensions==4.12.2
 #Description: type hints for python
 #Pinned versions:
 #test that import:
@ -361,9 +361,10 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py

 # To build PyTorch itself
-pyyaml
+pyyaml==6.0.2
 pyzstd
-setuptools>=70.1.0
+setuptools==78.1.1
+packaging==23.1
 six

 scons==4.5.2 ; platform_machine == "aarch64"
@ -384,7 +385,10 @@ cmake==3.31.6
 tlparse==0.4.0
 #Description: required for log parsing

-cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
+filelock==3.18.0
+#Description: required for inductor testing
+
+cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin"
 #Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
 #test that import: test_cuda.py

--- a/.ci/manywheel/build_rocm.sh
+++ b/.ci/manywheel/build_rocm.sh
@ -107,6 +107,10 @@ if [[ $ROCM_INT -ge 60200 ]]; then
    ROCM_SO_FILES+=("librocm-core.so")
 fi

+if [[ $ROCM_INT -ge 70000 ]]; then
+    ROCM_SO_FILES+=("librocroller.so")
+fi
+
 OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
 if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -89,7 +89,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  export USE_MKLDNN=1
  export USE_MKLDNN_ACL=1
-  export ACL_ROOT_DIR=/ComputeLibrary
+  export ACL_ROOT_DIR=/acl
 fi

 if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -67,7 +67,7 @@ fi
 #       wheels with cxx11-abi

 echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' ]]; then
+if [[ "$(uname)" != 'Darwin' &&  "$(uname -m)" != "s390x" ]]; then
  # We also check that there are cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@ -32,6 +32,9 @@ LIBTORCH_NAMESPACE_LIST = (
    "torch::",
 )

+# Patterns for detecting statically linked libstdc++ symbols
+STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]
+

 def _apply_libtorch_symbols(symbols):
    return [
@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]:
    return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]


-def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
+def grep_symbols(
+    lib: str, patterns: list[Any], symbol_type: str | None = None
+) -> list[str]:
    def _grep_symbols(
        symbols: list[tuple[str, str, str]], patterns: list[Any]
    ) -> list[str]:
        rc = []
        for _s_addr, _s_type, s_name in symbols:
+            # Filter by symbol type if specified
+            if symbol_type and _s_type != symbol_type:
+                continue
            for pattern in patterns:
                if pattern.match(s_name):
                    rc.append(s_name)
@ -80,6 +88,18 @@ def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
        return functools.reduce(list.__add__, (x.result() for x in tasks), [])


+def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
+    cxx11_statically_linked_symbols = grep_symbols(
+        lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T"
+    )
+    num_statically_linked_symbols = len(cxx11_statically_linked_symbols)
+    print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}")
+    if num_statically_linked_symbols > 0:
+        raise RuntimeError(
+            f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}"
+        )
+
+
 def check_lib_symbols_for_abi_correctness(lib: str) -> None:
    print(f"lib: {lib}")
    cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
@ -107,6 +127,7 @@ def main() -> None:

    libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
    check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
+    check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)


 if __name__ == "__main__":
--- a/.ci/pytorch/test_fa3_abi_stable.sh
+++ b/.ci/pytorch/test_fa3_abi_stable.sh
@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -ex -o pipefail
+
+# Suppress ANSI color escape sequences
+export TERM=vt100
+
+# shellcheck source=./common.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
+# shellcheck source=./common-build.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
+
+echo "Environment variables"
+env
+
+echo "Testing FA3 stable wheel still works with currently built torch"
+
+echo "Installing ABI Stable FA3 wheel"
+# The wheel was built on https://github.com/Dao-AILab/flash-attention/commit/b3846b059bf6b143d1cd56879933be30a9f78c81
+# on torch nightly torch==2.9.0.dev20250830+cu129
+$MAYBE_SUDO pip -q install https://s3.amazonaws.com/ossci-linux/wheels/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
+
+pushd flash-attention/hopper
+export PYTHONPATH=$PWD
+pytest -v -s \
+  "test_flash_attn.py::test_flash_attn_output[1-1-192-False-False-False-0.0-False-False-mha-dtype0]" \
+  "test_flash_attn.py::test_flash_attn_varlen_output[511-1-64-True-False-False-0.0-False-False-gqa-dtype2]" \
+  "test_flash_attn.py::test_flash_attn_kvcache[1-128-128-False-False-True-None-0.0-False-False-True-False-True-False-gqa-dtype0]" \
+  "test_flash_attn.py::test_flash_attn_race_condition[97-97-192-True-dtype0]" \
+  "test_flash_attn.py::test_flash_attn_combine[2-3-64-dtype1]" \
+  "test_flash_attn.py::test_flash3_bw_compatibility"
+popd
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -38,10 +38,12 @@ if errorlevel 1 goto fail
 if not errorlevel 0 goto fail

 :: Update CMake
+:: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used
 call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail

+:: TODO: Move to .ci/docker/requirements-ci.txt
 call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
 if errorlevel 1 goto fail
 if not errorlevel 0 goto fail
--- a/.ci/pytorch/win-test-helpers/test_libtorch.bat
+++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat
@ -15,37 +15,35 @@ if errorlevel 1 exit /b 1
 if not errorlevel 0 exit /b 1

 cd %TMP_DIR_WIN%\build\torch\test
+
+:: Enable delayed variable expansion to make the list
+setlocal enabledelayedexpansion
+set EXE_LIST=
 for /r "." %%a in (*.exe) do (
-    call :libtorch_check "%%~na" "%%~fa"
+  if "%%~na" == "c10_intrusive_ptr_benchmark" (
+    @REM NB: This is not a gtest executable file, thus couldn't be handled by
+    @REM pytest-cpp and is excluded from test discovery by run_test
+    call "%%~fa"
    if errorlevel 1 goto fail
+    if not errorlevel 0 goto fail
+  ) else (
+    if "%%~na" == "verify_api_visibility" (
+      @REM Skip verify_api_visibility as it is a compile-level test
+    ) else (
+      set EXE_LIST=!EXE_LIST! cpp/%%~na
+    )
+  )
 )

-goto :eof
-
-:libtorch_check
-
 cd %CWD%
 set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test

-:: Skip verify_api_visibility as it a compile level test
-if "%~1" == "verify_api_visibility" goto :eof
+:: Run python test\run_test.py on the list (quoted set keeps the space before && out of NO_TD)
+set "NO_TD=True" && python test\run_test.py --cpp --verbose -i !EXE_LIST!
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail

-echo Running "%~2"
-if "%~1" == "c10_intrusive_ptr_benchmark" (
-  :: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
-  call "%~2"
-  goto :eof
-)
-
-python test\run_test.py --cpp --verbose -i "cpp/%~1"
-if errorlevel 1 (
-  echo %1 failed with exit code %errorlevel%
-  goto fail
-)
-if not errorlevel 0 (
-  echo %1 failed with exit code %errorlevel%
-  goto fail
-)
+goto :eof

 :eof
 exit /b 0
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -37,27 +37,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
 fi

-# TODO: Move both of them to Windows AMI
-python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
-
-# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
-# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
-# scipy from 1.6.3 to 1.10
-# expecttest from 0.1.3 to 0.3.0
-# xdoctest from 1.0.2 to 1.3.0
-python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
-
-# Install Z3 optional dependency for Windows builds.
-python -m pip install z3-solver==4.15.1.0
-
-# Install tlparse for test\dynamo\test_structured_trace.py UTs.
-python -m pip install tlparse==0.4.0
-
-# Install parameterized
-python -m pip install parameterized==0.8.1
-
-# Install pulp for testing ilps under torch\distributed\_tools
-python -m pip install pulp==2.9.0
+# TODO: Move this to .ci/docker/requirements-ci.txt
+python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"

 run_tests() {
    # Run nvidia-smi if available
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
+    set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
-    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
+    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
 )

 set "CUDA_PATH=%CUDA_PATH_V128%"
--- a/.clang-tidy
+++ b/.clang-tidy
@ -59,9 +59,9 @@ performance-*,
 -performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
-readability-duplicate-include
+readability-duplicate-include,
 readability-misplaced-array-index,
-readability-redundant*
+readability-redundant*,
 readability-simplify-subscript-expr,
 readability-string-compare,
 -readability-redundant-access-specifiers,
--- a/.github/actions/teardown-win/action.yml
+++ b/.github/actions/teardown-win/action.yml
@ -23,9 +23,6 @@ runs:
      run: |
        .github\scripts\kill_active_ssh_sessions.ps1

-    - name: Clean up leftover processes on non-ephemeral Windows runner
-      uses: pytorch/test-infra/.github/actions/cleanup-runner@main
-
    # Cleaning up Windows workspace sometimes fails flakily with device or resource busy
    # error, meaning one or more processes haven't stopped completely yet. So trying to
    # retry this step several time similar to how checkout-pytorch GHA does
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-78a47f87ce259a48f0391fa9ae15add05ea7432b
+0ad9951c416d33c5da4f7a504fb162cbe62386f5
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -202,7 +202,7 @@ ARG max_jobs=16
 ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=4
 ENV NVCC_THREADS=$nvcc_threads
-ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
+ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

 ARG USE_SCCACHE
@ -297,16 +297,28 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
    echo "[INFO] Showing torch_build_versions.txt content:" && \
    cat torch_build_versions.txt

+# Install build and runtime dependencies, this is needed for flashinfer install
+COPY requirements/build.txt requirements/build.txt
+COPY use_existing_torch.py use_existing_torch.py
+RUN python3 use_existing_torch.py
+RUN cat requirements/build.txt
+
 # Install uv for faster pip installs if not existed
 RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version > /dev/null 2>&1; then \
        python3 -m pip install uv==0.8.4; \
    fi
+
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/build.txt
+
+
 # Default mount file as placeholder, this just avoid the mount error
 ARG TORCH_WHEELS_PATH="./requirements"
 # Install torch, torchaudio and torchvision
@ -332,13 +344,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install xformers wheel from previous stage
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system /wheels/xformers/*.whl --verbose
-
 # Build flashinfer from source.
 ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
 # install package for build flashinfer
 # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738

-RUN pip install build==1.3.0
 RUN pip freeze | grep -E 'setuptools|packaging|build'

 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
--- a/.github/ci_configs/vllm/use_existing_torch.py
+++ b/.github/ci_configs/vllm/use_existing_torch.py
@ -1,9 +1,14 @@
 import glob
+import os


 requires_files = glob.glob("requirements/*.txt")
 requires_files += ["pyproject.toml"]
+
 for file in requires_files:
+    if not os.path.exists(file):
+        print(f"!!! skipping missing {file}")
+        continue
    print(f">>> cleaning {file}")
    with open(file) as f:
        lines = f.readlines()
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -1,37 +0,0 @@
-boto3==1.35.42
-build==1.2.2.post1
-cmake==3.27.*
-expecttest==0.3.0
-fbscribelogger==0.1.7
-filelock==3.18.0
-hypothesis==6.56.4
-librosa>=0.6.2
-mpmath==1.3.0
-networkx==2.8.7
-ninja==1.10.2.4
-numba==0.59.0
-numpy==1.26.4
-opt-einsum>=3.3
-optree==0.13.0
-packaging==23.1
-parameterized==0.8.1
-pillow==10.3.0
-protobuf==5.29.5
-psutil==5.9.8
-pygments==2.15.0
-pytest-cpp==2.3.0
-pytest-flakefinder==1.1.0
-pytest-rerunfailures==10.3
-pytest-subtests==0.13.1
-pytest-xdist==3.3.1
-pytest==7.3.2
-pyyaml==6.0.2
-scipy==1.12.0
-setuptools==78.1.1
-sympy==1.13.3
-tlparse==0.4.0
-tensorboard==2.13.0
-typing-extensions==4.12.2
-unittest-xml-reporting<=3.2.0,>=2.0.0
-xdoctest==1.1.0
-z3-solver==4.15.1.0
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -502,6 +502,7 @@ def perform_misc_tasks(
    job_name: str,
    pr_body: str,
    branch: Optional[str] = None,
+    tag: Optional[str] = None,
 ) -> None:
    """
    In addition to apply the filter logic, the script also does the following
@ -509,7 +510,9 @@ def perform_misc_tasks(
    """
    set_output(
        "keep-going",
-        branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"),
+        branch == MAIN_BRANCH
+        or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
+        or check_for_setting(labels, pr_body, "keep-going"),
    )
    set_output(
        "ci-verbose-test-logs",
@ -634,6 +637,7 @@ def main() -> None:
        job_name=args.job_name,
        pr_body=pr_body if pr_body else "",
        branch=args.branch,
+        tag=tag,
    )

    # Set the filtered test matrix as the output
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-        "nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
+        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
--- a/.github/workflows/_linux-test-stable-fa3.yml
+++ b/.github/workflows/_linux-test-stable-fa3.yml
@ -0,0 +1,255 @@
+# This workflow tests that an FA3 wheel built against the stable ABI as of
+# torch nightly 20250830 still runs on a newer torch.
+#
+# This workflow is very similar to the _linux-test.yml workflow, with the following
+# differences:
+#   1. It is simpler (there is no test matrix)
+#   2. It pulls flash-attention as a secondary repository in order to access the tests.
+#      Note that it does not BUILD anything from flash-attention, as we have a prebuilt
+#      wheel. We pull flash-attention only to run a few tests.
+#   3. It runs only FA3 tests. No PyTorch tests are run.
+name: linux-test-stable-fa3
+
+on:
+  workflow_call:
+    inputs:
+      build-environment:
+        required: true
+        type: string
+        description: Top-level label for what's being built/tested.
+      docker-image:
+        required: true
+        type: string
+        description: Docker image to run in.
+      timeout-minutes:
+        required: false
+        type: number
+        default: 30
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
+      s3-bucket:
+        description: S3 bucket to download artifact
+        required: false
+        type: string
+        default: "gha-artifacts"
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub
+      VLLM_TEST_HUGGING_FACE_TOKEN:
+        required: false
+        description: |
+          HF Auth token to test vllm
+      SCRIBE_GRAPHQL_ACCESS_TOKEN:
+        required: false
+        description: |
+          FB app token to write to scribe endpoint
+
+env:
+  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+
+jobs:
+  test:
+    # Don't run on forked repos
+    if: github.repository_owner == 'pytorch'
+    runs-on: linux.aws.h100
+    timeout-minutes: ${{ inputs.timeout-minutes || 30 }}
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          no-sudo: true
+
+      - name: Checkout flash-attention as a secondary repository
+        uses: actions/checkout@v4
+        with:
+          repository: Dao-AILab/flash-attention
+          path: flash-attention
+
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ${{ inputs.docker-image }}
+
+      - name: Use following to pull public copy of the image
+        id: print-ghcr-mirror
+        env:
+          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+        shell: bash
+        run: |
+          tag=${ECR_DOCKER_IMAGE##*:}
+          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Check if in a container runner
+        shell: bash
+        id: check_container_runner
+        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
+
+      - name: Setup GPU_FLAG for docker run
+        id: setup-gpu-flag
+        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        id: setup-sscache-port-flag
+        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Download build artifacts
+        uses: ./.github/actions/download-build-artifacts
+        with:
+          name: ${{ inputs.build-environment }}
+          s3-bucket: ${{ inputs.s3-bucket }}
+
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+
+      - name: Set Test step time
+        id: test-timeout
+        shell: bash
+        env:
+          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
+        run: |
+          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
+
+      - name: Preserve github env variables for use in docker
+        shell: bash
+        run: |
+          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+
+      - name: Test
+        id: test
+        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
+        env:
+          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_WORKFLOW: ${{ github.workflow }}
+          GITHUB_JOB: ${{ github.job }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+          GITHUB_RUN_NUMBER: ${{ github.run_number }}
+          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
+          SHM_SIZE: '2g'
+          DOCKER_IMAGE: ${{ inputs.docker-image }}
+          VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ steps.get-job-id.outputs.job-id }}
+        run: |
+          set -x
+
+          TEST_COMMAND=.ci/pytorch/test_fa3_abi_stable.sh
+
+          # Leaving 1GB for the runner and other things
+          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
+          # See https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details
+          # for --memory-swap semantics; the extra 3GB of swap comes from https://github.com/pytorch/test-infra/pull/6058
+          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
+
+
+          SHM_OPTS="--shm-size=${SHM_SIZE}"
+          JENKINS_USER="--user jenkins"
+          DOCKER_SHELL_CMD=
+
+          # detached container should get cleaned up by teardown_ec2_linux
+          # TODO: Stop building test binaries as part of the build phase
+          # GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD are left unquoted so they word-split into separate docker args
+          # shellcheck disable=SC2086,SC2090
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e GITHUB_ACTIONS \
+            -e GITHUB_REPOSITORY \
+            -e GITHUB_WORKFLOW \
+            -e GITHUB_JOB \
+            -e GITHUB_RUN_ID \
+            -e GITHUB_RUN_NUMBER \
+            -e GITHUB_RUN_ATTEMPT \
+            -e JOB_ID \
+            -e JOB_NAME \
+            -e BASE_SHA \
+            -e BRANCH \
+            -e SHA1 \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e HUGGING_FACE_HUB_TOKEN \
+            -e VLLM_TEST_HUGGING_FACE_TOKEN \
+            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
+            -e ARTIFACTS_FILE_SUFFIX \
+            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
+            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --ipc=host \
+            ${SHM_OPTS} \
+            --tty \
+            --detach \
+            --name="${container_name}" \
+            ${JENKINS_USER} \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            ${DOCKER_SHELL_CMD}
+          )
+
+          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
+
+          docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
+
+      - name: Collect backtraces from coredumps (if any)
+        if: always()
+        run: |
+          # shellcheck disable=SC2156
+          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
+
+      - name: Store Core dumps on S3
+        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
+        if: failure()
+        with:
+          name: coredumps-fa3-stable-abi-smoke-tests
+          retention-days: 14
+          if-no-files-found: ignore
+          path: ./**/core.[1-9]*
+
+      - name: Upload utilization stats
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+
+      - name: Teardown Linux
+        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -85,7 +85,7 @@ jobs:
        uses: pytorch/test-infra/.github/actions/setup-python@main
        with:
          python-version: ${{ inputs.python-version }}
-          pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
+          pip-requirements-file: .ci/docker/requirements-ci.txt

      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
        uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -122,7 +122,7 @@ jobs:
        uses: pytorch/test-infra/.github/actions/setup-python@main
        with:
          python-version: ${{ inputs.python-version }}
-          pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
+          pip-requirements-file: .ci/docker/requirements-ci.txt

      - name: Start monitoring script
        id: monitor-script
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -84,9 +84,6 @@ jobs:
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false

-      - name: Clean up leftover processes on non-ephemeral Windows runner
-        uses: pytorch/test-infra/.github/actions/cleanup-runner@main
-
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -77,9 +77,6 @@ jobs:
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false

-      - name: Clean up leftover processes on non-ephemeral Windows runner
-        uses: pytorch/test-infra/.github/actions/cleanup-runner@main
-
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
@ -106,18 +103,6 @@ jobs:
        with:
          cuda-version: ${{ inputs.cuda-version }}

-      # TODO: Move to a requirements.txt file for windows
-      - name: Install pip dependencies
-        uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 5
-          max_attempts: 5
-          retry_wait_seconds: 30
-          command: |
-            set -eu
-            python3 -m pip install 'xdoctest>=1.1.0'
-
      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
@ -272,15 +257,6 @@ jobs:
        shell: bash
        run: python3 .github/scripts/parse_ref.py

-      - name: Uninstall PyTorch
-        if: always()
-        continue-on-error: true
-        shell: bash
-        run: |
-          # This step removes PyTorch installed by the test to give a clean slate
-          # to the next job
-          python3 -mpip uninstall -y torch
-
      - name: Teardown Windows
        uses: ./.github/actions/teardown-win
        if: always()
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -132,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -335,7 +335,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -538,7 +538,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -584,7 +584,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -741,7 +741,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -787,7 +787,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -833,7 +833,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -944,7 +944,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -990,7 +990,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1036,7 +1036,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1147,7 +1147,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1193,7 +1193,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1239,7 +1239,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1350,7 +1350,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1396,7 +1396,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1442,7 +1442,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -127,7 +127,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_6-test:  # Testing
@ -193,7 +193,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -259,7 +259,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda13_0-test:  # Testing
@ -721,7 +721,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_6-test:  # Testing
@ -787,7 +787,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -853,7 +853,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda13_0-test:  # Testing
@ -1315,7 +1315,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_6-test:  # Testing
@ -1381,7 +1381,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -1447,7 +1447,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda13_0-test:  # Testing
@ -1909,7 +1909,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_6-test:  # Testing
@ -1975,7 +1975,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -2041,7 +2041,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda13_0-test:  # Testing
@ -2503,7 +2503,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_6-test:  # Testing
@ -2569,7 +2569,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
@ -2635,7 +2635,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda13_0-test:  # Testing
@ -3097,7 +3097,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_6-test:  # Testing
@ -3163,7 +3163,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_8-test:  # Testing
@ -3229,7 +3229,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda13_0-test:  # Testing
@ -3691,7 +3691,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_6-test:  # Testing
@ -3757,7 +3757,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_8-test:  # Testing
@ -3823,7 +3823,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda13_0-test:  # Testing
--- a/.github/workflows/operator_microbenchmark.yml
+++ b/.github/workflows/operator_microbenchmark.yml
@ -18,6 +18,7 @@ permissions:
  contents: read

 jobs:
+  # H100 A100 runners
  opmicrobenchmark-build:
    if: github.repository_owner == 'pytorch'
    name: opmicrobenchmark-build
@ -44,3 +45,56 @@ jobs:
      docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
    secrets: inherit
+
+  # B200 runner
+  opmicrobenchmark-build-b200:
+    if: github.repository_owner == 'pytorch'
+    name: opmicrobenchmark-build-b200
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      runner: linux.12xlarge.memory
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '10.0'
+      test-matrix: |
+        { include: [
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
+        ]}
+    secrets: inherit
+
+  opmicrobenchmark-test-b200:
+    name: opmicrobenchmark-test-b200
+    uses: ./.github/workflows/_linux-test.yml
+    needs: opmicrobenchmark-build-b200
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit
+
+  # ROCM MI300 runner
+  opmicrobenchmark-build-rocm:
+    if: github.repository_owner == 'pytorch'
+    name: opmicrobenchmark-build-rocm
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
+        ]}
+    secrets: inherit
+
+  opmicrobenchmark-test-rocm:
+    name: opmicrobenchmark-test-rocm
+    uses: ./.github/workflows/_rocm-test.yml
+    needs: opmicrobenchmark-build-rocm
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@ -61,3 +61,15 @@ jobs:
      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }}
    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test:
+    name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test
+    uses: ./.github/workflows/_linux-test-stable-fa3.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }}
+      timeout-minutes: 30
+      s3-bucket: gha-artifacts
+    secrets: inherit
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@ -23,7 +23,7 @@ jobs:
        with:
          repository: pytorch/pytorch
          stable-branch: viable/strict
-          requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]'
+          requires: '[\"pull\", \"trunk\", \"lint\",  \"linux-aarch64\"]'
          secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
          clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
          clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
@ -48,4 +48,7 @@ jobs:
            echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json"
            pip install awscli==1.29.40
            aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json"
+            # Push new viable/strict tag
+            cd pytorch/pytorch
+            git push origin "${LATEST_SHA}:refs/tags/viable/strict/${TIME}"
          fi
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -42,7 +42,7 @@ jobs:
      build-external-packages: "vllm"
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
-      cuda-arch-list: '8.0;8.9;9.0'
+      cuda-arch-list: '8.0 8.9 9.0'
      runner: linux.24xlarge.memory
      test-matrix: |
        { include: [
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -18,6 +18,7 @@ exclude_patterns = [
    'torch/_inductor/autoheuristic/artifacts/**',
    'scripts/**',
    'test/generated_type_hints_smoketest.py',
+    'test/test_torchfuzz_repros.py',
    # CPython tests
    'test/dynamo/cpython/**',
    # Tests from the NumPy test suite
@ -27,6 +28,7 @@ exclude_patterns = [
    'torch/lib/**',
    'venv/**',
    '**/*.pyi',
+    "tools/experimental/dynamic_shapes/torchfuzz/**",
    'tools/test/test_selective_build.py',
 ]
 command = [
@ -1260,6 +1262,7 @@ exclude_patterns = [
    'test/test_masked.py',
    'test/test_maskedtensor.py',
    'test/test_matmul_cuda.py',
+    'test/test_scaled_matmul_cuda.py',
    'test/test_meta.py',
    'test/test_metal.py',
    'test/test_mkl_verbose.py',
--- a/18
+++ b/18
@ -181,15 +181,15 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
 /torch/csrc/jit/python/init.cpp @mikaylagawarecki

 # CUDA and CUDA math libraries
-aten/src/ATen/cuda/ @eqy @syed-ahmed
-aten/src/ATen/cudnn/ @eqy @syed-ahmed
-aten/src/ATen/native/cuda/ @eqy @syed-ahmed
-aten/src/ATen/native/cudnn/ @eqy @syed-ahmed
-c10/cuda @eqy @syed-ahmed
-torch/cuda/ @eqy @syed-ahmed
-torch/csrc/cuda/ @eqy @syed-ahmed
-torch/backends/cuda/ @eqy @syed-ahmed
-torch/backends/cudnn/ @eqy @syed-ahmed
+aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A
+aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A
+aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A
+aten/src/ATen/native/cudnn/ @eqy @syed-ahmed @Aidyn-A
+c10/cuda @eqy @syed-ahmed @Aidyn-A
+torch/cuda/ @eqy @syed-ahmed @Aidyn-A
+torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A
+torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A
+torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A

 # PyTree utilities
 /torch/utils/_pytree.py @XuehaiPan
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -81,7 +81,7 @@ git remote add upstream git@github.com:pytorch/pytorch.git
 make setup-env
 # Or run `make setup-env-cuda` for pre-built CUDA binaries
 # Or run `make setup-env-rocm` for pre-built ROCm binaries
-source venv/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
+source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
 ```

 ### Tips and Debugging
@ -182,28 +182,36 @@ You can use this script to check out a new nightly branch with the following:

 ```bash
 ./tools/nightly.py checkout -b my-nightly-branch
-source venv/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
+source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
 ```

 To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`:

 ```bash
 ./tools/nightly.py checkout -b my-nightly-branch --cuda
-source venv/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
+source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
 ```

 To install the nightly binaries built with ROCm, you can pass in the flag `--rocm`:

 ```bash
 ./tools/nightly.py checkout -b my-nightly-branch --rocm
-source venv/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
+source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
 ```

 You can also use this tool to pull the nightly commits into the current branch:

 ```bash
-./tools/nightly.py pull -p my-env
-source my-env/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
+./tools/nightly.py pull
+source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
+```
+
+To create the virtual environment with a specific Python interpreter, you can
+pass in the `--python` argument:
+
+```bash
+./tools/nightly.py --python /path/to/python3.12
+source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
 ```

 Pulling will recreate a fresh virtual environment and reinstall the development
--- a/5
+++ b/5
@ -50,11 +50,10 @@ RUN git submodule update --init --recursive
 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.11
 ARG CUDA_PATH=cu121
-ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
-RUN /opt/conda/bin/conda update -y -n base -c defaults conda
-RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}
+# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
+RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0

 ARG TARGETPLATFORM

--- a/RELEASE.md
+++ b/RELEASE.md
@ -3,6 +3,7 @@
 <!-- toc -->

  - [Release Compatibility Matrix](#release-compatibility-matrix)
+    - [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix)
  - [Release Cadence](#release-cadence)
  - [General Overview](#general-overview)
    - [Frequently Asked Questions](#frequently-asked-questions)
@ -63,6 +64,22 @@ Following is the Release Compatibility Matrix for PyTorch releases:
 | 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
 | 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |

+### PyTorch CUDA Support Matrix
+
+For Release 2.9, PyTorch supports the following CUDA architectures:
+
+| CUDA | architectures supported for Linux x86 and Windows builds | notes |
+| --- | --- | --- |
+| 12.6.3 | Maxwell(5.0), Pascal(6.0), Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0) | |
+| 12.8.1 | Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0)  | |
+| 13.0.0 | Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0+PTX) | +PTX available on Linux builds only |
+
+| CUDA | architectures supported for Linux aarch64 builds |
+| --- | --- |
+| 12.6.3 | Ampere(8.0), Hopper(9.0) |
+| 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0)  |
+| 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) |
+
 ## Release Cadence

 Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -605,6 +605,11 @@ if(UNIX)
  if(HAVE_MALLOC_USABLE_SIZE)
    add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1)
  endif(HAVE_MALLOC_USABLE_SIZE)
+  set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h")
+  CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE)
+  if(HAVE_POSIX_FALLOCATE)
+    add_definitions(-DHAVE_POSIX_FALLOCATE=1)
+  endif(HAVE_POSIX_FALLOCATE)
 endif(UNIX)

 ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC)
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -279,42 +279,6 @@ bool Context::userEnabledOverrideableSDP() const {
  return enabled_overrideable;
 }

-static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
-static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
-
-bool Context::checkCuBLASConfigDeterministic() {
-  // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
-  // is set to deterministic setting
-  if (hasCUDART()) {
-    const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
-    return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
-  }
-  return true;
-}
-
-void Context::alertCuBLASConfigNotDeterministic() const {
-  static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
-  if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
-    return;
-  }
-
-  auto msg = c10::str(
-    "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
-    "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
-    "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
-    "case, you must set an environment variable before running your PyTorch application: ",
-    cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
-    cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
-    "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
-  );
-
-  if (deterministicAlgorithmsWarnOnly()) {
-    TORCH_WARN(msg);
-  } else {
-    TORCH_CHECK(false, msg);
-  }
-}
-
 bool Context::benchmarkCuDNN() const {
  return benchmark_cudnn;
 }
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -310,13 +310,7 @@ class TORCH_API Context {
  //
  // * Throw an error when `Context::deterministicAlgorithms()` is true. Most
  //   of the time, this should be accomplished by calling
-  //   `at::globalContext().alertNotDeterminstic()`.  However, if the
-  //   nondeterministic behavior is caused by the CuBLAS workspace
-  //   configuration in CUDA >= 10.2,
-  //   `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
-  //   called instead (in this case, a comment explaining why the operation is
-  //   nondeterministic is not necessary). See below for details on these
-  //   methods.
+  //   `at::globalContext().alertNotDeterministic()`.
  //
  // * Have an entry in the list of nondeterministic PyTorch operations in the
  //   docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -340,12 +334,6 @@ class TORCH_API Context {
  // Throws an error if `Context::deterministicAlgorithms()` is true
  static void alertNotDeterministic(std::string_view const& caller);

-  // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
-  // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
-  // ":4096:8". For more details:
-  // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
-  void alertCuBLASConfigNotDeterministic() const;
-
  void setFloat32MatmulPrecision(const std::string& s);
  void setFloat32Precision(
      const std::string& backend,
@ -429,7 +417,6 @@ class TORCH_API Context {
  }

 private:
-  static bool checkCuBLASConfigDeterministic();
  std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
  bool enabled_cudnn = true;
  bool deterministic_cudnn = false;
--- a/aten/src/ATen/MapAllocator.cpp
+++ b/aten/src/ATen/MapAllocator.cpp
@ -292,6 +292,28 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags,
          if (ftruncate(fd, static_cast<off_t>(size)) == -1) {
            TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")");
          }
+
+#ifdef HAVE_POSIX_FALLOCATE
+          if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) {
+            for (;;) {
+              if (posix_fallocate(fd, 0, static_cast<off_t>(size)) == 0) {
+                break;
+              }
+
+              if (errno == EINTR) {
+                continue;
+              }
+
+              if (errno == EINVAL || errno == EOPNOTSUPP) {
+                // the underlying filesystem does not support the operation
+                break;
+              }
+
+              TORCH_CHECK(false, "unable to allocate shared memory(shm) for file <", filename_, ">: ", c10::utils::str_error(errno), " (", errno, ")");
+            }
+          }
+#endif
+
          if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast<int64_t>(size)) {
 #ifndef STRIP_ERROR_MESSAGES
            int last_err = errno;
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -6,6 +6,7 @@
 #include <c10/core/thread_pool.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/llvmMathExtras.h>
+#include <iostream>
 #include <optional>

 #include <deque>
@ -51,17 +52,15 @@ namespace {

 // Struct containing memory allocator summary statistics for host.
 struct TORCH_API HostStats {
-  // COUNT: allocations requested by client code. Note that active
-  // count can be extracted by looking at current allocations
-  Stat allocation;
-  // COUNT: number of allocated segments from host memory allocation.
-  Stat segment;
-
-  // SUM: bytes allocated by this memory alocator. Note that active bytes
-  // can be extracted by looking at current bytes allocated
+  // COUNT: total allocations (active)
+  Stat active_requests;
+  // SUM: bytes allocated/reserved by this memory allocator (active)
+  Stat active_bytes;
+  // COUNT: total allocations (active + free)
+  Stat allocations;
+  // SUM: bytes allocated/reserved by this memory allocator. This accounts
+  // for both free and in-use blocks.
  Stat allocated_bytes;
-  // SUM: bytes reserved by this memory allocator (both free and used)
-  Stat reserved_bytes;

  // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
  DurationStat host_alloc_time;
@ -75,6 +74,9 @@ struct TORCH_API HostStats {

  // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
  int64_t num_host_free = 0; // This is derived from segment or timing
+
+  // Count of cudaHostAlloc/cudaHostRegister per bucket
+  std::vector<int64_t> bucket_allocation = std::vector<int64_t>(MAX_SIZE_INDEX);
 };

 // Struct containing memory allocator summary statistics for host, as they
@ -82,17 +84,22 @@ struct TORCH_API HostStats {
 // avoid locking the allocator while collecting stats.
 struct alignas(64) HostStatsStaged {
  std::mutex timing_mutex_;
-  // COUNT: allocations requested by client code resulting in a new segment/block allocation
-  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
-  Stat allocation;
-  // SUM: bytes within active memory blocks, including blocks that are
-  // currently in the free list.
+  // COUNT: total allocations (active + free)
  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocations;
+  // SUM: bytes allocated/reserved by this memory allocator. This accounts
+  // for both free and in-use blocks.
  Stat allocated_bytes;
-  // COUNT: number of allocations per bucket
+  // COUNT: number of allocations per bucket (active)
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> active_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: bytes of allocation per bucket (active)
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> active_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // COUNT: number of allocations per bucket (active + free)
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
-  // SUM: bytes of allocation per bucket
+  // SUM: bytes of allocation per bucket (active + free)
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
  // SUM: time spent in cudaHostAlloc/cudaHostRegister
@ -196,27 +203,7 @@ struct CachingHostAllocatorImpl {
    // background.
    if (!pinned_use_background_threads()) {
      process_events();
-    }
-
-    // Round up the allocation to the nearest power of two to improve reuse.
-    // These power of two sizes are also used to index into the free list.
-    size_t roundSize = c10::llvm::PowerOf2Ceil(size);
-
-    // First, try to allocate from the free list
-    auto* block = get_free_block(roundSize);
-    if (block) {
-      return {block->ptr_, reinterpret_cast<void*>(block)};
-    }
-
-    // Check in the recently freed blocks with pending events to see if we
-    // can reuse them. Call get_free_block again after processing events
-    if (pinned_use_background_threads()) {
-      process_events_for_specific_size(roundSize);
-      block = get_free_block(roundSize);
-      if (block) {
-        return {block->ptr_, reinterpret_cast<void*>(block)};
-      }
-
+    } else {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
        getBackgroundThreadPool()->run([&]() {
@ -229,6 +216,16 @@ struct CachingHostAllocatorImpl {
      }();
    }

+    // Round up the allocation to the nearest power of two to improve reuse.
+    // These power of two sizes are also used to index into the free list.
+    size_t roundSize = c10::llvm::PowerOf2Ceil(size);
+
+    // First, try to allocate from the free list
+    auto* block = get_free_block(roundSize);
+    if (block) {
+      return {block->ptr_, reinterpret_cast<void*>(block)};
+    }
+
    // Slow path: if we can't allocate from the cached free list, we need
    // to create a new block.
    void* ptr = nullptr;
@ -278,8 +275,6 @@ struct CachingHostAllocatorImpl {
      auto index = size_index(block->size_);
      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
      free_list_[index].list_.push_back(block);
-      stats_.allocation_bucket_stats[index].decrease(1);
-      stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
    } else {
      // restore these events that record by used streams.
      std::lock_guard<std::mutex> g(events_mutex_);
@ -339,9 +334,12 @@ struct CachingHostAllocatorImpl {
      for (auto* block : blocks_to_remove) {
        blocks_.erase(block);
        ptr_to_block_.erase(block->ptr_);
-        stats_.allocation.decrease(1);
-        stats_.allocated_bytes.decrease(block->size_);
+        auto index = size_index(block->size_);
        free_block(block);
+        stats_.allocations.decrease(1);
+        stats_.allocated_bytes.decrease(block->size_);
+        stats_.allocation_bucket_stats[index].decrease(1);
+        stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
        delete block;
      }
    }
@ -388,16 +386,17 @@ struct CachingHostAllocatorImpl {
      // per bucket (we pick index 0 arbitrarily). These are also all the host
      // allocations, not taking into account caching and free lists.
      if (i == 0) {
-        stats.segment = stats_.allocation;
-        stats.reserved_bytes = stats_.allocated_bytes;
-        stats.num_host_alloc = stats.segment.allocated;
-        stats.num_host_free = stats.segment.freed;
+        stats.allocations = stats_.allocations;
+        stats.allocated_bytes = stats_.allocated_bytes;
+        stats.num_host_alloc = stats.allocations.allocated;
+        stats.num_host_free = stats.allocations.freed;
      }

      // Bucket stats need to be merged with the slow-path stats. We do this in
      // a best effort manner, since we can't really replay the cached events per bucket.
-      add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
-      add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
+      add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]);
+      add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]);
+      stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated;
    }

    // Get the timing stats
@ -421,9 +420,11 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);

      if (i == 0) {
-        stats_.allocation.reset_accumulated();
+        stats_.allocations.reset_accumulated();
        stats_.allocated_bytes.reset_accumulated();
      }
+      stats_.active_bucket_stats[i].reset_accumulated();
+      stats_.active_bytes_bucket_stats[i].reset_accumulated();
      stats_.allocation_bucket_stats[i].reset_accumulated();
      stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
    }
@ -446,9 +447,11 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);

      if (i == 0) {
-        stats_.allocation.reset_peak();
+        stats_.allocations.reset_peak();
        stats_.allocated_bytes.reset_peak();
      }
+      stats_.active_bucket_stats[i].reset_peak();
+      stats_.active_bytes_bucket_stats[i].reset_peak();
      stats_.allocation_bucket_stats[i].reset_peak();
      stats_.allocated_bytes_bucket_stats[i].reset_peak();
    }
@ -465,7 +468,7 @@ struct CachingHostAllocatorImpl {
  virtual void add_allocated_block(B* block) {
    std::lock_guard<std::mutex> g(blocks_mutex_);
    blocks_.insert(block);
-    stats_.allocation.increase(1);
+    stats_.allocations.increase(1);
    stats_.allocated_bytes.increase(block->size_);
    ptr_to_block_.insert({block->ptr_, block});

@ -478,6 +481,8 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
      stats_.allocation_bucket_stats[index].increase(1);
      stats_.allocated_bytes_bucket_stats[index].increase(size);
+      stats_.active_bucket_stats[index].increase(1);
+      stats_.active_bytes_bucket_stats[index].increase(size);
    }
  }

@ -488,8 +493,8 @@ struct CachingHostAllocatorImpl {
      B* block = free_list_[index].list_.back();
      free_list_[index].list_.pop_back();
      block->allocated_ = true;
-      stats_.allocation_bucket_stats[index].increase(1);
-      stats_.allocated_bytes_bucket_stats[index].increase(size);
+      stats_.active_bucket_stats[index].increase(1);
+      stats_.active_bytes_bucket_stats[index].increase(size);
      return block;
    }
    return nullptr;
@ -583,8 +588,8 @@ struct CachingHostAllocatorImpl {
        auto index = size_index(block->size_);
        std::lock_guard<std::mutex> g(free_list_[index].mutex_);
        free_list_[index].list_.push_back(block);
-        stats_.allocation_bucket_stats[index].decrease(1);
-        stats_.allocated_bytes_bucket_stats[index].decrease(size);
+        stats_.active_bucket_stats[index].decrease(1);
+        stats_.active_bytes_bucket_stats[index].decrease(size);
        if (size != -1) {
          return;
        }
--- a/aten/src/ATen/core/PythonFallbackKernel.cpp
+++ b/aten/src/ATen/core/PythonFallbackKernel.cpp
@ -2,6 +2,7 @@
 #include <c10/core/impl/PythonDispatcherTLS.h>
 #include <ATen/core/PythonFallbackKernel.h>
 #include <c10/core/SafePyObject.h>
+#include <ATen/record_function.h>

 namespace {

@ -53,20 +54,24 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_
  TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
  // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value());
  // StashTLSOnEntryGuard stash_guard;
-  c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset);
+  c10::impl::ExcludeDispatchKeyGuard exclude_guard(after_Python_keyset);

+  const auto& schema = op.schema();
+  const auto num_arguments = schema.arguments().size();

  // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch
  const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
  if (mode_stack_len > 0) {
+    RECORD_FUNCTION("PythonDispatchMode", torch::jit::last(*stack, num_arguments));
    const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
    cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack);
    return;
  }

+  RECORD_FUNCTION("PythonSubclass", torch::jit::last(*stack, num_arguments));
+
  // Otherwise, find a PyInterpreter on a Tensor
-  const auto& schema = op.schema();
-  const auto num_arguments = schema.arguments().size();
+
  // It is safe to dispatch on the very first Tensor with a pyobj_interpreter
  // without checking the interpreters of any of the arguments, because when
  // we actually run dispatch(), we will take out PyObjects in the context
--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
  return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
 }

+// Returns the grad dtype that the registered variable hooks report for this
+// tensor. TensorBase holds no logic of its own here; it only forwards to
+// VariableHooksInterface::grad_dtype.
+std::optional<ScalarType> TensorBase::grad_dtype() const {
+  const auto hooks = impl::GetVariableHooks();
+  return hooks->grad_dtype(*this);
+}
+
+// Forwards the requested grad dtype (or std::nullopt) for this tensor to the
+// registered variable hooks via VariableHooksInterface::set_grad_dtype.
+void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
+  const auto hooks = impl::GetVariableHooks();
+  hooks->set_grad_dtype(*this, grad_dtype);
+}
+
 } // namespace at
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -930,6 +930,10 @@ public:

  const TensorBase& requires_grad_(bool _requires_grad=true) const;

+  std::optional<ScalarType> grad_dtype() const;
+
+  void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
+
  // View Variables
  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/aten/src/ATen/core/VariableHooksInterface.h
+++ b/aten/src/ATen/core/VariableHooksInterface.h
@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface {
      const c10::OperatorHandle& op,
      c10::DispatchKeySet dispatch_keys,
      torch::jit::Stack* stack) const = 0;
+  virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
+  virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
 };

 TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -357,7 +357,7 @@ IValue IValue::equals(const IValue& rhs) const {
    case Tag::Enum:
      return lhs.toEnumHolder()->is(*rhs.toEnumHolder());
    case Tag::Uninitialized:
-      // Unitialized ivalues show up in no-ops when the compiler can prove a
+      // Uninitialized ivalues show up in no-ops when the compiler can prove a
      // value will never be used. Just return false on any equality comparison.
      return false;
  }
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -191,6 +191,10 @@ uint32_t _getAlignment(uintptr_t address) {

 #ifdef USE_ROCM
 static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) {
+  // 0 is default value, meaning full CUs i.e. no mask
+  if (value == 0) {
+    return at::cuda::getCurrentCUDAStream();
+  }
  static int32_t last_value = 0;
  static hipStream_t stream;
  if (last_value == 0) {
@ -209,15 +213,15 @@ static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) {
  int32_t CUs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  // how many uint32_t do we need to cover all CUs, fill bitmask with 1
  uint32_t mask_size = static_cast<uint32_t>((CUs + 32 - 1) / 32);
-  std::vector<uint32_t> mask(mask_size, uint32_t{0xffffffff});
+  std::vector<uint32_t> mask(mask_size, uint32_t{0x00000000});
  // starting from lowest order bits, in 32-bit chunks
  // set bits to 0 based on how many CUs to carve out
  int32_t full_shifts = value / 32;
  int32_t remainder = value % 32;
  for (int32_t i = 0; i < full_shifts; i++) {
-    mask[i] = uint32_t{0x00000000};
+    mask[i] = uint32_t{0xffffffff};
  }
-  mask[full_shifts] = uint32_t{0xffffffff} << remainder;
+  mask[full_shifts] = uint32_t{0xffffffff} << (32 - remainder);

  // finally, create masked stream
  AT_CUDA_CHECK(hipExtStreamCreateWithCUMask(&stream, mask_size, &mask[0]));
@ -436,7 +440,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
    static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
  }

-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -570,8 +573,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D

 template <>
 void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -583,8 +584,6 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {

 template <>
 void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -596,8 +595,6 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {

 template <>
 void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -611,8 +608,6 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co

 template <>
 void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -626,8 +621,6 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com

 template <typename C_Dtype>
 inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -699,8 +692,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP

 template <typename C_Dtype>
 inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  BGEMM_CHECK_ARGVALUES(at::BFloat16);
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1024,8 +1015,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty

 template <>
 void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1037,8 +1026,6 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {

 template <>
 void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1050,8 +1037,6 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {

 template <>
 void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1065,8 +1050,6 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp

 template <>
 void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1080,8 +1063,6 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl

 template <typename C_Dtype>
 inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1190,7 +1171,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(

 template <typename C_Dtype>
 inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
  cublasOperation_t opb = _cublasOpFromChar(transb);
@ -2404,8 +2384,6 @@ void trsmBatched<c10::complex<double>>(

 template <>
 void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
@ -2421,8 +2399,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
  // gemv is bw bound, and does not benefit from TF32. But the precision
  // loss still happens on TF32. So we disable it here.
  NoTF32Guard disable_tf32;
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
@ -2435,8 +2411,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {

 template <>
 void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
@ -2450,8 +2424,6 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
  // gemv is bw bound, and does not benefit from TF32. But the precision
  // loss still happens on TF32. So we disable it here.
  NoTF32Guard disable_tf32;
-  // See Note [Writing Nondeterministic Operations]
-  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t op = _cublasOpFromChar(trans);
  _cublasAdjustLdLevel2(m, n, &lda);
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@ -151,11 +151,6 @@ struct CUDACachingHostAllocatorImpl
  }

  bool query_event(EventPool::Event& event) override {
-    // Do not call cudaEventQuery if capturing is underway
-    if (at::cuda::currentStreamCaptureStatusMayInitCtx() !=
-        at::cuda::CaptureStatus::None) {
-      return false;
-    }
    cudaError_t err = cudaEventQuery(*event);
    if (err == cudaErrorNotReady) {
      (void)cudaGetLastError(); // clear CUDA error
--- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
@ -90,6 +90,10 @@ public:
    allocator_->setMemoryFraction(fraction, device);
  }

+  // Pass-through: returns whatever the wrapped allocator_ reports for
+  // `device`, without any processing on this side.
+  std::vector<HIPCachingAllocator::StreamSegmentSize> getExpandableSegmentSizes(c10::DeviceIndex device) override {
+    return allocator_->getExpandableSegmentSizes(device);
+  }
+
  void enable(bool value) override {
    allocator_->enable(value);
  }
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@ -2801,6 +2801,7 @@ Tensor matrix_exp(const Tensor& a) {
 // TODO This should be deprecated in favor of linalg_matrix_exp_differential
 //      in FunctionsManual.cpp
 Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) {
+  squareCheckInputs(self, "matrix_exp_backward");
  NoTF32Guard disable_tf32;
  return backward_analytic_function_of_a_matrix(
    self, grad,
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@ -2067,7 +2067,7 @@ Tensor _reshape_copy_symint(
    TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors");
  }

-  if (self.is_contiguous()) {
+  if (self.is_contiguous_or_false()) {
    return self.view_symint(shape).clone(at::MemoryFormat::Contiguous);
  } else {
    return at::_unsafe_view_symint(
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -1375,7 +1375,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
  if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
      && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
      // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
-      ||  (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
+      ||  (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty())))) {
    TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
    at::cuda::detail::f8f8bf16_rowwise(
        mat1,
--- a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
+++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
@ -8,7 +8,6 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/Dispatch.h>
 #include <ATen/DynamicLibrary.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/LinearAlgebra.h>
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@ -1041,8 +1041,8 @@ std::string generate_code(
  // and `extra_args` for computation call if
  // extra arguments to capture runtime state are passed.
  // (look at polygamma for example).
-  std::string extra_params = "";
-  std::string extra_args = "";
+  std::string extra_params;
+  std::string extra_args;
  for (size_t i = 0; i < extra_args_typenames.size(); i++) {
    auto type = std::string(extra_args_typenames[i]);
    auto name = "extra_arg_" + std::to_string(i);
@ -1352,7 +1352,7 @@ std::string generate_reduction_code(
    int vec_size,
    int max_threads_codegen) {
  TORCH_INTERNAL_ASSERT(desc.nInputs == 1);
-  TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0);
+  TORCH_INTERNAL_ASSERT(desc.extra_args_types.empty());

  return generate_reduction_code(
      desc.nOutputs,
@ -1451,7 +1451,7 @@ std::optional<std::string> get_cache_dir() {
  std::string cache_dir;
  char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH");
  // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user
-  std::string kernels_cache_dir = "";
+  std::string kernels_cache_dir;
  if (ptkcp != nullptr) {
    cache_dir = std::string(ptkcp);
  } else {
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
@ -14,7 +14,6 @@
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/native/LinearAlgebra.h>
-#include <ATen/native/BatchLinearAlgebra.h>
 #include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
 #include <ATen/native/cuda/linalg/MagmaUtils.h>
 #include <ATen/native/cpu/zmath.h>
@ -1615,16 +1614,7 @@ static void lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& i
  const auto preferred_backend = at::globalContext().linalgPreferredBackend();
 #ifdef USE_LINALG_SOLVER
  const auto lu_factor_cusolver = [batch_size, m, n](const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
-    // In CUDA 10.2, lu_factor_looped_cusolver does not finish the computations when the input
-    // matrix is exactly singular. The returned pivots contain garbage. This breaks linalg.det
-    // Now, batched_cublas does not handle rectangular matrices, so we still dispatch to
-    // looped_cusolver even if m != n.
-#ifdef USE_ROCM
-    constexpr bool looped_correct = true;
-#else
-    constexpr bool looped_correct = CUSOLVER_VERSION >= 11100;
-#endif
-    if (m != n || (looped_correct && (batch_size == 1 || m >= 512))) {
+    if (m != n || (batch_size == 1 || m >= 512)) {
      lu_factor_looped_cusolver(input, pivots, infos, compute_pivots);
    } else {
      lu_factor_batched_cublas(input, pivots, infos, compute_pivots);
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
@ -14,6 +14,7 @@ struct EmbeddingBagParams {
  ::c10::metal::array<idx_type_t, 2> output_strides;
  ::c10::metal::array<idx_type_t, 2> max_indices_strides;

+  bool use_per_sample_weights;
  idx_type_t per_sample_weights_stride;

  idx_type_t num_indices;
@ -23,3 +24,24 @@ struct EmbeddingBagParams {
  EmbeddingBagMode mode;
  int64_t padding_idx;
 };
+
+// Parameter block consumed by the `embedding_bag_backward` Metal kernel
+// (bound as its `params` argument). Member order is part of the layout the
+// kernel reads, so do not reorder fields.
+template <typename idx_type_t = uint32_t>
+struct EmbeddingBagBackwardParams {
+  // Strides of the weight-gradient buffer: [0] is multiplied by the weight
+  // row index, [1] by the feature index.
+  ::c10::metal::array<idx_type_t, 2> weight_grad_strides;
+  // Strides of the output-gradient buffer: [0] is multiplied by the bag
+  // index, [1] by the feature index.
+  ::c10::metal::array<idx_type_t, 2> output_grad_strides;
+  // Strides of the max_indices buffer: [0] is multiplied by the bag index,
+  // [1] by the feature index. Only read on the MAX path.
+  ::c10::metal::array<idx_type_t, 2> max_indices_strides;
+  // Forwarded to MaybeApplyPerSampleWeight on the SUM/MEAN path to decide
+  // whether per_sample_weights is read.
+  bool use_per_sample_weights;
+  // Multiplied by the index-entry position when reading per_sample_weights.
+  idx_type_t per_sample_weights_stride;
+  // Divisor/modulus used to split the thread id into (entry, feature).
+  idx_type_t feature_size;
+  // Selects the SUM, MEAN or MAX branch inside the kernel.
+  EmbeddingBagMode mode;
+  // On the SUM/MEAN path, index entries equal to this value are skipped.
+  int64_t padding_idx;
+};
+
+// Parameter block consumed by the
+// `embedding_bag_per_sample_weights_backward` Metal kernel (bound as its
+// `params` argument). Member order is part of the layout the kernel reads,
+// so do not reorder fields.
+template <typename idx_type_t = uint32_t>
+struct EmbeddingBagPerSampleWeightsBackwardParams {
+  // Strides of the output-gradient buffer: [0] is multiplied by the bag
+  // index, [1] by the feature index.
+  ::c10::metal::array<idx_type_t, 2> output_grad_strides;
+  // Strides of the weight buffer: [0] is multiplied by the weight row index,
+  // [1] by the feature index.
+  ::c10::metal::array<idx_type_t, 2> weight_strides;
+  // Multiplied by the index-entry position when writing the gradient.
+  idx_type_t per_sample_weights_grad_stride;
+  // Divisor/modulus used to split the thread id into (entry, feature).
+  idx_type_t feature_size;
+  // Index entries equal to this value are skipped.
+  int64_t padding_idx;
+};
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
@ -1,4 +1,5 @@
 #include <ATen/native/mps/kernels/EmbeddingBag.h>
+#include <c10/metal/atomic.h>
 #include <c10/metal/utils.h>
 #include <metal_array>
 #include <metal_stdlib>
@ -44,6 +45,7 @@ template <EmbeddingBagMode M, typename T>
 struct MaybeApplyPerSampleWeight {
  inline opmath_t<T> operator()(
      opmath_t<T> weight_val,
+      bool /*use_per_sample_weights*/,
      uint32_t /*per_sample_weights_index*/,
      constant T* /*per_sample_weights*/,
      uint32_t /*per_sample_weights_stride*/) {
@ -55,10 +57,11 @@ template <typename T>
 struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
  inline opmath_t<T> operator()(
      opmath_t<T> weight_val,
+      bool use_per_sample_weights,
      uint32_t per_sample_weights_index,
      constant T* per_sample_weights,
      uint32_t per_sample_weights_stride) {
-    if (per_sample_weights_stride) {
+    if (use_per_sample_weights) {
      T per_sample_weight = per_sample_weights
          [per_sample_weights_stride * per_sample_weights_index];
      return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
@ -154,6 +157,7 @@ void embedding_bag_impl(
  auto num_bags = params.num_bags;
  auto feature_size = params.feature_size;
  auto padding_idx = params.padding_idx;
+  auto use_per_sample_weights = params.use_per_sample_weights;
  auto per_sample_weights_stride = params.per_sample_weights_stride;
  constant auto& output_strides = params.output_strides;
  constant auto& weight_strides = params.weight_strides;
@ -183,7 +187,11 @@ void embedding_bag_impl(
             feature_idx * weight_strides[1]]);

    weight_val = MaybeApplyPerSampleWeight<M, T>()(
-        weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
+        weight_val,
+        use_per_sample_weights,
+        indices_idx,
+        per_sample_weights,
+        per_sample_weights_stride);

    auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);

@ -239,19 +247,208 @@ kernel void embedding_bag(
  }
 }

-#define REGISTER_EMBEDDING_BAG_OP(T, I)                             \
-  template [[host_name("embedding_bag_" #T "_" #I)]]                \
-  kernel void embedding_bag<T, I>(                                  \
-      constant T * weight [[buffer(0)]],                            \
-      constant I * indices [[buffer(1)]],                           \
-      constant I * offsets [[buffer(2)]],                           \
-      constant T * per_sample_weights [[buffer(3)]],                \
-      device T * output [[buffer(4)]],                              \
-      device I * offset2bag [[buffer(5)]],                          \
-      device I * bag_size [[buffer(6)]],                            \
-      device I * max_indices [[buffer(7)]],                         \
-      constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
-      uint tid [[thread_position_in_grid]]);
+// Mode-dependent scaling of a gradient contribution by the bag size.
+//
+// Primary template: returns `val` unchanged. It is used for every mode that
+// has no specialization below. The unused parameter name is commented out,
+// matching the convention of MaybeApplyPerSampleWeight's primary template.
+template <EmbeddingBagMode M, typename T>
+struct MaybeDivBagSize {
+  inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> /*bag_size*/) {
+    return val;
+  }
+};
+
+// MEAN specialization: returns `val / bag_size`. No zero check is done here;
+// the caller in embedding_bag_backward_sum_mean_impl only reaches this call
+// when the bag size is non-zero.
+template <typename T>
+struct MaybeDivBagSize<EmbeddingBagMode::MEAN, T> {
+  inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
+    return val / bag_size;
+  }
+};
+
+// Backward pass shared by the SUM and MEAN modes. Each thread handles one
+// (index entry, feature) pair: it reads the output gradient of the bag the
+// entry belongs to, lets the mode-specific functors adjust it, and adds the
+// result to the weight-gradient row selected by that index entry.
+template <EmbeddingBagMode M, typename T, typename I>
+void embedding_bag_backward_sum_mean_impl(
+    constant T* output_grad,
+    constant I* indices,
+    constant I* offset2bag,
+    constant I* bag_size,
+    constant T* per_sample_weights,
+    device AtomicType_t<T>* weight_grad,
+    constant EmbeddingBagBackwardParams<uint32_t>& params,
+    uint tid) {
+  const auto num_features = params.feature_size;
+  // tid enumerates (entry, feature) pairs with the feature varying fastest.
+  const auto entry = tid / num_features;
+  const auto bag = static_cast<uint32_t>(offset2bag[entry]);
+  const auto entries_in_bag = bag_size[bag];
+  const auto row = indices[entry];
+
+  // Empty bags and padding entries contribute nothing.
+  if (!entries_in_bag || row == params.padding_idx) {
+    return;
+  }
+
+  const auto feature = tid % num_features;
+  constant auto& grad_out_strides = params.output_grad_strides;
+  constant auto& grad_w_strides = params.weight_grad_strides;
+
+  const auto upstream = static_cast<opmath_t<T>>(
+      output_grad[bag * grad_out_strides[0] + feature * grad_out_strides[1]]);
+
+  // Mode-specific adjustments are resolved at compile time through M.
+  const opmath_t<T> weighted = MaybeApplyPerSampleWeight<M, T>()(
+      upstream,
+      params.use_per_sample_weights,
+      entry,
+      per_sample_weights,
+      params.per_sample_weights_stride);
+  const opmath_t<T> contribution = MaybeDivBagSize<M, T>()(
+      weighted, static_cast<opmath_t<T>>(entries_in_bag));
+
+  // Accumulated atomically -- presumably because several index entries can
+  // select the same weight row (confirm against the host-side dispatch).
+  AtomicType<T>::atomic_add(
+      weight_grad,
+      static_cast<int32_t>(row) * grad_w_strides[0] +
+          feature * grad_w_strides[1],
+      static_cast<T>(contribution));
+}
+
+// Backward pass for the MAX mode. Each thread handles one (bag, feature)
+// pair and adds that bag's output gradient to the weight-gradient row stored
+// in max_indices for the same (bag, feature).
+template <typename T, typename I>
+void embedding_bag_backward_max_impl(
+    constant T* output_grad,
+    constant I* bag_size,
+    constant I* max_indices,
+    device AtomicType_t<T>* weight_grad,
+    constant EmbeddingBagBackwardParams<uint32_t>& params,
+    uint tid) {
+  const auto num_features = params.feature_size;
+  // tid enumerates (bag, feature) pairs with the feature varying fastest.
+  const auto bag = tid / num_features;
+
+  // Empty bags contribute nothing.
+  if (!bag_size[bag]) {
+    return;
+  }
+
+  const auto feature = tid % num_features;
+  constant auto& grad_out_strides = params.output_grad_strides;
+  constant auto& argmax_strides = params.max_indices_strides;
+  constant auto& grad_w_strides = params.weight_grad_strides;
+
+  const auto upstream =
+      output_grad[bag * grad_out_strides[0] + feature * grad_out_strides[1]];
+  const auto winning_row = static_cast<uint32_t>(
+      max_indices[bag * argmax_strides[0] + feature * argmax_strides[1]]);
+
+  AtomicType<T>::atomic_add(
+      weight_grad,
+      winning_row * grad_w_strides[0] + feature * grad_w_strides[1],
+      upstream);
+}
+
+// Expands to a `return` of embedding_bag_backward_sum_mean_impl<MODE>(...).
+// The argument names are taken from the expansion site, so this macro is
+// only usable where output_grad, indices, offset2bag, bag_size,
+// per_sample_weights, weight_grad, params and tid are all in scope.
+#define DISPATCH_BACKWARD_SUM_MEAN_IMPL(MODE)        \
+  return embedding_bag_backward_sum_mean_impl<MODE>( \
+      output_grad,                                   \
+      indices,                                       \
+      offset2bag,                                    \
+      bag_size,                                      \
+      per_sample_weights,                            \
+      weight_grad,                                   \
+      params,                                        \
+      tid)
+
+// Kernel entry point for the embedding-bag weight gradient. It dispatches on
+// params.mode to the SUM/MEAN implementation or the MAX implementation.
+// Every case returns (the SUM/MEAN macro expands to a `return`), so there is
+// no fall-through between cases.
+template <typename T, typename I>
+kernel void embedding_bag_backward(
+    constant T* output_grad [[buffer(0)]],
+    constant I* indices [[buffer(1)]],
+    constant I* offset2bag [[buffer(2)]],
+    constant I* bag_size [[buffer(3)]],
+    constant I* max_indices [[buffer(4)]],
+    constant T* per_sample_weights [[buffer(5)]],
+    device AtomicType_t<T>* weight_grad [[buffer(6)]],
+    constant EmbeddingBagBackwardParams<uint32_t>& params [[buffer(7)]],
+    uint tid [[thread_position_in_grid]]) {
+  switch (params.mode) {
+    case EmbeddingBagMode::SUM:
+      DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::SUM);
+    case EmbeddingBagMode::MEAN:
+      DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::MEAN);
+    case EmbeddingBagMode::MAX:
+      // MAX does not use indices, offset2bag or per_sample_weights.
+      return embedding_bag_backward_max_impl(
+          output_grad, bag_size, max_indices, weight_grad, params, tid);
+  }
+}
+
+// Kernel computing the gradient with respect to per_sample_weights. Each
+// thread handles one (index entry, feature) pair: it multiplies the selected
+// weight element by the matching output-gradient element and atomically adds
+// the product into the gradient slot of that index entry.
+template <typename T, typename I>
+kernel void embedding_bag_per_sample_weights_backward(
+    constant T* output_grad [[buffer(0)]],
+    constant T* weight [[buffer(1)]],
+    constant I* indices [[buffer(2)]],
+    constant I* offset2bag [[buffer(3)]],
+    device AtomicType_t<T>* per_sample_weights_grad [[buffer(4)]],
+    constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t>& params
+    [[buffer(5)]],
+    uint tid [[thread_position_in_grid]]) {
+  const auto num_features = params.feature_size;
+  // tid enumerates (entry, feature) pairs with the feature varying fastest.
+  const auto entry = tid / num_features;
+  const auto row = indices[entry];
+
+  // Padding entries receive no gradient.
+  if (row == params.padding_idx) {
+    return;
+  }
+
+  const auto feature = tid % num_features;
+  const auto bag = static_cast<uint32_t>(offset2bag[entry]);
+  constant auto& grad_out_strides = params.output_grad_strides;
+  constant auto& w_strides = params.weight_strides;
+
+  const auto w = weight
+      [static_cast<uint32_t>(row) * w_strides[0] + feature * w_strides[1]];
+  const auto upstream =
+      output_grad[bag * grad_out_strides[0] + feature * grad_out_strides[1]];
+  // Multiply in opmath precision, then narrow back to T for the atomic add.
+  const auto product =
+      static_cast<opmath_t<T>>(w) * static_cast<opmath_t<T>>(upstream);
+
+  // All features of one entry target the same slot, hence the atomic add.
+  AtomicType<T>::atomic_add(
+      per_sample_weights_grad,
+      entry * params.per_sample_weights_grad_stride,
+      static_cast<T>(product));
+}
+
+// Explicitly instantiates the three embedding-bag kernels for value type T
+// and index type I, exporting them under the host names
+//   embedding_bag_<T>_<I>
+//   embedding_bag_backward_<T>_<I>
+//   embedding_bag_per_sample_weights_backward_<T>_<I>
+// The parameter lists must stay in sync with the kernel templates above.
+#define REGISTER_EMBEDDING_BAG_OP(T, I)                                     \
+  template [[host_name("embedding_bag_" #T "_" #I)]]                        \
+  kernel void embedding_bag<T, I>(                                          \
+      constant T * weight [[buffer(0)]],                                    \
+      constant I * indices [[buffer(1)]],                                   \
+      constant I * offsets [[buffer(2)]],                                   \
+      constant T * per_sample_weights [[buffer(3)]],                        \
+      device T * output [[buffer(4)]],                                      \
+      device I * offset2bag [[buffer(5)]],                                  \
+      device I * bag_size [[buffer(6)]],                                    \
+      device I * max_indices [[buffer(7)]],                                 \
+      constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]],         \
+      uint tid [[thread_position_in_grid]]);                                \
+                                                                            \
+  template [[host_name("embedding_bag_backward_" #T "_" #I)]]               \
+  kernel void embedding_bag_backward<T, I>(                                 \
+      constant T * output_grad [[buffer(0)]],                               \
+      constant I * indices [[buffer(1)]],                                   \
+      constant I * offset2bag [[buffer(2)]],                                \
+      constant I * bag_size [[buffer(3)]],                                  \
+      constant I * max_indices [[buffer(4)]],                               \
+      constant T * per_sample_weights [[buffer(5)]],                        \
+      device AtomicType_t<T> * weight_grad [[buffer(6)]],                   \
+      constant EmbeddingBagBackwardParams<uint32_t> & params [[buffer(7)]], \
+      uint tid [[thread_position_in_grid]]);                                \
+                                                                            \
+  template                                                                  \
+      [[host_name("embedding_bag_per_sample_weights_backward_" #T "_" #I)]] \
+      kernel void embedding_bag_per_sample_weights_backward<T, I>(          \
+          constant T * output_grad [[buffer(0)]],                           \
+          constant T * weight [[buffer(1)]],                                \
+          constant I * indices [[buffer(2)]],                               \
+          constant I * offset2bag [[buffer(3)]],                            \
+          device AtomicType_t<T> * per_sample_weights_grad [[buffer(4)]],   \
+          constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t> &   \
+              params [[buffer(5)]],                                         \
+          uint tid [[thread_position_in_grid]]);

 REGISTER_EMBEDDING_BAG_OP(float, int);
 REGISTER_EMBEDDING_BAG_OP(float, long);
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -13,8 +13,10 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
+#include <ATen/ops/_embedding_bag_dense_backward_native.h>
 #include <ATen/ops/_embedding_bag_forward_only_native.h>
 #include <ATen/ops/_embedding_bag_native.h>
+#include <ATen/ops/_embedding_bag_per_sample_weights_backward_native.h>
 #include <ATen/ops/empty.h>
 #endif

@ -95,6 +97,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
  }

  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
+  params.use_per_sample_weights = use_per_sample_weights;
  params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;

  params.num_indices = num_indices;
@ -177,4 +180,117 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
                            padding_idx);
 }

+// Computes the dense weight gradient of embedding_bag on MPS.
+//
+// Allocates a zero-filled {num_weights, feature_size} tensor with
+// output_grad's options and dispatches the
+// `embedding_bag_backward_<T>_<I>` Metal kernel, which receives it as its
+// `weight_grad` buffer. The launch uses one thread per output_grad element in
+// MAX mode and one thread per (index, feature) pair otherwise.
+//
+// NOTE(review): `scale_grad_by_freq` is accepted but never read in this
+// function -- confirm that frequency scaling is rejected or handled by the
+// caller for MPS.
+Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
+                                         const Tensor& indices,
+                                         const Tensor& offset2bag,
+                                         const Tensor& bag_size,
+                                         const Tensor& max_indices,
+                                         int64_t num_weights,
+                                         bool scale_grad_by_freq,
+                                         int64_t mode,
+                                         const std::optional<Tensor>& per_sample_weights_opt,
+                                         int64_t padding_idx) {
+  // indices and offset2bag are assumed having correct dtypes and
+  // contiguous here due to the checks in _embedding_bag_backward in
+  // EmbeddingBag.cpp.
+  // Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml
+  // for more details.
+
+  int64_t feature_size = output_grad.size(1);
+  auto weight_grad = at::zeros({num_weights, feature_size}, output_grad.options());
+  EmbeddingBagBackwardParams<uint32_t> params;
+
+  // The kernel indexes with 32-bit strides; fail loudly instead of silently
+  // truncating a stride that does not fit.
+  for (const auto dim : c10::irange(2)) {
+    params.output_grad_strides[dim] = safe_downcast<uint32_t, int64_t>(output_grad.stride(dim));
+    params.weight_grad_strides[dim] = safe_downcast<uint32_t, int64_t>(weight_grad.stride(dim));
+
+    // max_indices is only consulted in MAX mode.
+    if (mode == EmbeddingBagMode::MAX) {
+      params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
+    }
+  }
+
+  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
+  params.use_per_sample_weights = use_per_sample_weights;
+  params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
+  params.feature_size = feature_size;
+  params.mode = static_cast<EmbeddingBagMode>(mode);
+  params.padding_idx = padding_idx;
+
+  auto num_indices = offset2bag.numel();
+  auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
+  MPSStream* stream = getCurrentMPSStream();
+
+  mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
+      auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
+                                                                    mps::scalarToMetalTypeString(output_grad),
+                                                                    mps::scalarToMetalTypeString(indices)));
+
+      // Label the profiler region after the backward kernel so it is not
+      // confused with the forward "embedding_bag" kernel in traces.
+      getMPSProfiler().beginProfileKernel(
+          pipeline_state, "embedding_bag_backward", {output_grad, indices, offset2bag, bag_size});
+      [computeEncoder setComputePipelineState:pipeline_state];
+      // Argument order must match the [[buffer(N)]] indices of the kernel.
+      mps::mtl_setArgs(computeEncoder,
+                       output_grad,
+                       indices,
+                       offset2bag,
+                       bag_size,
+                       max_indices,
+                       use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
+                       weight_grad,
+                       params);
+
+      mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
+      getMPSProfiler().endProfileKernel(pipeline_state);
+    }
+  });
+
+  // Returning the local by name already moves (or elides the copy);
+  // std::move here would only inhibit copy elision.
+  return weight_grad;
+}
+
+// Computes the gradient with respect to per_sample_weights for embedding_bag
+// on MPS. Only SUM mode is accepted (asserted below).
+//
+// Allocates a zero-filled {num_indices} tensor with output_grad's options and
+// dispatches the `embedding_bag_per_sample_weights_backward_<T>_<I>` Metal
+// kernel with one thread per (index, feature) pair.
+//
+// NOTE(review): `offsets` is accepted but never read in this function; the
+// kernel is given `offset2bag` instead.
+Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
+                                                      const Tensor& weight,
+                                                      const Tensor& indices,
+                                                      const Tensor& offsets,
+                                                      const Tensor& offset2bag,
+                                                      int64_t mode,
+                                                      int64_t padding_idx) {
+  TORCH_INTERNAL_ASSERT(static_cast<EmbeddingBagMode>(mode) == EmbeddingBagMode::SUM);
+  int64_t num_indices = indices.size(0);
+  int64_t feature_size = output_grad.size(1);
+  auto per_sample_weights_grad = at::zeros({num_indices}, output_grad.options());
+  // Spell out the index type so it visibly matches the kernel's
+  // `EmbeddingBagPerSampleWeightsBackwardParams<uint32_t>` parameter.
+  EmbeddingBagPerSampleWeightsBackwardParams<uint32_t> params;
+
+  // The kernel indexes with 32-bit strides; fail loudly instead of silently
+  // truncating a stride that does not fit.
+  for (const auto dim : c10::irange(2)) {
+    params.output_grad_strides[dim] = safe_downcast<uint32_t, int64_t>(output_grad.stride(dim));
+    params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
+  }
+
+  params.per_sample_weights_grad_stride = per_sample_weights_grad.stride(0);
+  params.feature_size = feature_size;
+  params.padding_idx = padding_idx;
+
+  auto num_threads = num_indices * feature_size;
+  MPSStream* stream = getCurrentMPSStream();
+
+  mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
+      auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
+                                                                    mps::scalarToMetalTypeString(output_grad),
+                                                                    mps::scalarToMetalTypeString(indices)));
+
+      getMPSProfiler().beginProfileKernel(
+          pipeline_state, "embedding_bag_per_sample_weights_backward", {output_grad, weight, indices, offset2bag});
+      [computeEncoder setComputePipelineState:pipeline_state];
+      // Argument order must match the [[buffer(N)]] indices of the kernel.
+      mps::mtl_setArgs(computeEncoder, output_grad, weight, indices, offset2bag, per_sample_weights_grad, params);
+
+      mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
+      getMPSProfiler().endProfileKernel(pipeline_state);
+    }
+  });
+
+  // Returning the local by name already moves (or elides the copy);
+  // std::move here would only inhibit copy elision.
+  return per_sample_weights_grad;
+}
+
 } // namespace at::native
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2379,7 +2379,7 @@

 - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
  dispatch:
-    CPU, CUDA: _embedding_bag_backward_symint
+    CPU, CUDA, MPS: _embedding_bag_backward_symint

 - func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
  dispatch:
@ -2389,12 +2389,14 @@
  dispatch:
    CPU: _embedding_bag_dense_backward_cpu
    CUDA: _embedding_bag_dense_backward_cuda
+    MPS: _embedding_bag_dense_backward_mps
  autogen: _embedding_bag_dense_backward.out

 - func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor
  dispatch:
    CPU: _embedding_bag_per_sample_weights_backward_cpu
    CUDA: _embedding_bag_per_sample_weights_backward_cuda
+    MPS: _embedding_bag_per_sample_weights_backward_mps
  autogen: _embedding_bag_per_sample_weights_backward.out

 - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
@ -10256,6 +10258,7 @@
  structured: True
  dispatch:
    CPU, CUDA: all_all_out
+    MTIA: all_all_out_mtia
    MPS: all_all_out_mps

 - func: any(Tensor self) -> Tensor
--- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
@ -68,29 +68,6 @@ c10::MaybeOwned<Tensor> prepare_dense_matrix_for_cusparse(
  }
 }

-// This function is used for old CUDA Toolkit versions that doesn't support new cuSPARSE Generic API
-void addmm_out_legacy(
-    const at::sparse_csr::SparseCsrTensor& mat1,
-    const Tensor& mat2,
-    const Scalar& beta,
-    const Scalar& alpha,
-    const Tensor& result) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr());
-  auto nnz = mat1._nnz();
-  auto m = mat1.size(0);
-  auto k = mat1.size(1);
-  auto n = mat2.size(1);
-  auto crow_indices = mat1.crow_indices().to(kInt);
-  auto col_indices = mat1.col_indices().to(kInt);
-  auto values = mat1.values();
-  auto mat2_ = at::native::expect_resolved_conj(mat2);
-  auto result_ = at::native::expect_resolved_conj(result);
-  at::native::s_addmm_out_csr_sparse_dense_cuda_worker(nnz, m, n, k, result, beta, *result_, alpha, crow_indices, col_indices, values, *mat2_);
-  if (!result.is_same(*result_)) {
-    result.copy_(*result_);
-  }
-}
-
 c10::MaybeOwned<Tensor> inline prepare_dense_vector_for_cusparse(
    const Tensor& tensor) {
  if (tensor.is_non_overlapping_and_dense()) {
@ -582,9 +559,6 @@ void spmm(
    const Scalar& beta,
    const Scalar& alpha,
    const Tensor& result) {
-#if !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API())
-  addmm_out_legacy(mat1, mat2, beta, alpha, result);
-#else
  c10::MaybeOwned<Tensor> result_ = prepare_dense_matrix_for_cusparse(result);
  c10::MaybeOwned<Tensor> mat2_ = prepare_dense_matrix_for_cusparse(mat2);

@ -683,7 +657,6 @@ void spmm(
  if (!result.is_same(*result_)) {
    result.copy_(*result_);
  }
-#endif // !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API())
 }

 void spgemm(
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
@ -672,7 +672,7 @@ Tensor bmm_sparse_cuda(const SparseTensor& self, const Tensor& mat2) {
  return bmm_out_sparse_cuda(self, mat2, result);
 }

-#if defined(USE_ROCM) || !(defined(_MSC_VER) && CUSPARSE_VERSION < 11000)
+#if defined(USE_ROCM) || defined(CUSPARSE_VERSION)
 __global__ void search_end_matrix_indices_cuda_kernel(
  int64_t* mat_el_end_indices,
  int64_t num_matrices,
@ -745,10 +745,6 @@ cudaDataType getTensorCudaDataType(Tensor self) {
 #endif

 Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor& result) {
-#if defined(_MSC_VER) && (CUSPARSE_VERSION < 11000)
-  TORCH_CHECK(false, "bmm sparse-dense CUDA is not supported on Windows with cuda before 11.0");
-#elif defined(USE_ROCM) || (defined(CUDART_VERSION) && (CUDART_VERSION >= 10010))  // linux cuda >= 10.1 or windows cuda >= 11.0
-
  TORCH_CHECK(!mat2.is_sparse(), "bmm_sparse: Tensor 'mat2' must be dense");
  TORCH_CHECK(self.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self.dense_dim());
  TORCH_CHECK(self.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self.sparse_dim());
@ -944,10 +940,6 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
  // them in column-major order in memory
  result.transpose_(1,2);

-#else
-  TORCH_CHECK(false, "bmm sparse-dense requires CUDA 10.1 or greater");
-#endif
-
  return result;
 }

--- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
@ -40,7 +40,7 @@
 #include <thrust/iterator/discard_iterator.h>


-#if defined(__CUDACC__) && ((CUSPARSE_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
+#if defined(__CUDACC__) && (defined(CUSPARSE_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
 #define IS_CUSPARSE11_AVAILABLE() 1
 #else
 #define IS_CUSPARSE11_AVAILABLE() 0
@ -689,13 +689,6 @@ void sparse_sparse_matmul_cuda_kernel(
        std::is_same_v<c10::complex<double>, scalar_t>,
    "sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double.");

-  // older versions of cusparse on Windows segfault for complex128 dtype
-#if defined(_WIN32) && defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11400
-  TORCH_CHECK(
-      !(mat1.scalar_type() == ScalarType::ComplexDouble),
-      "Sparse multiplication with complex128 dtype inputs is not supported with current CUDA version. Please upgrade to CUDA Toolkit 11.2.1+");
-#endif
-
  Tensor mat1_indices_ = mat1._indices().contiguous();
  Tensor mat1_values = mat1._values().contiguous();

--- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp
+++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp
@ -5,6 +5,7 @@
 #include <torch/csrc/profiler/orchestration/vulkan.h>
 #endif // USE_KINETO

+#include <algorithm>
 #include <cmath>
 #include <iomanip>
 #include <iostream>
--- a/aten/src/ATen/test/cuda_allocator_test.cpp
+++ b/aten/src/ATen/test/cuda_allocator_test.cpp
@ -1,10 +1,83 @@
 #include <gtest/gtest.h>

 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDACachingAllocator.h>

 #include <ATen/test/allocator_clone_test.h>

+#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
+
+std::unordered_map<void*, size_t> allocation_sizes;
+
+// Allocation hook for the pluggable allocator: forwards to cudaMalloc and
+// records the requested size in allocation_sizes so that logging_free can
+// verify the size reported at free time. `device` and `stream` are unused.
+// Throws std::runtime_error if cudaMalloc fails.
+void* logging_malloc(size_t size, int device, cudaStream_t stream) {
+    void* ptr = nullptr;
+    const cudaError_t err = cudaMalloc(&ptr, size);
+    if (err != cudaSuccess) {
+        // Never record (or hand out) a pointer that was not actually allocated.
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+    allocation_sizes[ptr] = size;
+    return ptr;
+}
+
+// Deallocation hook paired with logging_malloc. Throws std::runtime_error if
+// `ptr` was never recorded or if `size` differs from the size recorded at
+// allocation; otherwise releases the memory with cudaFree and forgets the
+// entry. `device` and `stream` are unused.
+void logging_free(void* ptr, size_t size, int device, cudaStream_t stream) {
+    const auto entry = allocation_sizes.find(ptr);
+    if (entry == allocation_sizes.end()) {
+        throw std::runtime_error("free of unknown ptr");
+    }
+    if (entry->second != size) {
+        throw std::runtime_error("free mismatch");
+    }
+    cudaFree(ptr);
+    allocation_sizes.erase(entry);
+}
+
+// Installs a pluggable allocator built from logging_malloc/logging_free (the
+// free hook throws when the size passed to free differs from the size seen at
+// malloc), then runs at::unique_consecutive three times on a small sorted CUDA
+// tensor and validates the returned values and counts.
+// Returns early (test passes) when CUDA is not available.
+// NOTE(review): the custom allocator is installed with changeCurrentAllocator
+// and is not restored inside this test -- confirm this is acceptable for tests
+// that run afterwards in the same binary.
+TEST(TestTorchUnique, UniqueComparisonTest) {
+  if (!at::cuda::is_available()) return;
+  auto custom_allocator =
+      torch::cuda::CUDAPluggableAllocator::createCustomAllocator(logging_malloc, logging_free);
+  torch::cuda::CUDAPluggableAllocator::changeCurrentAllocator(custom_allocator);
+  // Run the command 3 times; the first 2 will pass and the third invocation will have
+  // different sizes in alloc and free if the test fails.
+  for (int i = 0; i < 3; ++i) {
+    // Initialize simple sorted tensor with repeats
+    at::Tensor sorted_tensor =
+        at::tensor({0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 5},
+                      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
+
+    // This operation will call malloc/free with different sizes on the same pointer
+    auto unique_dim_result = at::unique_consecutive(sorted_tensor, false, true, 0);
+
+    // Everything below is only there to validate correct results
+    // Element 0 of the result holds the unique values, element 2 the counts.
+    auto unique_dim_values = std::get<0>(unique_dim_result);
+    auto unique_dim_counts = std::get<2>(unique_dim_result);
+
+    // Check tensor sizes
+    EXPECT_EQ(unique_dim_values.size(0), 5);
+    EXPECT_EQ(unique_dim_counts.size(0), 5);
+
+    // Copy to CPU before accessing elements
+    at::Tensor cpu_values = unique_dim_values.cpu();
+    at::Tensor cpu_counts = unique_dim_counts.cpu();
+
+    // Use accessors on the CPU tensors
+    auto values_accessor = cpu_values.accessor<float, 1>();
+    auto counts_accessor = cpu_counts.accessor<int64_t, 1>();
+
+    // Check individual values using accessors
+    EXPECT_EQ(values_accessor[0], 0.0f);
+    EXPECT_EQ(values_accessor[1], 1.0f);
+    EXPECT_EQ(values_accessor[2], 2.0f);
+    EXPECT_EQ(values_accessor[3], 3.0f);
+    EXPECT_EQ(values_accessor[4], 5.0f);
+
+    // Check count values using accessors
+    EXPECT_EQ(counts_accessor[0], 3);
+    EXPECT_EQ(counts_accessor[1], 2);
+    EXPECT_EQ(counts_accessor[2], 1);
+    EXPECT_EQ(counts_accessor[3], 4);
+    EXPECT_EQ(counts_accessor[4], 1);
+  }
+}
+
 TEST(AllocatorTestCUDA, test_clone) {
+  // Nothing to exercise without a CUDA device: return early so the test
+  // passes instead of failing.
+  if (!at::cuda::is_available()) return;
   test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
--- a/aten/src/ATen/test/cuda_caching_host_allocator_test.cpp
+++ b/aten/src/ATen/test/cuda_caching_host_allocator_test.cpp
@ -19,10 +19,10 @@ TEST(CachingHostAllocatorTest, check_stats) {
  // Clear the stats and ensure they are zero.
  size_t round_size = c10::llvm::PowerOf2Ceil(N);
  auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
-  ASSERT_EQ(stats.allocation.current, 0);
-  ASSERT_EQ(stats.allocation.peak, 0);
-  ASSERT_EQ(stats.allocation.allocated, 0);
-  ASSERT_EQ(stats.allocation.freed, 0);
+  ASSERT_EQ(stats.allocations.current, 0);
+  ASSERT_EQ(stats.allocations.peak, 0);
+  ASSERT_EQ(stats.allocations.allocated, 0);
+  ASSERT_EQ(stats.allocations.freed, 0);

  void* ptr{nullptr};
  void* ctx{nullptr};
@ -32,14 +32,10 @@ TEST(CachingHostAllocatorTest, check_stats) {
    ptr = pinned_tensor.data_ptr();
    ctx = pinned_tensor.storage().data_ptr().get_context();
    auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
-    ASSERT_EQ(stats.allocation.current, 1);
-    ASSERT_EQ(stats.allocation.peak, 1);
-    ASSERT_EQ(stats.allocation.allocated, 1);
-    ASSERT_EQ(stats.allocation.freed, 0);
-    ASSERT_EQ(stats.segment.allocated, 1);
-    ASSERT_EQ(stats.segment.freed, 0);
-    ASSERT_EQ(stats.reserved_bytes.current, round_size);
-    ASSERT_EQ(stats.allocated_bytes.current, round_size);
+    ASSERT_EQ(stats.allocations.current, 1);
+    ASSERT_EQ(stats.allocations.peak, 1);
+    ASSERT_EQ(stats.allocations.allocated, 1);
+    // We dont track active bytes as free blocks are added in process_events
    ASSERT_EQ(stats.host_alloc_time.max, stats.host_alloc_time.min);
    ASSERT_EQ(stats.host_free_time.total, 0);
  }
@ -50,13 +46,9 @@ TEST(CachingHostAllocatorTest, check_stats) {
    auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
    ASSERT_EQ(ptr, pinned_tensor.data_ptr());
    ASSERT_EQ(ctx, pinned_tensor.storage().data_ptr().get_context());
-    ASSERT_EQ(stats.allocation.current, 1);
-    ASSERT_EQ(stats.allocation.peak, 1);
-    ASSERT_EQ(stats.allocation.allocated, 2);
-    ASSERT_EQ(stats.allocation.freed, 1);
-    ASSERT_EQ(stats.segment.allocated, 1);
-    ASSERT_EQ(stats.segment.freed, 0);
-    ASSERT_EQ(stats.reserved_bytes.current, round_size);
+    ASSERT_EQ(stats.allocations.current, 1);
+    ASSERT_EQ(stats.allocations.peak, 1);
+    ASSERT_EQ(stats.allocations.allocated, 1);
    ASSERT_EQ(stats.allocated_bytes.current, round_size);
  }
  // Ensure we don't reuse the allocation, due to size mismatch.
@ -68,14 +60,10 @@ TEST(CachingHostAllocatorTest, check_stats) {
    auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
    ASSERT_NE(ptr, pinned_tensor.data_ptr());
    ASSERT_NE(ctx, pinned_tensor.storage().data_ptr().get_context());
-    ASSERT_EQ(stats.allocation.current, 1);
-    ASSERT_EQ(stats.allocation.peak, 2);
-    ASSERT_EQ(stats.allocation.allocated, 3);
-    ASSERT_EQ(stats.allocation.freed, 2);
-    ASSERT_EQ(stats.segment.allocated, 2);
-    ASSERT_EQ(stats.segment.freed, 0);
-    ASSERT_EQ(stats.reserved_bytes.current, round_size + new_round_size);
-    ASSERT_EQ(stats.allocated_bytes.current, new_round_size);
+    ASSERT_EQ(stats.allocations.current, 2);
+    ASSERT_EQ(stats.allocations.peak, 2);
+    ASSERT_EQ(stats.allocations.allocated, 2);
+    ASSERT_EQ(stats.allocated_bytes.current, new_round_size + round_size);
    ASSERT_NE(stats.host_alloc_time.total, stats.host_alloc_time.min);
  }

@ -83,13 +71,10 @@ TEST(CachingHostAllocatorTest, check_stats) {
  {
    at::getHostAllocator(at::kCUDA)->empty_cache();
    auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
-    ASSERT_EQ(stats.allocation.current, 0);
+    ASSERT_EQ(stats.allocations.current, 0);
    ASSERT_EQ(stats.allocated_bytes.current, 0);
-    ASSERT_EQ(stats.allocation.peak, 2);
-    ASSERT_EQ(stats.allocation.allocated, 3);
-    ASSERT_EQ(stats.allocation.freed, 3);
-    ASSERT_EQ(stats.segment.allocated, 2);
-    ASSERT_EQ(stats.segment.freed, 2);
+    ASSERT_EQ(stats.allocations.peak, 2);
+    ASSERT_EQ(stats.allocations.allocated, 2);
    ASSERT_EQ(stats.num_host_alloc, 2);
    ASSERT_EQ(stats.num_host_free, 2);
    ASSERT_NE(stats.host_free_time.total, stats.host_free_time.min);
@ -100,9 +85,9 @@ TEST(CachingHostAllocatorTest, check_stats) {
    at::getHostAllocator(at::kCUDA)->reset_accumulated_stats();
    at::getHostAllocator(at::kCUDA)->reset_peak_stats();
    auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
-    ASSERT_EQ(stats.allocation.peak, 0);
-    ASSERT_EQ(stats.allocation.allocated, 0);
-    ASSERT_EQ(stats.allocation.freed, 0);
+    ASSERT_EQ(stats.allocations.peak, 0);
+    ASSERT_EQ(stats.allocations.allocated, 0);
+    ASSERT_EQ(stats.allocations.freed, 0);
    ASSERT_EQ(stats.allocated_bytes.peak, 0);
    ASSERT_EQ(stats.num_host_alloc, 0);
    ASSERT_EQ(stats.num_host_free, 0);
--- a/aten/tools/run_tests.sh
+++ b/aten/tools/run_tests.sh
@ -50,6 +50,7 @@ run_if_exists cuda_complex_test
 run_if_exists cuda_complex_math_test
 run_if_exists cuda_cub_test
 run_if_exists cuda_atomic_ops_test
+run_if_exists cuda_allocator_test

 if [ "$VALGRIND" == "ON" ]; then
  # NB: As these tests are invoked by valgrind, let's leave them for now as it's
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -897,6 +897,7 @@ libtorch_python_core_sources = [
    "torch/csrc/Stream.cpp",
    "torch/csrc/Event.cpp",
    "torch/csrc/TypeInfo.cpp",
+    "torch/csrc/acc/Module.cpp",
    "torch/csrc/api/src/python/init.cpp",
    "torch/csrc/autograd/functions/init.cpp",
    "torch/csrc/autograd/init.cpp",
--- a/c10/core/impl/DeviceGuardImplInterface.cpp
+++ b/c10/core/impl/DeviceGuardImplInterface.cpp
@ -9,16 +9,22 @@ std::array<
    static_cast<size_t>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)>
    device_guard_impl_registry;

-DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
+// Stores `impl` in the device_guard_impl_registry slot for `type`, replacing
+// whatever pointer was stored there before.
+void registerDeviceGuard(
     DeviceType type,
     const DeviceGuardImplInterface* impl) {
   device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
 }

+// Constructing a registrar registers `impl` for `type` by delegating to
+// registerDeviceGuard.
+DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
+    DeviceType type,
+    const DeviceGuardImplInterface* impl) {
+  registerDeviceGuard(type, impl);
+}
+
 namespace {
 thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
    nullptr;
-}
+} // namespace

 void ensureCUDADeviceGuardSet() {
  constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);
--- a/c10/core/impl/DeviceGuardImplInterface.h
+++ b/c10/core/impl/DeviceGuardImplInterface.h
@ -368,6 +368,9 @@ inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
  return p;
 }

+void C10_API
+registerDeviceGuard(DeviceType type, const DeviceGuardImplInterface* impl);
+
 inline bool hasDeviceGuardImpl(DeviceType type) {
  return device_guard_impl_registry[static_cast<size_t>(type)].load();
 }
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -382,6 +382,7 @@ struct ExpandableSegment {
        peers_(std::move(peers)) {
    cudaDeviceProp prop{};
    C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_));
+    mapped_size_ = 0;
    // we allocate enough address space for 1 1/8 the total memory on the GPU.
    // This allows for some cases where we have to unmap pages earlier in the
    // segment to put them at the end.
@ -493,6 +494,7 @@ struct ExpandableSegment {
      return SegmentRange{range.ptr, 0};
    }
    unmapHandles(begin, end);
+    mapped_size_ -= (end - begin) * segment_size_;
    return rangeFromHandles(begin, end);
  }

@ -632,6 +634,18 @@ struct ExpandableSegment {
    return max_handles_ * segment_size_;
  }

+  // Returns the stream recorded for this segment, or nullptr when no stream
+  // was recorded (stream_ is an empty optional).
+  cudaStream_t getStream() {
+    // Dereferencing an empty std::optional is undefined behavior, so fall
+    // back to nullptr instead of using operator*.
+    return stream_.value_or(nullptr);
+  }
+
+  // Returns the running mapped_size_ counter, which this struct adjusts in
+  // multiples of segment_size_ when handle ranges are mapped or unmapped.
+  size_t getMappedSize() {
+    return mapped_size_;
+  }
+
+  // Returns segment_size_, the granularity used for each mapped handle.
+  size_t getSegmentSize() {
+    return segment_size_;
+  }
+
  void addPeer(c10::DeviceIndex device) {
    peers_.push_back(device);
    forEachAllocatedRange(
@ -666,6 +680,7 @@ struct ExpandableSegment {
          handles_.at(i).value().handle,
          0ULL));
    }
+    mapped_size_ += (end - begin) * segment_size_;
    setAccess(device_, begin, end);
    for (auto p : peers_) {
      setAccess(p, begin, end);
@ -734,6 +749,7 @@ struct ExpandableSegment {
  std::optional<cudaStream_t> stream_;
  CUdeviceptr ptr_{};
  size_t segment_size_;
+  size_t mapped_size_;
  size_t max_handles_;
  struct Handle {
    CUmemGenericAllocationHandle handle;
@ -779,6 +795,17 @@ struct ExpandableSegment {
  size_t size() const {
    return 0;
  }
+  // Stub: this variant never has a stream, so always report nullptr.
+  cudaStream_t getStream() {
+    return nullptr;
+  }
+
+  // Stub: nothing is ever mapped, so the mapped size is always 0.
+  size_t getMappedSize() {
+    return 0;
+  }
+
+  // Stub: there is no segment granularity to report, so always 0.
+  size_t getSegmentSize() {
+    return 0;
+  }
  void addPeer(c10::DeviceIndex device) {}
 };
 #endif
@ -2011,6 +2038,22 @@ class DeviceCachingAllocator {
    set_fraction = true;
  }

+  /**
+   * Collects one entry per expandable segment on this device whose stream is
+   * non-null: the stream, whether the segment size equals kSmallBuffer, and
+   * the segment's mapped size. Segments with a null stream are skipped.
+   * The allocator mutex is held while the segment list is walked.
+   **/
+  std::vector<StreamSegmentSize> getExpandableSegmentSizes() {
+    std::lock_guard<std::recursive_mutex> guard(mutex);
+    std::vector<StreamSegmentSize> result;
+    for (auto& seg : expandable_segments_) {
+      const cudaStream_t seg_stream = seg->getStream();
+      if (seg_stream != nullptr) {
+        const bool uses_small_pool = seg->getSegmentSize() == kSmallBuffer;
+        result.emplace_back(seg_stream, uses_small_pool, seg->getMappedSize());
+      }
+    }
+    return result;
+  }
+
  /** returns cached blocks to the system allocator **/
  void emptyCache(MempoolId_t mempool_id) {
    auto context = maybeGatherContext(RecordContext::ALL);
@ -3838,6 +3881,16 @@ class NativeCachingAllocator : public CUDAAllocator {
    device_allocator[device]->setMemoryFraction(fraction);
  }

+  // Returns the expandable-segment sizes of the allocator for `device`.
+  // Asserts that `device` is a valid index into device_allocator before
+  // forwarding to that per-device allocator.
+  std::vector<StreamSegmentSize> getExpandableSegmentSizes(
+      c10::DeviceIndex device) override {
+    TORCH_INTERNAL_ASSERT(
+        0 <= device && static_cast<size_t>(device) < device_allocator.size(),
+        "Allocator not initialized for device ",
+        device,
+        ": did you call init?");
+    return device_allocator[device]->getExpandableSegmentSizes();
+  }
+
  void recordHistory(
      bool enabled,
      CreateContextFn context_recorder,
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -203,6 +203,14 @@ struct ShareableHandle {
  std::string handle;
 };

+// One record of an expandable segment's size, tagged with its stream and
+// pool kind.
+struct StreamSegmentSize {
+  // The parameter is deliberately not called `small`: Windows' rpcndr.h
+  // defines `small` as a macro, which breaks any translation unit that
+  // includes this header after <windows.h>.
+  StreamSegmentSize(cudaStream_t s, bool is_small, size_t sz)
+      : stream(s), is_small_pool(is_small), total_size(sz) {}
+  // Stream the segment is associated with.
+  cudaStream_t stream;
+  // True when the entry describes a small-pool segment.
+  bool is_small_pool;
+  // Size in bytes reported for the segment.
+  size_t total_size;
+};
+
 class CUDAAllocator : public DeviceAllocator {
 public:
  virtual void* raw_alloc(size_t nbytes) = 0;
@ -211,6 +219,8 @@ class CUDAAllocator : public DeviceAllocator {
  virtual void init(int device_count) = 0;
  virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
  virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
+  virtual std::vector<StreamSegmentSize> getExpandableSegmentSizes(
+      c10::DeviceIndex device) = 0;
  virtual void enable(bool value) = 0;
  virtual bool isEnabled() const = 0;
  virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
@ -365,6 +375,11 @@ inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
  return get()->setMemoryFraction(fraction, device);
 }

+// Convenience wrapper: forwards to getExpandableSegmentSizes(device) on the
+// allocator returned by get().
+inline std::vector<StreamSegmentSize> getExpandableSegmentSizes(
+    c10::DeviceIndex device) {
+  return get()->getExpandableSegmentSizes(device);
+}
+
 inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
  return get()->emptyCache(mempool_id);
 }
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@ -495,6 +495,13 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
    // introduces performance nondeterminism.
  }

+  // Not supported by this allocator: unconditionally fails via TORCH_CHECK,
+  // so no value is ever returned.
+  std::vector<StreamSegmentSize> getExpandableSegmentSizes(
+      c10::DeviceIndex device) override {
+    TORCH_CHECK(
+        false,
+        "CUDAMallocAsyncAllocator does not yet support getExpandableSegmentSizes.");
+  }
+
  void emptyCache(/*unused*/ MempoolId_t mempool_id) override {
    std::lock_guard<std::mutex> lk(general_mutex);

--- a/caffe2/perfkernels/batch_box_cox_sve128.cc
+++ b/caffe2/perfkernels/batch_box_cox_sve128.cc
@ -2,175 +2,126 @@
 #include <arm_neon.h>
 #include <arm_neon_sve_bridge.h>
 #include <arm_sve.h>
+#include <cfloat>
+#include <cmath>

 #include "c10/macros/Macros.h"

-// Log and exp approximations inspired from ACL implementation
+/// Select `svlog` accuracy:
+/// - 0: original.
+/// - 1: more accurate, similar performance.
+/// - 2: very high accuracy, a bit lower speed.
+#define SVLOG_ACCURACY 2

-inline float32x4_t vtaylor_polyq_for_log_f32(float32x4_t x) {
-  const float32x4_t log_tab_1 = vdupq_n_f32(-2.29561495781f);
-  const float32x4_t log_tab_2 = vdupq_n_f32(-2.47071170807f);
-  const float32x4_t log_tab_3 = vdupq_n_f32(-5.68692588806f);
-  const float32x4_t log_tab_4 = vdupq_n_f32(-0.165253549814f);
-  const float32x4_t log_tab_5 = vdupq_n_f32(5.17591238022f);
-  const float32x4_t log_tab_6 = vdupq_n_f32(0.844007015228f);
-  const float32x4_t log_tab_7 = vdupq_n_f32(4.58445882797f);
-  const float32x4_t log_tab_8 = vdupq_n_f32(0.0141278216615f);
+/// Handle special cases in `svexp`:
+/// - 0: original.
+/// - 1: use clamp, better performance.
+/// - 2: no special case handling.
+#define SVEXP_SPECIAL_CLAMP 1

-  float32x4_t A = vmlaq_f32(log_tab_1, log_tab_5, x);
-  float32x4_t B = vmlaq_f32(log_tab_3, log_tab_7, x);
-  float32x4_t C = vmlaq_f32(log_tab_2, log_tab_6, x);
-  float32x4_t x2 = vmulq_f32(x, x);
-  float32x4_t D = svget_neonq(svmad_f32_x(
-      svptrue_b8(),
-      svset_neonq(svundef_f32(), x),
-      svset_neonq(svundef_f32(), log_tab_8),
-      svset_neonq(svundef_f32(), log_tab_4)));
-  float32x4_t x4 = vmulq_f32(x2, x2);
-  float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
-  return res;
+#if SVLOG_ACCURACY == 2
+/// Vectorised natural-log approximation (variant selected by SVLOG_ACCURACY).
+/// No handling of x <= 0, inf or NaN is visible here; the caller in this file
+/// clamps the argument to at least k_eps before calling.
+/// NOTE(review): if SVLOG_ACCURACY is set to a value other than 0, 1 or 2 no
+/// svlog is defined at all — confirm that is the intended failure mode.
+static inline svfloat32_t svlog(svfloat32_t x) {
+  const svbool_t ptrue = svptrue_b8();
+
+  // 0x3F2AAAAB is the bit pattern of ~2/3. Subtracting it before splitting
+  // exponent and mantissa makes the reduced mantissa fall in about [2/3, 4/3).
+  svint32_t u = svreinterpret_s32(x) - 0x3F2AAAAB;
+
+  // r = reduced mantissa - 1, n = matching power-of-two exponent as a float.
+  svfloat32_t r = svreinterpret_f32((u & 0x007FFFFF) + 0x3F2AAAAB) - 1.0f;
+  svfloat32_t n = svcvt_f32_x(ptrue, u >> 23);
+  asm("" : "+w"(r)); // NOTE: can improve instruction scheduling.
+
+  // Result is n * ln(2) + r + r^2 * P(r); 0x1.62E43p-1f is ln(2) and p, q, y
+  // are linear-in-r pieces of P combined in powers of r^2.
+  svfloat32_t r2 = r * r;
+  svfloat32_t p = -0x1.4F9934p-3f + r * 0x1.5A9AA2p-3f;
+  svfloat32_t q = -0x1.00187Cp-2f + r * 0x1.961348p-3f;
+  svfloat32_t y = -0x1.FFFFC8p-2f + r * 0x1.555D7Cp-2f;
+  return (r + n * 0x1.62E43p-1f) +
+         (y + (q + (p + -0x1.3E737Cp-3f * r2) * r2) * r2) * r2;
 }
+#elif SVLOG_ACCURACY == 1
+/// Same reduction as the SVLOG_ACCURACY == 2 variant, with a shorter
+/// polynomial (three linear-in-r pieces A, B, C).
+static inline svfloat32_t svlog(svfloat32_t x) {
+  const svbool_t ptrue = svptrue_b8();

-inline float32x4_t vlogq_f32(float32x4_t x) {
-  const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+  svint32_t u = svreinterpret_s32(x) - 0x3F2AAAAB;

-  // Extract exponent
-  int32x4_t m = svget_neonq(svsub_n_s32_x(
-      svptrue_b8(),
-      svset_neonq(
-          svundef_s32(),
-          vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23))),
-      127));
-  float32x4_t val = vreinterpretq_f32_s32(
-      vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+  svfloat32_t r = svreinterpret_f32((u & 0x007FFFFF) + 0x3F2AAAAB) - 1.0f;
+  svfloat32_t n = svcvt_f32_x(ptrue, u >> 23);
+  asm("" : "+w"(r)); // NOTE: can improve instruction scheduling.

-  // Polynomial Approximation
-  float32x4_t poly = vtaylor_polyq_for_log_f32(val);
+  svfloat32_t r2 = r * r;
+  svfloat32_t A = -0x1.923814p-3f + r * 0x1.689E5Ep-3f;
+  svfloat32_t B = -0x1.FC0968p-3f + r * 0x1.93BF0Cp-3f;
+  svfloat32_t C = -0x1.000478p-1f + r * 0x1.556906p-2f;

-  // Reconstruct
-  poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+  return (r + n * 0x1.62E43p-1f) + (C + (B + A * r2) * r2) * r2;
+}
+#elif SVLOG_ACCURACY == 0
+/// "Original" variant: same coefficients as the removed
+/// vtaylor_polyq_for_log_f32 table, evaluated on the raw mantissa.
+static inline svfloat32_t svlog(svfloat32_t x) {
+  const svbool_t ptrue = svptrue_b8();
+
+  // u = (bits(x) >> 23) - 127, i.e. the unbiased exponent.
+  svint32_t u = svsra_n_s32(svdup_n_s32(-127), svreinterpret_s32(x), 23);
+
+  // n = exponent as float; r = x with that exponent removed from its bits.
+  svfloat32_t n = svcvt_f32_x(ptrue, u);
+  svfloat32_t r = svreinterpret_f32(svreinterpret_s32(x) - (u << 23));
+
+  svfloat32_t D = -0.165253549814f + r * 0.0141278216615f;
+  svfloat32_t C = -2.47071170807f + r * 0.844007015228f;
+  svfloat32_t B = -5.68692588806f + r * 4.58445882797f;
+  svfloat32_t A = -2.29561495781f + r * 5.17591238022f;
+
+  // Result is n * ln(2) + A + B*r^2 + C*r^4 + D*r^6 (A..D linear in r).
+  svfloat32_t r2 = r * r;
+  return (A + n * 0.6931471805f) + (B + (C + D * r2) * r2) * r2;
+}
+#endif
+
+/// Vectorised e^x. Range reduction: e^x = 2^n * e^r, where n is x / ln(2)
+/// rounded to an integer and r = x - n * ln(2); a short polynomial in r then
+/// approximates e^r - 1.
+/// NOTE(review): the SVEXP_SPECIAL_CLAMP == 0 path uses std::numeric_limits,
+/// but <limits> is not among the includes visible at the top of this file —
+/// confirm it is available transitively.
+static inline svfloat32_t svexp(svfloat32_t x) {
+  // Clamp interval set to prevent denormals!
+  // The bounds are approximately ln(FLT_MAX) and ln(2^-126).
+  // NOTE(review): inputs from about 127.5 * ln(2) up to max_input round to
+  // n = 128, which puts 255 in the exponent field of `scale` (+inf); the final
+  // `scale + (...) * scale` then looks like it can produce NaN when r < 0. The
+  // removed vexpq_f32 used 88.37f as its upper bound — confirm the new value.
+  const svfloat32_t max_input = svdup_n_f32(88.722839f);
+  const svfloat32_t min_input = svdup_n_f32(-87.33654f);
+  // shift = 2^23 + 127, see the use below.
+  const svfloat32_t shift = svdup_n_f32(0x1.0000FEp+23f);
+  const svbool_t ptrue = svptrue_b8();
+
+#if SVEXP_SPECIAL_CLAMP == 1
+  x = svmax_x(ptrue, svmin_x(ptrue, x, max_input), min_input);
+#endif
+
+  // z = shift + x * (1 / ln(2)); 0x1.715476p+0f is 1 / ln(2). Adding shift
+  // rounds away the fractional part and leaves n + 127 in the low mantissa
+  // bits of z, so `z << 23` is the bit pattern of 2^n.
+  svfloat32_t z = svmla_n_f32_x(ptrue, shift, x, 0x1.715476p+0f);
+  svfloat32_t n = z - shift;
+  svfloat32_t scale = svreinterpret_f32(svreinterpret_u32(z) << 23);
+
+  // ln(2) is subtracted as a high part (0x1.62E400p-1f) and a low part
+  // (0x1.7F7D1Cp-20f) to keep extra precision in r.
+  svfloat32_t r_hi = x - n * 0x1.62E400p-1f;
+  svfloat32_t r = r_hi - n * 0x1.7F7D1Cp-20f;
+  svfloat32_t r2 = r * r;
+
+  svfloat32_t C = 0x1.573E2Ep-5f + r * 0x1.0E4020p-7f;
+  svfloat32_t B = 0x1.FFFDB6p-2f + r * 0x1.555E66p-3f;
+  svfloat32_t A = r * 0x1.FFFFECp-1f;
+
+  // poly = scale * (1 + A + B*r^2 + C*r^4), with A, B, C as defined above.
+  svfloat32_t poly = scale + (A + (B + C * r2) * r2) * scale;
+
+#if SVEXP_SPECIAL_CLAMP == 0
+  // Explicit saturation instead of clamping: 0 below min_input, +inf above
+  // max_input.
+  const svfloat32_t inf = svdup_n_f32(std::numeric_limits<float>::infinity());
+  poly = svsel_f32(svcmplt_f32(ptrue, x, min_input), svdup_n_f32(0.0f), poly);
+  poly = svsel_f32(svcmpgt_f32(ptrue, x, max_input), inf, poly);
+#endif
+
  return poly;
 }

-inline float32x4_t vexpq_f32(float32x4_t x) {
-  const auto c1 = vreinterpretq_f32_u32(svget_neonq(svdup_n_u32(0x3f7ffff6)));
-  const auto c2 = vreinterpretq_f32_u32(svget_neonq(svdup_n_u32(0x3efffedb)));
-  const auto c3 = vreinterpretq_f32_u32(svget_neonq(svdup_n_u32(0x3e2aaf33)));
-  const auto c4 = vreinterpretq_f32_u32(svget_neonq(svdup_n_u32(0x3d2b9f17)));
-  const auto c5 = vreinterpretq_f32_u32(svget_neonq(svdup_n_u32(0x3c072010)));
-
-  const auto shift = vreinterpretq_f32_u32(
-      svget_neonq(svdup_n_u32(0x4b00007f))); // 2^23 + 127 = 0x1.0000fep23f
-  const auto inv_ln2 = vreinterpretq_f32_u32(
-      svget_neonq(svdup_n_u32(0x3fb8aa3b))); // 1 / ln(2) = 0x1.715476p+0f
-  const auto neg_ln2_hi = vreinterpretq_f32_u32(svget_neonq(
-      svdup_n_u32(0xbf317200))); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
-  const auto neg_ln2_lo = vreinterpretq_f32_u32(svget_neonq(svdup_n_u32(
-      0xb5bfbe8e))); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
-
-  const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity());
-  const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
-  const auto zero = svdup_n_f32(0.f);
-  const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
-
-  // Range reduction:
-  //   e^x = 2^n * e^r
-  // where:
-  //   n = floor(x / ln(2))
-  //   r = x - n * ln(2)
-  //
-  // By adding x / ln(2) with 2^23 + 127 (shift):
-  //   * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127
-  //   forces decimal part
-  //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n)
-  //     + 127 will occupy the whole fraction part of z in FP32 format.
-  //     Subtracting 2^23 + 127 (shift) from z will result in the integer part
-  //     of x / ln(2) (i.e. n) because the decimal part has been pushed out and
-  //     lost.
-  //   * The addition of 127 makes the FP32 fraction part of z ready to be used
-  //   as the exponent
-  //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
-  const auto z = vfmaq_f32(shift, x, inv_ln2);
-  const auto n = z - shift;
-  const auto scale =
-      vreinterpretq_f32_u32(vreinterpretq_u32_f32(z) << 23); // 2^n
-
-  // The calculation of n * ln(2) is done using 2 steps to achieve accuracy
-  // beyond FP32. This outperforms longer Taylor series (3-4 tabs) both in term
-  // of accuracy and performance.
-  const auto r_hi = vfmaq_f32(x, n, neg_ln2_hi);
-  const auto r = vfmaq_f32(r_hi, n, neg_ln2_lo);
-
-  // Compute the truncated Taylor series of e^r.
-  //   poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
-  const auto r2 = r * r;
-
-  const auto p1 = c1 * r;
-  const auto p23 = vfmaq_f32(c2, c3, r);
-  const auto p45 = vfmaq_f32(c4, c5, r);
-  const auto p2345 = vfmaq_f32(p23, p45, r2);
-  const auto p12345 = vfmaq_f32(p1, p2345, r2);
-
-  auto poly = svset_neonq(svundef_f32(), vfmaq_f32(scale, p12345, scale));
-
-  auto pHigh = svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input);
-  auto pLow = svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input);
-
-  auto bound = svsel_f32(
-      pHigh,
-      inf,
-      zero);
-
-  auto pCombined = svorr_b_z(svptrue_b8(), pLow, pHigh);
-
-  // Handle underflow and overflow.
-  poly = svsel_f32(
-      pCombined,
-      bound,
-      poly);
-
-  return svget_neonq(poly);
-}
-
-// ln(x) = log2(x) * ln(2)
-// pow(x, n) = exp(n * ln(x))
-inline float32x4_t compute_batch_box_cox_vec_sve128_float(
+/// Box-Cox transform of one vector of lanes.
+/// With s = max(data + lambda2, k_eps), each lane yields ln(s) where
+/// lambda1 == 0 and (s^lambda1 - 1) / lambda1 otherwise, using
+/// s^lambda1 = exp(lambda1 * ln(s)).
+static inline svfloat32_t compute_batch_box_cox_vec_sve128_float(
    svfloat32_t lambda1_v,
    svfloat32_t lambda2_v,
    svfloat32_t data_v,
    svfloat32_t k_eps) {
-  // sum_v = lambda2_v + data_v
-  float32x4_t sum_v = vaddq_f32(svget_neonq(data_v), svget_neonq(lambda2_v));
+  const svbool_t ptrue = svptrue_b8();

-  // test lambda1_v: predNZ == 1 iff lambda1_v != 0
-  svbool_t predNZ = svcmpne_n_f32(svptrue_b8(), lambda1_v, 0.0f);
-
-  // clamp sum_v: sum_v = max(sum_v, k_eps)
-  sum_v = vmaxq_f32(sum_v, svget_neonq(k_eps));
-
-  // lnData = log(sum_v)
-  svfloat32_t lnData = svset_neonq(svundef_f32(), vlogq_f32(sum_v));
-
-  // if any lambda1 != 0, compute pow(sum_v, lambda1) using lnData
-  // pow(sum_v, lambda1) == exp(lambda1 * ln(sum_v))
+  // lnData = ln(max(data + lambda2, k_eps)); predNZ marks lanes with
+  // lambda1 != 0.
+  svfloat32_t lnData = svlog(svmax_x(ptrue, data_v + lambda2_v, k_eps));
+  svbool_t predNZ = svcmpne_n_f32(ptrue, lambda1_v, 0.0f);
  if (C10_LIKELY(svptest_any(predNZ, predNZ))) {
-    // mult = lambda1 * ln(sum_v)
-    float32x4_t mult = vmulq_f32(svget_neonq(lnData), svget_neonq(lambda1_v));
-
-    // lambda1_r = 1 / lambda1
    svfloat32_t lambda1_r = svdivr_f32_m(predNZ, lambda1_v, svdup_n_f32(1.0f));
-
-    // pow = exp(mult)
-    float32x4_t pow = vexpq_f32(mult);
-
-    // merge results
-    // lnData if lambda1 == 0, (lambda1_r * pow - lambda1_r) if lambda1 != 0
+    // lambda1_r = 1 / lambda1 on predNZ lanes; pow = exp(lambda1 * lnData).
+    svfloat32_t pow = svexp(lnData * lambda1_v);
    lnData = svsel_f32(predNZ, lambda1_r, lnData);
-    lnData =
-        svnmsb_f32_m(predNZ, lnData, svset_neonq(svundef_f32(), pow), lnData);
+    // Merge: lnData stays where lambda1 == 0 and becomes
+    // (lambda1_r * pow - lambda1_r) where lambda1 != 0.
+    lnData = svnmsb_f32_m(predNZ, lnData, pow, lnData);
  }
-  return svget_neonq(lnData);
+  return lnData;
 }

 template <typename T>
@ -186,11 +137,11 @@ template <>
 void compute_batch_box_cox_vec_sve128(
    std::size_t N,
    std::size_t D,
-    const float* data_ptr,
-    const float* __restrict lambda1_ptr,
-    const float* __restrict lambda2_ptr,
-    float* output_ptr) {
-  svfloat32_t k_eps = svdup_n_f32(static_cast<float>(1e-6));
+    const float *data_ptr,
+    const float *__restrict lambda1_ptr,
+    const float *__restrict lambda2_ptr,
+    float *output_ptr) {
+  const svfloat32_t k_eps = svdup_n_f32(static_cast<float>(1e-6));

  std::size_t remainder = D % 4;
  std::size_t loopBound = D - remainder;
@ -204,17 +155,17 @@ void compute_batch_box_cox_vec_sve128(
      svfloat32_t lambda2_v =
          svset_neonq(svundef_f32(), vld1q_f32(lambda2_ptr + j));
      svfloat32_t data_v = svset_neonq(svundef_f32(), vld1q_f32(data_ptr));
-      float32x4_t result = compute_batch_box_cox_vec_sve128_float(
+      svfloat32_t result = compute_batch_box_cox_vec_sve128_float(
          lambda1_v, lambda2_v, data_v, k_eps);
-      vst1q_f32(output_ptr, result);
+      vst1q_f32(output_ptr, svget_neonq(result));
    }
    if (C10_LIKELY(remainder > 0)) {
      svfloat32_t lambda1_v = svld1_f32(remainderPred, lambda1_ptr + loopBound);
      svfloat32_t lambda2_v = svld1_f32(remainderPred, lambda2_ptr + loopBound);
      svfloat32_t data_v = svld1_f32(remainderPred, data_ptr);
-      float32x4_t result = compute_batch_box_cox_vec_sve128_float(
+      svfloat32_t result = compute_batch_box_cox_vec_sve128_float(
          lambda1_v, lambda2_v, data_v, k_eps);
-      svst1_f32(remainderPred, output_ptr, svset_neonq(svundef_f32(), result));
+      svst1_f32(remainderPred, output_ptr, result);
      data_ptr += remainder;
      output_ptr += remainder;
    }
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@ -1541,6 +1541,11 @@ if(NOT INTERN_BUILD_MOBILE)
    if(HAVE_MALLOC_USABLE_SIZE)
      add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1)
    endif(HAVE_MALLOC_USABLE_SIZE)
+    # Probe for posix_fallocate() and, when it is found, expose
+    # HAVE_POSIX_FALLOCATE=1 as a compile definition.
+    # NOTE(review): CMAKE_EXTRA_INCLUDE_FILES stays set after this check, and the
+    # CMake docs list it only as an input of CheckTypeSize — confirm it is needed
+    # for CHECK_FUNCTION_EXISTS here.
+    set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h")
+    CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE)
+    if(HAVE_POSIX_FALLOCATE)
+      add_definitions(-DHAVE_POSIX_FALLOCATE=1)
+    endif(HAVE_POSIX_FALLOCATE)
  endif(UNIX)

  add_definitions(-DUSE_EXTERNAL_MZCRC)
--- a/docs/source/notes/randomness.rst
+++ b/docs/source/notes/randomness.rst
@ -125,10 +125,6 @@ deterministic implementation will be used::
            [[ 0.1509,  1.8027],
             [ 0.0333, -1.1444]]], device='cuda:0')

-Furthermore, if you are using CUDA tensors, and your CUDA version is 10.2 or greater, you
-should set the environment variable `CUBLAS_WORKSPACE_CONFIG` according to CUDA documentation:
-`<https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility>`_
-
 CUDA convolution determinism
 ----------------------------
 While disabling CUDA convolution benchmarking (discussed above) ensures that
--- a/docs/source/torch.rst
+++ b/docs/source/torch.rst
@ -146,7 +146,7 @@ Indexing, Slicing, Joining, Mutating Ops
 Accelerators
 ----------------------------------
 Within the PyTorch repo, we define an "Accelerator" as a :class:`torch.device` that is being used
-alongside a CPU to speed up computation. These device use an asynchronous execution scheme,
+alongside a CPU to speed up computation. These devices use an asynchronous execution scheme,
 using :class:`torch.Stream` and :class:`torch.Event` as their main way to perform synchronization.
 We also assume that only one such accelerator can be available at once on a given host. This allows
 us to use the current accelerator as the default device for relevant concepts such as pinned memory,
--- a/pyrefly.toml
+++ b/pyrefly.toml
@ -40,16 +40,10 @@ project-excludes = [
  "torch/autograd/**",
  "torch/cuda/**",
  "torch/export/**",
-  "torch/profiler/**",
-  "torch/_prims_common/**",
-  "torch/backends/**",
-  "torch/testing/**",
-  "torch/_C/**",
-  "torch/sparse/**",
-  "torch/_library/**",
-  "torch/_prims/**",
-  "torch/_decomp/**",
-  "torch/_meta_registrations.py",
+  # formatting issues
+  "torch/linalg/__init__.py",
+  "torch/package/importer.py",
+  "torch/package/_package_pickler.py",
  # ====
  "benchmarks/instruction_counts/main.py",
  "benchmarks/instruction_counts/definitions/setup.py",
--- a/test/conftest.py
+++ b/test/conftest.py
@ -21,6 +21,16 @@ from _pytest.terminal import _get_raw_skip_reason
 from pytest_shard_custom import pytest_addoptions as shard_addoptions, PytestShardPlugin


+try:
+    from torch.testing._internal.common_utils import parse_cmd_line_args
+except ImportError:
+    # Temporary workaround, needed until parse_cmd_line_args makes it into a nightly:
+    # main / PR tests are sometimes run against the previous day's nightly, which won't
+    # have this function. The stand-in below is a no-op, so pytest_configure can call
+    # it unconditionally.
+    def parse_cmd_line_args():
+        pass
+
+
 if TYPE_CHECKING:
    from _pytest._code.code import ReprFileLocation

@ -83,6 +93,7 @@ def pytest_addoption(parser: Parser) -> None:


 def pytest_configure(config: Config) -> None:
+    parse_cmd_line_args()
    xmlpath = config.option.xmlpath_reruns
    # Prevent opening xmllog on worker nodes (xdist).
    if xmlpath and not hasattr(config, "workerinput"):
--- a/test/distributed/_composable/test_composability/test_pp_composability.py
+++ b/test/distributed/_composable/test_composability/test_pp_composability.py
@ -1,4 +1,5 @@
 # Owner(s): ["oncall: distributed"]
+import copy
 import os
 from typing import TYPE_CHECKING

@ -6,6 +7,7 @@ import torch
 import torch.distributed.checkpoint as dcp
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.distributed._composable.replicate_with_fsdp import replicate
 from torch.distributed.checkpoint import FileSystemReader
 from torch.distributed.checkpoint.default_planner import _EmptyStateDictLoadPlanner
 from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
@ -366,6 +368,242 @@ class ComposabilityTest(MultiProcessTestCase):

        torch.distributed.destroy_process_group()

+    @requires_accelerator_dist_backend(["nccl", "xccl"])
+    @skip_if_lt_x_gpu(8)
+    @skip_but_pass_in_sandcastle_if(
+        not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
+    )
+    @parametrize(
+        "ScheduleClass",
+        [
+            ScheduleGPipe,
+            Schedule1F1B,
+            ScheduleInterleaved1F1B,
+            ScheduleLoopedBFS,
+            ScheduleInterleavedZeroBubble,
+        ],
+    )
+    @parametrize(
+        "MixedPrecisionParam",
+        [
+            torch.bfloat16,
+            torch.float32,
+        ],
+    )
+    def test_replicate_pp(self, ScheduleClass, MixedPrecisionParam):
+        # Trains a replicate()-wrapped pipeline and an unwrapped reference pipeline
+        # side by side for five steps; on the last pp rank the losses collected by
+        # the two schedules are asserted equal pairwise.
+        # NOTE(review): _device_raii is not referenced again in this test.
+        _device_raii = torch.device(device_type, self.device)
+        torch.accelerator.set_device_index(self.device)
+        store = torch.distributed.FileStore(self.file_name, self.world_size)
+        torch.distributed.init_process_group(
+            backend=backend,
+            store=store,
+            rank=self.rank,
+            world_size=self.world_size,
+        )
+        dim = 8
+        pp_size = 2
+        num_microbatches = 8
+        replicate_size = self.world_size // (pp_size)
+        # 3-D mesh (replicate, shard, pp) with the shard dimension fixed to size 1.
+        device_mesh = init_device_mesh(
+            device_type,
+            mesh_shape=(replicate_size, 1, pp_size),
+            mesh_dim_names=("replicate", "shard", "pp"),
+        )
+        torch.manual_seed(42)
+        dp_mesh = device_mesh["replicate", "shard"]
+        pp_mesh = device_mesh["pp"]
+        pp_group = device_mesh["pp"].get_group()
+
+        # create "entire model"
+        total_layers = 8
+        full_model = nn.ModuleList([MLPModule(dim) for _ in range(total_layers)])
+        # Deep copy so the reference pipeline starts from identical weights.
+        ref_full_model = copy.deepcopy(full_model)
+
+        # dummy loss needed just to force backwards to run in schedule step
+        # (`target` is accepted but not used)
+        def loss_fn(y, target):
+            return y.sum()
+
+        # Apply DP to stage module
+        def apply_replicate(partial_model):
+            # apply replicate
+            mp_policy = MixedPrecisionPolicy(
+                param_dtype=MixedPrecisionParam,
+                reduce_dtype=torch.float32,
+            )
+            replicate_config = {"mp_policy": mp_policy}
+            # Wrap each layer individually first, then the enclosing module.
+            for layer_id in range(len(partial_model)):
+                replicate(
+                    partial_model[layer_id],
+                    device_mesh=dp_mesh,
+                    **replicate_config,
+                    reshard_after_forward=False,
+                )
+            dp_model = replicate(partial_model, device_mesh=dp_mesh, **replicate_config)
+            return dp_model
+
+        # Apply same precision to reference model (without replicate)
+        def apply_same_precision(partial_model):
+            if MixedPrecisionParam != torch.float32:
+                # Cast to same precision as pipeline model
+                partial_model = partial_model.to(dtype=MixedPrecisionParam)
+            return partial_model
+
+        # Attach to a schedule
+        if issubclass(ScheduleClass, PipelineScheduleSingle):
+            # Single-stage schedules: one stage per pp rank, two layers per stage.
+            stage_idx = pp_group.rank()
+            partial_model = nn.Sequential(
+                *full_model[stage_idx * 2 : stage_idx * 2 + 2]
+            )
+            partial_model.to(self.device)
+
+            dp_model = apply_replicate(partial_model)
+            pipeline_stage = PipelineStage(
+                dp_model,
+                stage_idx,
+                pp_group.size(),
+                self.device,
+                group=pp_group,
+            )
+            partial_models = [pipeline_stage.submod]
+            pipeline_schedule = ScheduleClass(
+                pipeline_stage,
+                n_microbatches=num_microbatches,
+                loss_fn=loss_fn,
+                scale_grads=False,
+            )
+
+            ref_partial_model = nn.Sequential(
+                *ref_full_model[stage_idx * 2 : stage_idx * 2 + 2]
+            )
+            ref_partial_model.to(self.device)
+            ref_partial_model = apply_same_precision(
+                ref_partial_model
+            )  # Apply same precision
+
+            ref_pipeline_stage = PipelineStage(
+                ref_partial_model,
+                stage_idx,
+                pp_group.size(),
+                self.device,
+                group=pp_group,
+            )
+            ref_partial_models = [ref_pipeline_stage.submod]
+            ref_pipeline_schedule = ScheduleClass(
+                ref_pipeline_stage,
+                n_microbatches=num_microbatches,
+                loss_fn=loss_fn,
+                scale_grads=False,
+            )
+        else:
+            # Multi-stage schedules: n_virtual stages per pp rank, one layer each.
+            n_virtual = 2
+            num_stages = pp_group.size() * n_virtual
+            stages = []
+            ref_stages = []
+            for i in range(n_virtual):
+                # NOTE(review): the stride is n_virtual; it matches pp_group.size()
+                # only because both are 2 in this test — confirm intent.
+                stage_idx = pp_group.rank() + n_virtual * i
+                # divide the model layers by the number of stages
+                partial_model = nn.Sequential(*full_model[stage_idx : stage_idx + 1])
+                partial_model.to(self.device)
+
+                dp_model = apply_replicate(partial_model)
+                stage = PipelineStage(
+                    dp_model,
+                    stage_idx,
+                    num_stages,
+                    self.device,
+                    group=pp_group,
+                )
+
+                stages.append(stage)
+                # Rebuilt on every iteration; the value left after the final
+                # iteration is what the optimizers below are built from.
+                partial_models = [pipeline_stage.submod for pipeline_stage in stages]
+
+                ref_partial_model = nn.Sequential(
+                    *ref_full_model[stage_idx : stage_idx + 1]
+                )
+                ref_partial_model.to(self.device)
+                ref_partial_model = apply_same_precision(
+                    ref_partial_model
+                )  # Apply same precision
+
+                ref_stage = PipelineStage(
+                    ref_partial_model,
+                    stage_idx,
+                    num_stages,
+                    self.device,
+                    group=pp_group,
+                )
+
+                ref_stages.append(ref_stage)
+                ref_partial_models = [
+                    pipeline_stage.submod for pipeline_stage in ref_stages
+                ]
+            pipeline_schedule = ScheduleClass(
+                stages,
+                n_microbatches=num_microbatches,
+                loss_fn=loss_fn,
+                scale_grads=False,
+            )
+
+            ref_pipeline_schedule = ScheduleClass(
+                ref_stages,
+                n_microbatches=num_microbatches,
+                loss_fn=loss_fn,
+                scale_grads=False,
+            )
+
+        # Same AdamW settings for both pipelines so the updates stay comparable.
+        optimizer_kwargs = {
+            "lr": 0.01,
+            "betas": (0.9, 0.95),
+            "weight_decay": 0.1,
+            "fused": False,
+            "foreach": True,
+        }
+
+        optimizers = [
+            torch.optim.AdamW(model.parameters(), **optimizer_kwargs)
+            for model in partial_models
+        ]
+
+        ref_optimizers = [
+            torch.optim.AdamW(model.parameters(), **optimizer_kwargs)
+            for model in ref_partial_models
+        ]
+
+        for train_step in range(5):
+            for optimizer in optimizers:
+                optimizer.zero_grad()
+            for ref_optimizer in ref_optimizers:
+                ref_optimizer.zero_grad()
+
+            inputs = torch.rand(
+                (num_microbatches, dim), device=self.device, dtype=MixedPrecisionParam
+            )
+            labels = torch.rand(
+                (num_microbatches, dim), device=self.device, dtype=MixedPrecisionParam
+            )
+            is_last_stage = pp_mesh.get_local_rank() == pp_mesh.size() - 1
+            # First pp rank feeds inputs, last pp rank supplies targets and
+            # collects losses, any other rank steps without arguments.
+            if pp_mesh.get_local_rank() == 0:
+                pipeline_schedule.step(inputs)
+                ref_pipeline_schedule.step(inputs)
+            elif is_last_stage:
+                losses = []
+                ref_losses = []
+                pipeline_schedule.step(target=labels, losses=losses)
+                ref_pipeline_schedule.step(target=labels, losses=ref_losses)
+
+                for loss, ref_loss in zip(losses, ref_losses):
+                    self.assertEqual(loss, ref_loss)
+            else:
+                pipeline_schedule.step()
+                ref_pipeline_schedule.step()
+
+            for optimizer in optimizers:
+                optimizer.step()
+            for ref_optimizer in ref_optimizers:
+                ref_optimizer.step()
+
+        torch.distributed.destroy_process_group()
+

 instantiate_parametrized_tests(ComposabilityTest)

--- a/test/distributed/optim/test_apply_optimizer_in_backward.py
+++ b/test/distributed/optim/test_apply_optimizer_in_backward.py
@ -39,7 +39,7 @@ class ApplyOverlappedOptimizerTest(unittest.TestCase):
            with self.subTest(i):
                _validate_params(
                    [model.parameters() for model in models],
-                    torch.testing.assert_allclose,
+                    torch.testing.assert_close,
                )

            for opt in optimizers:
@ -77,7 +77,7 @@ class ApplyOverlappedOptimizerTest(unittest.TestCase):
                model.parameters(),
                model_with_opt_in_bwd.parameters(),
            ],
-            torch.testing.assert_allclose,
+            torch.testing.assert_close,
        )

        self._run_training_loop_and_validate(
@ -113,10 +113,10 @@ class ApplyOverlappedOptimizerTest(unittest.TestCase):

        for p1, p2 in zip(model_with_hook.parameters(), initial_model.parameters()):
            with self.assertRaises(AssertionError):
-                torch.testing.assert_allclose(p1, p2)
+                torch.testing.assert_close(p1, p2)

        for p1, p2 in zip(model_no_hook.parameters(), initial_model.parameters()):
-            torch.testing.assert_allclose(p1, p2)
+            torch.testing.assert_close(p1, p2)

    def test_multiple_optim_for_params(self) -> None:
        model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 10))
--- a/test/distributed/pipelining/model_registry.py
+++ b/test/distributed/pipelining/model_registry.py
@ -8,7 +8,7 @@ from torch.distributed.pipelining import pipe_split, SplitPoint

 class ExampleCode(torch.nn.Module):
    def __init__(self, d_hid, splits=2):
-        assert splits <= 4
+        assert splits <= 8
        super().__init__()
        self.splits = splits
        self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid))
@ -17,6 +17,10 @@ class ExampleCode(torch.nn.Module):
        self.lin0 = torch.nn.Linear(d_hid, d_hid)
        self.lin1 = torch.nn.Linear(d_hid, d_hid)
        self.lin2 = torch.nn.Linear(d_hid, d_hid)
+        self.lin3 = torch.nn.Linear(d_hid, d_hid)
+        self.lin4 = torch.nn.Linear(d_hid, d_hid)
+        self.lin5 = torch.nn.Linear(d_hid, d_hid)
+        self.lin6 = torch.nn.Linear(d_hid, d_hid)

    def forward(self, x):
        x = torch.mm(x, self.mm_param0)
@ -35,6 +39,22 @@ class ExampleCode(torch.nn.Module):
            pipe_split()
            x = self.lin2(x)
            x = torch.relu(x)
+        if self.splits > 4:
+            pipe_split()
+            x = self.lin3(x)
+            x = torch.relu(x)
+        if self.splits > 5:
+            pipe_split()
+            x = self.lin4(x)
+            x = torch.relu(x)
+        if self.splits > 6:
+            pipe_split()
+            x = self.lin5(x)
+            x = torch.relu(x)
+        if self.splits > 7:
+            pipe_split()
+            x = self.lin6(x)
+            x = torch.relu(x)
        return x


@ -43,7 +63,7 @@ class ModelWithKwargs(torch.nn.Module):
    DEFAULT_BATCH_SIZE = 256

    def __init__(self, d_hid: int = DEFAULT_DHID, splits=2):
-        assert splits <= 4
+        assert splits <= 8
        super().__init__()
        self.splits = splits
        self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid))
@ -52,6 +72,10 @@ class ModelWithKwargs(torch.nn.Module):
        self.lin1 = torch.nn.Linear(d_hid, d_hid)
        self.lin2 = torch.nn.Linear(d_hid, d_hid)
        self.lin3 = torch.nn.Linear(d_hid, d_hid)
+        self.lin4 = torch.nn.Linear(d_hid, d_hid)
+        self.lin5 = torch.nn.Linear(d_hid, d_hid)
+        self.lin6 = torch.nn.Linear(d_hid, d_hid)
+        self.lin7 = torch.nn.Linear(d_hid, d_hid)

    def forward(self, x, y=torch.zeros(DEFAULT_BATCH_SIZE, DEFAULT_DHID)):
        x = torch.mm(x, self.mm_param0)
@ -70,6 +94,22 @@ class ModelWithKwargs(torch.nn.Module):
            pipe_split()
            x = self.lin3(x)
            x = torch.relu(x)
+        if self.splits > 4:
+            pipe_split()
+            x = self.lin4(x)
+            x = torch.relu(x)
+        if self.splits > 5:
+            pipe_split()
+            x = self.lin5(x)
+            x = torch.relu(x)
+        if self.splits > 6:
+            pipe_split()
+            x = self.lin6(x)
+            x = torch.relu(x)
+        if self.splits > 7:
+            pipe_split()
+            x = self.lin7(x)
+            x = torch.relu(x)
        return x


--- a/test/distributed/pipelining/test_stage.py
+++ b/test/distributed/pipelining/test_stage.py
@ -30,7 +30,7 @@ from torch.utils._pytree import tree_map_only

 d_hid = 512
 batch_size = 256
-chunks = 4
+chunks = 8

 device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
 backend = dist.get_default_backend_for_device(device_type)
--- a/test/distributed/tensor/debug/test_debug_mode.py
+++ b/test/distributed/tensor/debug/test_debug_mode.py
@ -253,11 +253,24 @@ class TestDTensorDebugMode(TestCase):
        x = torch.randn(1, 8, requires_grad=True)

        with DebugMode(record_torchfunction=True) as debug_mode:
-            torch.cond(torch.tensor(True), lambda x: x + 1, lambda x: x - 1, [x])
+            # rewrite torch.cond as torch.ops.higher_order.cond to avoid compilation
+            torch.ops.higher_order.cond(
+                torch.tensor(True), lambda x: x + 1, lambda x: x - 1, (x,)
+            )

        # Verify that cond operations are captured in debug mode
        self.assertIn("torch.ops.higher_order.cond", debug_mode.debug_string())

+    def test_compile(self):
+        # Calling a torch.compile'd function under DebugMode is expected to leave
+        # the recorded debug string empty.
+        @torch.compile
+        def f(x):
+            return x.sin().cos()
+
+        x = torch.randn(8)
+        with DebugMode() as debug_mode:
+            f(x)
+        self.assertEqual(len(debug_mode.debug_string()), 0)
+

 instantiate_parametrized_tests(TestDTensorDebugMode)

--- a/test/distributed/tensor/test_dtensor.py
+++ b/test/distributed/tensor/test_dtensor.py
@ -170,9 +170,9 @@ class DTensorTest(DTensorTestBase):
    @with_comms
    def test_from_local(self):
        device_mesh = self.build_device_mesh()
-        placements = [Shard(0)]
+        shard_spec = [Shard(0)]
        local_tensor = torch.randn(3, 3)
-        sharded_tensor = DTensor.from_local(local_tensor, device_mesh, placements)
+        sharded_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec)
        self.assertEqual(sharded_tensor.size(), torch.Size([self.world_size * 3, 3]))

        replica_spec = [Replicate()]
@ -189,14 +189,14 @@ class DTensorTest(DTensorTestBase):
        local_tensor_temp = local_tensor_with_grad * 3
        # create the dist tensor with non leaf local tensor, dist tensor created
        # should also be non leaf node
-        dist_tensor = DTensor.from_local(local_tensor_temp, device_mesh, placements)
+        dist_tensor = DTensor.from_local(local_tensor_temp, device_mesh, shard_spec)
        self.assertFalse(dist_tensor.is_leaf)
        # do some random operations on dist tensor
        output = dist_tensor * 3
        self.assertIsInstance(output, DTensor)
        # trigger .backward() on dist tensor directly
        local_grad = torch.ones(3, 3)
-        grad_output = DTensor.from_local(local_grad, device_mesh, placements)
+        grad_output = DTensor.from_local(local_grad, device_mesh, shard_spec)
        # run backward directly on dist tensor
        output.backward(grad_output)
        # check it gradients flow back to original torch.Tensor
@ -204,6 +204,16 @@ class DTensorTest(DTensorTestBase):
        expected_grad = torch.ones(3, 3) * 9
        self.assertEqual(local_tensor_with_grad.grad, expected_grad)

+        # DTensor.from_local should raise an error if the `local_tensor`
+        # argument is a DTensor
+        local_tensor = torch.ones(2, 2)
+        dtensor = DTensor.from_local(local_tensor, device_mesh, shard_spec)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "the local_tensor argument only accepts torch.Tensor"
+        ):
+            DTensor.from_local(dtensor, device_mesh, shard_spec)
+
    @with_comms
    def test_from_local_uneven_sharding(self):
        device_mesh = self.build_device_mesh()
@ -870,6 +880,19 @@ class DTensorMeshTest(DTensorTestBase):
        local_expected = expected.to_local()
        self.assertEqual(local_result, local_expected)

+    @unittest.expectedFailure
+    @with_comms
+    def test_inplace_on_local_tensor_view(self):
+        mesh = self.build_device_mesh()
+        seq = 8
+        vocab = 16
+        leaf = torch.randn((seq, vocab), device=self.device_type, requires_grad=True)
+        dtensor_leaf = DTensor.from_local(leaf, mesh, [Shard(1)])
+        dtensor_vocab_parallel_logits = dtensor_leaf * 2  # make this non-leaf
+        vocab_parallel_logits = dtensor_vocab_parallel_logits.to_local()
+        logits_max = torch.randn(seq, device=self.device_type)
+        vocab_parallel_logits -= logits_max.unsqueeze(dim=1)
+
    @with_comms
    def test_auto_implicit_replication(self):
        mesh = self.build_device_mesh()
--- a/test/distributed/tensor/test_dtensor_compile.py
+++ b/test/distributed/tensor/test_dtensor_compile.py
@ -388,6 +388,47 @@ def forward(self, b_parametrizations_buffer_original0, x):
        res = opt_fn(x, y)
        self.assertEqual(res, ref)

+    def test_dtensor_dynamic_recompiles(self):
+        cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        def inp(*shape):
+            param = torch.randn(*shape, requires_grad=True)
+            x = DTensor.from_local(param, mesh, [Shard(0)], run_check=False)
+            torch._dynamo.mark_dynamic(x, 0)
+            torch._dynamo.mark_dynamic(x, 1)
+            return x
+
+        def run(func, *shape):
+            res = func(inp(*shape))
+            res.sum().backward()
+
+        @torch.compile(backend=cnt, fullgraph=True)
+        def f(x):
+            y = x * x
+            return y.to_local()
+
+        run(f, 4, 4)
+        run(f, 6, 8)
+        run(f, 10, 10)
+        self.assertEqual(cnt.frame_count, 1)
+
+        # sanity check that shape guard recompiles are still handled
+        @torch.compile(backend=cnt, fullgraph=True)
+        def g(x):
+            if x.size(0) <= 16:
+                y = x * x
+            else:
+                y = x + x
+            return y.to_local()
+
+        cnt.clear()
+        run(g, 4, 4)
+        run(g, 8, 8)
+        self.assertEqual(cnt.frame_count, 1)
+        run(g, 64, 8)
+        self.assertEqual(cnt.frame_count, 2)
+
    def test_dtensor_attribute_access_on_intermediate(self):
        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))

--- a/test/distributed/tensor/test_dtensor_export.py
+++ b/test/distributed/tensor/test_dtensor_export.py
@ -5,7 +5,6 @@ import unittest

 import torch
 import torch.distributed as dist
-import torch.fx.traceback as fx_traceback
 from torch._dynamo.functional_export import _dynamo_graph_capture_for_export
 from torch._functorch.aot_autograd import aot_export_joint_with_descriptors
 from torch._functorch.partitioners import min_cut_rematerialization_partition
@ -38,18 +37,6 @@ class SimpleModel(torch.nn.Module):
        return self.mlp_1(self.mlp_0(input))


-class SimpleModelAnnotated(torch.nn.Module):
-    def __init__(self, device):
-        super().__init__()
-        self.mlp_0 = MLPModule(device)
-        self.mlp_1 = MLPModule(device)
-
-    def forward(self, input):
-        with fx_traceback.annotate({"pp_stage": 0}):
-            x = self.mlp_0(input)
-        return self.mlp_1(x)
-
-
 def strict_export_and_aot_export_joint_with_descriptors(model, inputs):
    # needed for stric export
    torch.utils._pytree.register_constant(DTensorSpec)
@ -103,7 +90,7 @@ class DTensorExportTest(TestCase):
        )
        self.device_type = "cuda"

-    def _run_test(self, export_fn, test_annotation=False):
+    def _run_test(self, export_fn):
        dp_degree = 2
        tp_degree = self.world_size // dp_degree

@ -114,11 +101,7 @@ class DTensorExportTest(TestCase):
            mesh_dim_names=["dp", "tp"],
        )

-        model = None
-        if test_annotation:
-            model = SimpleModelAnnotated(self.device_type)
-        else:
-            model = SimpleModel(self.device_type)
+        model = SimpleModel(self.device_type)
        parallelize_plan = {
            "mlp_0.net1": ColwiseParallel(),
            "mlp_0.net2": RowwiseParallel(),
@ -148,116 +131,6 @@ class DTensorExportTest(TestCase):
            1,
        )

-        if test_annotation:
-
-            def has_tag(node):
-                return "custom" in node.meta and node.meta["custom"] == {"pp_stage": 0}
-
-            def marked_nodes(gm):
-                return [
-                    node.name
-                    for node in gm.graph.nodes
-                    if has_tag(node) and node.op == "call_function"
-                ]
-
-            def unmarked_nodes(gm):
-                return [
-                    node.name
-                    for node in gm.graph.nodes
-                    if not has_tag(node) and node.op == "call_function"
-                ]
-
-            marked_nodes_fw = [
-                "t",
-                "addmm",
-                "view",
-                "relu",
-                "view_1",
-                "t_1",
-                "div",
-                "addmm_1",
-                "all_reduce",
-                "wait_tensor",
-                "view_2",
-                "t_12",
-            ]
-            unmarked_nodes_fw = [
-                "view_3",
-                "t_2",
-                "addmm_2",
-                "view_4",
-                "relu_1",
-                "view_5",
-                "t_3",
-                "div_1",
-                "addmm_3",
-                "all_reduce_1",
-                "wait_tensor_1",
-                "view_6",
-                "t_4",
-                "t_8",
-            ]
-
-            marked_nodes_bw = [
-                "mm_4",
-                "t_13",
-                "view_1",
-                "mm_5",
-                "t_14",
-                "sum_3",
-                "view_9",
-                "t_15",
-                "detach",
-                "detach_1",
-                "detach_6",
-                "detach_7",
-                "threshold_backward_1",
-                "t_16",
-                "mm_6",
-                "t_17",
-                "sum_4",
-                "view_10",
-                "t_18",
-            ]
-            unmarked_nodes_bw = [
-                "mm",
-                "t_5",
-                "view_5",
-                "mm_1",
-                "t_6",
-                "sum_1",
-                "view_7",
-                "t_7",
-                "detach_2",
-                "detach_3",
-                "detach_4",
-                "detach_5",
-                "threshold_backward",
-                "mm_2",
-                "t_9",
-                "mm_3",
-                "t_10",
-                "sum_2",
-                "view_8",
-                "t_11",
-                "all_reduce_2",
-                "wait_tensor_2",
-            ]
-
-            self.assertEqual(marked_nodes(fw_gm), marked_nodes_fw)
-            self.assertEqual(unmarked_nodes(fw_gm), unmarked_nodes_fw)
-
-            self.assertEqual(marked_nodes(bw_gm), marked_nodes_bw)
-            self.assertEqual(unmarked_nodes(bw_gm), unmarked_nodes_bw)
-
-            self.assertEqual(
-                set(marked_nodes(joint_gm)), set(marked_nodes_fw + marked_nodes_bw)
-            )
-            self.assertEqual(
-                set(unmarked_nodes(joint_gm)),
-                set(unmarked_nodes_fw + unmarked_nodes_bw),
-            )
-
    @parametrize(
        "export_fn",
        [
@ -277,9 +150,6 @@ class DTensorExportTest(TestCase):
    def test_strict_export_parallelize_module_with_dtensor_input(self):
        self._run_test(strict_export_and_aot_export_joint_with_descriptors)

-    def test_annotate_aot_export_joint_with_descriptors_alone(self):
-        self._run_test(aot_export_joint_with_descriptors_alone, True)
-

 instantiate_parametrized_tests(DTensorExportTest)

--- a/test/distributed/test_data_parallel.py
+++ b/test/distributed/test_data_parallel.py
@ -96,7 +96,7 @@ class TestDataParallel(TestCase):
        step(model_dp)

        for p1, p2 in zip(model.parameters(), model_dp.parameters()):
-            self.assertTrue(p1.allclose(p2))
+            self.assertEqual(p1, p2)

    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "multi-GPU not supported")
    def test_data_parallel_lazy_linear(self):
--- a/test/distributed/test_device_mesh.py
+++ b/test/distributed/test_device_mesh.py
@ -440,6 +440,7 @@ class DeviceMeshTestNDim(DTensorTestBase):
        ep_mesh = ep_mesh_1 if self.rank < self.world_size // 2 else ep_mesh_2
        # ep_mesh is considered different from mesh_2d["TP"]
        self.assertEqual(mesh_2d["TP"]._flatten_mesh_list, ep_mesh._flatten_mesh_list)
+        self.assertEqual(mesh_2d["TP"]._layout, ep_mesh._layout)
        self.assertEqual(mesh_2d["TP"].mesh.shape, ep_mesh.mesh.shape)
        self.assertEqual(mesh_2d["TP"].device_type, ep_mesh.device_type)
        self.assertNotEqual(mesh_2d["TP"].mesh_dim_names, ep_mesh.mesh_dim_names)
@ -454,6 +455,7 @@ class DeviceMeshTestNDim(DTensorTestBase):
        )
        # another_mesh is considered the same as ep_mesh
        self.assertEqual(ep_mesh._flatten_mesh_list, another_mesh._flatten_mesh_list)
+        self.assertEqual(ep_mesh._layout, another_mesh._layout)
        self.assertEqual(ep_mesh.mesh.shape, another_mesh.mesh.shape)
        self.assertEqual(ep_mesh.device_type, another_mesh.device_type)
        self.assertEqual(ep_mesh.mesh_dim_names, another_mesh.mesh_dim_names)
@ -539,7 +541,6 @@ class DeviceMeshTestNDim(DTensorTestBase):
            mesh_dim_names=("dp_replicate", "dp_shard"),
        )

-        # self.assertEqual(ref_mesh._dim_group_names, dp_mesh._dim_group_names)
        for mesh_dim_group, ref_mesh_dim_group in zip(
            dp_mesh.get_all_groups(), ref_mesh.get_all_groups()
        ):
@ -800,6 +801,10 @@ class TestDeviceMeshGetItem(DTensorTestBase):
        # Test slicing out 1D mesh from a sub-2D mesh.
        shard_mesh = hsdp_mesh_2["Shard"]
        self.assertEqual(shard_mesh.mesh.tolist(), shard_group[shard_group_idx])
+        replicate_mesh = hsdp_mesh_2["Replicate"]
+        self.assertEqual(
+            replicate_mesh.mesh.tolist(), replicate_group[replicate_group_idx]
+        )

    @with_comms
    def test_cache_and_reuse_submesh_slice_result(self):
@ -873,12 +878,17 @@ class TestDeviceMeshGetItem(DTensorTestBase):
        flattened_dp_cp_mesh = dp_cp_mesh._flatten()
        self.assertEqual(dp_cp_mesh.mesh.flatten(), flattened_dp_cp_mesh.mesh)
        self.assertEqual(flattened_dp_cp_mesh.mesh_dim_names[0], "dp_cp")
+        self.assertEqual(flattened_dp_cp_mesh.get_group().group_desc, "mesh_dp_cp")
        root_mesh = _mesh_resources.get_root_mesh(dp_cp_mesh)
        self.assertEqual(root_mesh, mesh_3d)
-        flatten_mesh_root_dims = _mesh_resources.flatten_name_to_root_dims[root_mesh][
+        flatten_mesh_layout = _mesh_resources.root_to_flatten_mapping[root_mesh][
            "dp_cp"
-        ]
-        self.assertEqual(flatten_mesh_root_dims, (0, 1))
+        ]._layout
+        self.assertEqual(flatten_mesh_layout, flattened_dp_cp_mesh._layout)
+        self.assertEqual(
+            flattened_dp_cp_mesh._layout.global_ranks(8),
+            [[0, 2, 4, 6], [1, 3, 5, 7]],
+        )

        ref_pg_count = _world.group_count
        # Calling flatten again should not create a new pg.
@ -893,10 +903,19 @@ class TestDeviceMeshGetItem(DTensorTestBase):
        self.assertEqual(flattened_dp_tp_mesh.mesh_dim_names[0], "dp_tp")
        root_mesh = _mesh_resources.get_root_mesh(dp_tp_mesh)
        self.assertEqual(root_mesh, mesh_3d)
-        flatten_mesh_root_dims = _mesh_resources.flatten_name_to_root_dims[root_mesh][
+        flatten_mesh_root_layout = _mesh_resources.root_to_flatten_mapping[root_mesh][
            "dp_tp"
-        ]
-        self.assertEqual(flatten_mesh_root_dims, (0, 2))
+        ]._layout
+        self.assertEqual(flatten_mesh_root_layout, flattened_dp_tp_mesh._layout)
+        self.assertEqual(
+            flattened_dp_tp_mesh._layout.global_ranks(8),
+            [[0, 1, 4, 5], [2, 3, 6, 7]],
+        )
+        with self.assertRaisesRegex(
+            NotImplementedError,
+            "Currently, this only allows slicing out a contiguous flattened dim",
+        ):
+            mesh_3d["dp_tp", "cp"]

        # Test flatten with a flattened mesh_dim_name
        cp_tp_mesh = mesh_3d["cp", "tp"]
@ -1537,6 +1556,50 @@ class CuTeLayoutTest(TestCase):
        layout8 = _Layout((3, 2), (2, 3))
        self.assertTrue(layout8.check_non_overlap())

+    def test_remap_to_tensor(self):
+        """Test the remap_to_tensor method for various scenarios."""
+        # Test 1: Consecutive ranks, full world - should return logical groups directly
+        original_mesh = torch.tensor([[0, 1], [2, 3]], dtype=torch.int)
+        layout1 = _Layout((2, 2), (2, 1))  # row-major 2x2
+        result1 = layout1.remap_to_tensor(original_mesh)
+        expected1 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int)
+        self.assertEqual(result1, expected1)
+
+        # Test 2: Non-consecutive ranks - should map to actual ranks
+        original_mesh = torch.tensor([[10, 20], [30, 40]], dtype=torch.int)
+        layout2 = _Layout((2, 2), (2, 1))
+        result2 = layout2.remap_to_tensor(original_mesh)
+        expected2 = torch.tensor([[[10, 20], [30, 40]]], dtype=torch.int)
+        self.assertEqual(result2, expected2)
+
+        # Test 4: 1D layout with consecutive ranks
+        original_mesh = torch.tensor([0, 1, 2, 3], dtype=torch.int)
+        layout4 = _Layout((4,), (1,))
+        result4 = layout4.remap_to_tensor(original_mesh)
+        expected4 = torch.tensor([[0, 1, 2, 3]], dtype=torch.int)
+        self.assertEqual(result4, expected4)
+
+        # Test 5: Complex strided layout with non-consecutive ranks
+        original_mesh = torch.tensor([5, 10, 15, 20], dtype=torch.int)
+        layout5 = _Layout((2, 2), (2, 1))
+        result5 = layout5.remap_to_tensor(original_mesh)
+        expected5 = torch.tensor([[[5, 10], [15, 20]]], dtype=torch.int)
+        self.assertEqual(result5, expected5)
+
+        # Test 6: Tensor Cute representation of a 2D mesh
+        original_mesh = torch.tensor([[0, 2], [1, 3]], dtype=torch.int)
+        layout6 = _Layout((2, 2), (1, 2))  # column-major style
+        result6 = layout6.remap_to_tensor(original_mesh)
+        expected6 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int)
+        self.assertEqual(result6, expected6)
+
+        # Test 7: Layout with different stride pattern
+        original_mesh = torch.tensor([0, 2, 1, 4], dtype=torch.int)
+        layout7 = _Layout((2, 2), (1, 2))  # column-major style
+        result7 = layout7.remap_to_tensor(original_mesh)
+        expected7 = torch.tensor([[[0, 1], [2, 4]]], dtype=torch.int)
+        self.assertEqual(result7, expected7)
+

 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/test_fake_pg.py
+++ b/test/distributed/test_fake_pg.py
@ -7,6 +7,7 @@ import torch
 import torch.distributed as dist
 import torch.distributed._functional_collectives as funcol
 import torch.nn as nn
+from torch._C._distributed_c10d import FakeProcessGroup
 from torch.distributed.device_mesh import init_device_mesh
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.tensor import DeviceMesh, Shard
@ -22,6 +23,7 @@ from torch.testing._internal.common_fsdp import get_devtype
 from torch.testing._internal.common_utils import run_tests, skipIfHpu, TestCase
 from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
 from torch.testing._internal.distributed.fake_pg import FakeStore
+from torch.utils._python_dispatch import TorchDispatchMode


 if not dist.is_available():
@ -216,6 +218,95 @@ class TestFakePG(TestCase):
                loss.backward()
                optim.step()

+    def test_error_on_collective(self):
+        from torch.testing._internal.distributed.fake_pg import FakeStore
+
+        # Test with error_on_collective=False (default behavior)
+        store = FakeStore()
+        dist.init_process_group(backend="fake", rank=0, world_size=2, store=store)
+
+        # These should work normally
+        tensor = torch.ones(3, 3)
+        dist.all_reduce(tensor)
+        self.assertEqual(tuple(tensor.shape), (3, 3))
+
+        dist.destroy_process_group()
+
+        # Test with error_on_collective=True
+        from torch._C._distributed_c10d import FakeProcessGroup
+
+        options = FakeProcessGroup.Options()
+        options.error_on_collective = True
+
+        store = FakeStore()
+        dist.init_process_group(
+            backend="fake", rank=0, world_size=2, store=store, pg_options=options
+        )
+
+        # These should now raise errors
+        tensor = torch.ones(3, 3)
+        with self.assertRaisesRegex(
+            RuntimeError, "FakeProcessGroup collective operation error"
+        ):
+            dist.all_reduce(tensor)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "FakeProcessGroup collective operation error"
+        ):
+            output_tensors = [torch.empty_like(tensor) for _ in range(2)]
+            dist.all_gather(output_tensors, tensor)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "FakeProcessGroup collective operation error"
+        ):
+            dist.broadcast(tensor, src=0)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "FakeProcessGroup collective operation error"
+        ):
+            dist.barrier()
+
+    def test_fake_process_group_direct_usage_error(self):
+        class SimpleTensorMode(TorchDispatchMode):
+            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+                if kwargs is None:
+                    kwargs = {}
+                return func(*args, **kwargs)
+
+        with self.assertRaisesRegex(
+            RuntimeError,
+            r"FakeProcessGroup cannot be constructed directly\. "
+            r"Use torch\.distributed\.init_process_group\(backend='fake'\) instead to ensure "
+            r"proper dispatch system integration\.",
+        ):
+            fake_pg = FakeProcessGroup(rank=0, world_size=3)
+
+            with SimpleTensorMode():
+                tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+                dist.all_reduce(tensor, group=fake_pg)
+
+    def test_fake_process_group_proper_usage_dispatch(self):
+        class SimpleTensorMode(TorchDispatchMode):
+            def __init__(self):
+                self.ops = []
+
+            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+                self.ops.append(str(func))
+                if kwargs is None:
+                    kwargs = {}
+                return func(*args, **kwargs)
+
+        fake_store = FakeStore()
+        dist.init_process_group("fake", store=fake_store, rank=0, world_size=3)
+
+        with SimpleTensorMode() as mode:
+            tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+            dist.all_reduce(tensor)
+
+        op_names = [str(op) for op in mode.ops]
+        self.assertIn("aten.lift_fresh.default", op_names)
+        self.assertIn("c10d.allreduce_.default", op_names)
+

 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/test_nvshmem_triton.py
+++ b/test/distributed/test_nvshmem_triton.py
@ -61,8 +61,13 @@ def my_get_kernel(
    src,
    nelems,
    pe,
+    nbi: tl.constexpr,  # use nonblocking interface if True
 ):
-    nvshmem.get(dest, src, nelems, pe)
+    if nbi:
+        nvshmem.get_nbi(dest, src, nelems, pe)
+        nvshmem.quiet()
+    else:
+        nvshmem.get(dest, src, nelems, pe)


@requires_nvshmem
@ -327,7 +332,8 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
    @skipIfRocm
    @requires_triton()
    @requires_h100()
-    def test_triton_get(self) -> None:
+    @parametrize("nbi", [False, True])  # Test both blocking and nonblocking interfaces
+    def test_triton_get(self, nbi: bool) -> None:
        torch.manual_seed(42 + self.rank)
        self._init_device()

@ -357,6 +363,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
                inp,
                numel,
                peer,
+                nbi=nbi,
            )
        if rank == 1:
            torch.testing.assert_close(
@ -397,6 +404,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
            inp,
            numel,
            peer,
+            nbi=False,
        )

        expected_value = peer
--- a/Show More
+++ b/Show More