Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-07 01:50:04 +08:00

Compare commits: 8 commits (cpp-docs-d... → gh/robert-...)

| Author | SHA1 | Date |
|---|---|---|
| | fa1a4b342c | |
| | 397d9fe2ae | |
| | d77c24caac | |
| | cef98ae5cb | |
| | 52ea135f77 | |
| | a5f3035aaf | |
| | 1d3f5e19da | |
| | 496277a8ff | |
@@ -129,7 +129,7 @@ function install_129 {
}

function install_128 {
  CUDNN_VERSION=9.8.0.87
  CUDNN_VERSION=9.10.2.21
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.8.1 in the same container
  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
@@ -1,11 +1,15 @@
sphinx==7.2.6
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0

pytorch_sphinx_theme2==0.2.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.2.0
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13 (https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.

-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering,
# but it doesn't seem to work and hangs around idly. The initial thought is that it is probably
# something related to the Docker setup. We can investigate this later.
@@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0

breathe==4.36.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.36.0
#Pinned versions: 4.34.0

exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3

docutils==0.20
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.20
#Pinned versions: 0.16

bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs

@@ -52,13 +56,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0

myst-nb==1.3.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 1.3.0
#Pinned versions: 0.17.2

# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.6.1
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==4.0.1
myst-parser==0.18.1
@@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then

make coverage
# Now we have the coverage report, we need to make sure it is empty.
# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
# showing the undocumented count in the third column.
# Example: | TOTAL | 99.83% | 2 |
# Count the number of lines in the file and turn that number into a variable
# $lines. The `cut -f1 ...` is to only parse the number, not the filename
# Skip the report header by subtracting 2: the header will be output even if
# there are no undocumented items.
#
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
# be documented then removed from there.

# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
# The table format is: | Module | Coverage | Undocumented |
# Extract the third column (undocumented count) from the TOTAL row
undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')

if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
undocumented=$((lines - 2))
if [ $undocumented -lt 0 ]; then
  echo coverage output not found
  exit 1
elif [ "$undocumented" -gt 0 ]; then
  set +x # Disable command echoing for cleaner output
  echo ""
  echo "====================="
  echo "UNDOCUMENTED OBJECTS:"
  echo "====================="
  echo ""
  # Find the line number of the TOTAL row and print only what comes after it
  total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
  if [ -n "$total_line" ]; then
    # Print only the detailed list (skip the statistics table)
    tail -n +$((total_line + 2)) build/coverage/python.txt
  else
    # Fallback to showing entire file if TOTAL line not found
    cat build/coverage/python.txt
  fi
  echo ""
elif [ $undocumented -gt 0 ]; then
  echo undocumented objects found:
  cat build/coverage/python.txt
  echo "Make sure you've updated relevant .rsts in docs/source!"
  echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
  set -x # Re-enable command echoing
  echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
  exit 1
fi
else
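The TOTAL-row extraction above can be tried in isolation; a minimal Python sketch of the same column split, using only the example line quoted in the comments (not the real coverage file):

```python
# Same parsing as `awk -F'|' '{print $4}' | tr -d ' '` on the documented example row.
line = "| TOTAL | 99.83% | 2 |"
fields = line.split("|")              # ['', ' TOTAL ', ' 99.83% ', ' 2 ', '']
undocumented = int(fields[3].strip()) # third data column
print(undocumented)                   # 2
```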
@@ -272,6 +272,18 @@ def smoke_test_cuda(
    torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version())
    print(f"Torch cuDNN version: {torch_cudnn_version}")

    torch_cudnn_compile_version = torch._C._cudnn.getCompileVersion()
    print(f"Torch cuDNN compile-time version: {torch_cudnn_compile_version}")
    torch_cudnn_runtime_version = tuple(
        [int(x) for x in torch_cudnn_version.split(".")]
    )
    if torch_cudnn_runtime_version != torch_cudnn_compile_version:
        raise RuntimeError(
            "cuDNN runtime version doesn't match compile version. "
            f"Loaded: {torch_cudnn_runtime_version} "
            f"Expected: {torch_cudnn_compile_version}"
        )

    if sys.platform in ["linux", "linux2"]:
        torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
        print(f"Torch nccl; version: {torch_nccl_version}")
@@ -337,7 +337,7 @@ test_python() {

test_python_smoke() {
  # Smoke tests for H100/B200
  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}
@@ -1600,6 +1600,7 @@ test_linux_aarch64() {
    test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
    test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
    distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
    test_linalg \
    --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
4 .github/workflows/_rocm-test.yml (vendored)
@@ -97,8 +97,8 @@ jobs:
      shell: bash
      run: |
        ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
        if [[ $ngpu -lt 4 ]]; then
          echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs"
        if [[ $ngpu -lt 2 ]]; then # We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with fewer GPUs.
          echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs"
          exit 1
        fi
1 .gitignore (vendored)
@@ -127,7 +127,6 @@ torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
torch/_inductor/kernel/vendored_templates/*
minifier_launcher.py
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
@@ -206,41 +206,6 @@ templates_path = [
    os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
]
# TODO: document these and remove them from here.
# Fixes the duplicated
autosummary_filename_map = {
    "torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
    "torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
    "torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
    "torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
    "torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
    "torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
    "torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
    "torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
    "torch.optim.radam.radam": "torch.optim.radam.radam_function",
    "torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
    "torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
    "torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
    "torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
    "torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
    "torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
    "torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
    "torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
    "torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
    "torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
    "torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
    "torch.optim.adam.adam": "torch.optim.adam.adam_function",
    "torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
    "torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
    "torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
    "torch.mtia.stream": "torch.mtia.stream_function",
    "torch.mtia.Stream": "torch.mtia.Stream_class",
    "torch.cpu.stream": "torch.cpu.stream_function",
    "torch.cpu.Stream": "torch.cpu.Stream_class",
    "torch.cuda.stream": "torch.cuda.stream_function",
    "torch.cuda.Stream": "torch.cuda.Stream_class",
    "torch.xpu.stream": "torch.xpu.stream_function",
    "torch.xpu.Stream": "torch.xpu.Stream_class",
}

coverage_ignore_functions = [
    # torch
@@ -3230,11 +3195,6 @@ autodoc_type_aliases = {
# Enable overriding of function signatures in the first line of the docstring.
autodoc_docstring_signature = True

# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
autodoc_default_options = {
    "exclude-members": "from_bytes, to_bytes",
}

# -- katex javascript in header
#
# def setup(app):
@@ -253,6 +253,7 @@ regular full-precision tensor.
.. autosummary::
    :toctree: generated
    :nosignatures:
    :template: classtemplate.rst

    view
    as_strided
34 setup.py
@@ -630,37 +630,6 @@ def mirror_files_into_torchgen() -> None:
        raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")


def mirror_inductor_external_kernels() -> None:
    """
    Copy external kernels into Inductor so they are importable.
    """
    paths = [
        (
            CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
            CWD
            / "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
        ),
    ]
    for new_path, orig_path in paths:
        # Create the dirs involved in new_path if they don't exist
        if not new_path.exists():
            new_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy the files from the orig location to the new location
        if orig_path.is_file():
            shutil.copyfile(orig_path, new_path)
            continue
        if orig_path.is_dir():
            if new_path.exists():
                # copytree fails if the tree exists already, so remove it.
                shutil.rmtree(new_path)
            shutil.copytree(orig_path, new_path)
            continue
        raise RuntimeError(
            "Check the file paths in `mirror_inductor_external_kernels()`"
        )


# ATTENTION: THIS IS AI SLOP
def extract_variant_from_version(version: str) -> str:
    """Extract variant from version string, defaulting to 'cpu'."""

@@ -1647,8 +1616,6 @@ def main() -> None:
    if RUN_BUILD_DEPS:
        build_deps()

    mirror_inductor_external_kernels()

    (
        ext_modules,
        cmdclass,

@@ -1682,7 +1649,6 @@ def main() -> None:
            "_inductor/codegen/aoti_runtime/*.cpp",
            "_inductor/script.ld",
            "_inductor/kernel/flex/templates/*.jinja",
            "_inductor/kernel/templates/*.jinja",
            "_export/serde/*.yaml",
            "_export/serde/*.thrift",
            "share/cmake/ATen/*.cmake",
@@ -167,6 +167,14 @@ def _pack_fp8_wrap(x):
    if not x.dtype.is_floating_point:
        return x

    if type(x) is not torch.Tensor:
        # Check only during compilation
        # Test calls hooks to get reference output
        ctx = torch._functorch._aot_autograd.graph_compile._get_saved_tensor_hook_context()
        assert ctx["_fw_graph"] is not None
        assert ctx["_bw_graph"] is not None
        assert ctx["_node"] is not None

    return (x.dtype, x.to(torch.float8_e5m2))


@@ -176,6 +184,13 @@ def _unpack_fp8_wrap(x):
        return x

    dtype, tensor = x
    if type(tensor) is not torch.Tensor:
        # Check only during compilation
        # Test calls hooks to get reference output
        ctx = torch._functorch._aot_autograd.graph_compile._get_saved_tensor_hook_context()
        assert ctx["_fw_graph"] is not None
        assert ctx["_bw_graph"] is not None
        assert ctx["_node"] is not None
    return tensor.to(dtype)
@@ -1,154 +0,0 @@
# Owner(s): ["module: inductor"]


import unittest

import torch
from torch import Tensor
from torch._inductor import config
from torch._inductor.codegen.cuda.cuda_env import is_datacenter_blackwell_arch
from torch._inductor.test_case import run_tests, TestCase as InductorTestCase
from torch._inductor.utils import ensure_cute_available
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
)


@unittest.skipIf(
    not (ensure_cute_available() and is_datacenter_blackwell_arch()),
    "CuTeDSL library or Blackwell device not available",
)
@instantiate_parametrized_tests
class TestCuTeDSLGroupedGemm(InductorTestCase):
    def _get_inputs(
        self,
        group_size: int,
        M_hint: int,
        K: int,
        N: int,
        device: str,
        dtype: torch.dtype,
        alignment: int = 16,
    ) -> tuple[Tensor, Tensor, Tensor]:
        # --- Random, tile-aligned M sizes ---
        M_sizes = (
            torch.randint(1, (M_hint // alignment) + 1, (group_size,), dtype=torch.int)
            * alignment
        )

        M_total = torch.sum(M_sizes).item()

        # --- Construct input tensors ---
        A = torch.randn(int(M_total), K, dtype=dtype, device=device) * 0.1
        B = torch.randn((group_size, K, N), dtype=dtype, device=device) * 0.01

        # --- Build offsets (no leading zero, strictly increasing) ---
        offsets = torch.cumsum(M_sizes, dim=0).to(dtype=torch.int32, device=device)

        return (A, B, offsets)

    @parametrize("group_size", (2, 8))
    @parametrize("M_hint", (256, 1024))
    @parametrize("K", (64, 128))
    @parametrize("N", (128, 256))
    def test_grouped_gemm_basic(self, group_size: int, M_hint: int, K: int, N: int):
        device = "cuda"
        dtype = torch.bfloat16

        A, B, offsets = self._get_inputs(group_size, M_hint, K, N, device, dtype)

        def grouped_gemm_fn(A_packed, B_batched, offs):
            return torch._grouped_mm(A_packed, B_batched, offs=offs)

        # Eager execution
        c_eager = grouped_gemm_fn(A, B, offsets)

        # Test with Cute backend
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "CUTEDSL",
                "test_configs.autotune_choice_name_regex": "cutedsl",
                "autotune_fallback_to_aten": False,
            }
        ):
            grouped_gemm_compiled = torch.compile(
                grouped_gemm_fn, backend="inductor", dynamic=False
            )
            c_compiled = grouped_gemm_compiled(A, B, offsets)

        self.assertEqual(c_eager.dtype, dtype)
        self.assertEqual(c_compiled.dtype, dtype)
        torch.testing.assert_close(c_eager, c_compiled)

    @parametrize("layout_A", ("contiguous", "offset", "padded", "view"))
    @parametrize("layout_B", ("contiguous", "broadcasted"))
    def test_grouped_gemm_assorted_layouts(
        self,
        layout_A: str,
        layout_B: str,
    ):
        device = "cuda"
        dtype = torch.bfloat16

        G, K, N = 8, 64, 128
        M_sizes = [128] * G
        sum_M = sum(M_sizes)
        offsets = torch.tensor(
            [sum(M_sizes[: i + 1]) for i in range(G)], dtype=torch.int32, device=device
        )

        A_base = torch.randn(sum_M, K, device=device, dtype=dtype)
        A = A_base

        if layout_A == "offset":
            # allocate bigger buffer than needed, use nonzero storage offset
            storage = torch.randn(sum_M * K + 512, device=device, dtype=dtype)
            offset = 128  # skip first 128 elements
            A = torch.as_strided(storage[offset:], (sum_M, K), (K, 1))
        elif layout_A == "padded":
            # simulate row pitch > K (row_stride = K + pad)
            row_pitch = K + 8
            storage = torch.randn(sum_M * row_pitch, device=device, dtype=dtype)
            A = torch.as_strided(storage, (sum_M, K), (row_pitch, 1))
        elif layout_A == "view":
            A_storage = torch.randn(sum_M * K, device=device, dtype=dtype)
            A = A_storage.view(sum_M, K)
            assert A._base is not None
            assert A.shape == (sum_M, K)

        B = torch.randn((G, K, N), dtype=dtype, device=device) * 0.01

        if layout_B == "broadcasted":
            # Broadcast B across groups (zero stride along G)
            B = B[0].expand(G, K, N)
            assert B.stride(0) == 0

        def grouped_gemm_fn(A_packed, B_batched, offs):
            return torch._grouped_mm(A_packed, B_batched, offs=offs)

        # --- eager ---
        c_eager = grouped_gemm_fn(A, B, offsets)

        # --- compiled (CUTE backend) ---
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "CUTEDSL",
                "test_configs.autotune_choice_name_regex": "cutedsl",
                "autotune_fallback_to_aten": False,
            }
        ):
            grouped_gemm_compiled = torch.compile(
                grouped_gemm_fn, backend="inductor", dynamic=False
            )
            c_compiled = grouped_gemm_compiled(A, B, offsets)

        self.assertEqual(c_eager.dtype, dtype)
        self.assertEqual(c_compiled.dtype, dtype)
        torch.testing.assert_close(c_eager, c_compiled)


if __name__ == "__main__":
    run_tests()
@@ -117,6 +117,22 @@ class MixOrderReductionTest(TestBase):
            metrics.codegen_mix_order_reduction,
        )

    @inductor_config.patch(coordinate_descent_tuning=True)
    def test_XBLOCK_coordest_tuning(self):
        """
        We should skip XBLOCK coordinate descent tuning for
        mix order reduction.
        """
        if not inductor_config.triton.mix_order_reduction:
            self.skipTest("Mix order reduction not enabled")

        def f(x):
            return x.sum(dim=-1), x.sum(dim=0)

        x = torch.randn(32768, 256, dtype=torch.float, device=GPU_TYPE)
        self.check_numeric(f, (x,))
        self.assertEqual(metrics.codegen_mix_order_reduction, 1)

    @inductor_config.patch(unroll_reductions_threshold=1)
    def test_3layer_split_reduction(self):
        """
@@ -25,6 +25,9 @@ from typing import Any, Optional, TYPE_CHECKING, Union
if TYPE_CHECKING:
    from collections.abc import Sequence

import threading
from contextlib import contextmanager

import torch
import torch.utils._pytree as pytree
import torch.utils.dlpack

@@ -97,6 +100,43 @@ from .utils import (
)


_thread_local = threading.local()


# Saved tensor hooks context
# Compiled saved tensor hooks are a convenient way to inline some logic into the graphs
# for values saved from forward to backward (e.g. activation quantization).
# In the base implementation the user has no additional information about the saved value
# inside the hook beyond the FakeTensor shape, dtype, device, etc.
# _get_saved_tensor_hook_context gives additional graph information about that saved value,
# which can be used to decide which pack/unpack to apply to a particular saved value.
# This allows the user to reuse the saved-tensors-hooks API to apply selective pack/unpack in
# a graph-aware way.
# The alternative would be for the user to write a custom pass that mucks with forward outputs and
# backward input metadata, which requires significantly more effort.
#
# For now the context exposes the forward graph, the backward graph and the current saved node,
# whose node.meta carries additional information about that fx.Node.
# Warning: This API may change without backward compatibility.
@contextmanager
def _saved_tensor_hook_context(state: dict[str, Any]):
    previous_state = getattr(_thread_local, "state", None)
    try:
        _thread_local.state = state
        yield
    finally:
        # Clean up: restore previous state or remove attribute
        if previous_state is not None:
            _thread_local.state = previous_state
        else:
            if hasattr(_thread_local, "state"):
                delattr(_thread_local, "state")


def _get_saved_tensor_hook_context() -> dict[str, Any] | None:
    return getattr(_thread_local, "state", None)


zip = strict_zip

log = logging.getLogger(__name__)
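A minimal sketch of a pack hook that consults this context during compilation, mirroring the `_pack_fp8_wrap` test hook changed elsewhere in this comparison (the `_fw_graph`/`_bw_graph`/`_node` keys are the ones installed by `_get_extra_info` below; outside compilation the context is simply `None`):

```python
import torch
from torch._functorch._aot_autograd.graph_compile import _get_saved_tensor_hook_context

def pack_to_fp8(x):
    ctx = _get_saved_tensor_hook_context()
    if ctx is not None:
        # Only populated while the hooks are being inlined into the compiled graphs;
        # ctx["_node"].meta describes the saved value being packed.
        assert ctx["_fw_graph"] is not None and ctx["_bw_graph"] is not None
    if not x.dtype.is_floating_point:
        return x
    return (x.dtype, x.to(torch.float8_e5m2))
```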
@@ -1097,7 +1137,11 @@ def maybe_inline_graph_saved_tensors_hooks(
        if not isinstance(val, torch.Tensor):
            continue

        pack_out_val = pack_hook_gm(val)
        def _get_extra_info() -> dict[str, Any]:
            return {"_fw_graph": fw_g, "_bw_graph": bw_g, "_node": saved}

        with _saved_tensor_hook_context(_get_extra_info()):
            pack_out_val = pack_hook_gm(val)

        requires_sc_handling = any(
            is_traceable_wrapper_subclass(x) for x in pytree.tree_leaves(pack_out_val)

@@ -1109,16 +1153,17 @@
                " in the pack hook, and reconstructing the subclass in the unpack hook"
            )

        pack_gm = prepare_hook_gm(aot_config, pack_hook_gm, (val,))
        pack_g = pack_gm.graph
        maybe_log_graph(
            pack_gm,
            f"saved_tensors_pack_hook {saved.name}",
            aot_config,
            lambda: f"aot_saved_tensors_hooks_pack {saved.name}",
            structured_logs,
        )
        pack_out_val = pack_gm(val)
        with _saved_tensor_hook_context(_get_extra_info()):
            pack_gm = prepare_hook_gm(aot_config, pack_hook_gm, (val,))
            pack_g = pack_gm.graph
            maybe_log_graph(
                pack_gm,
                f"saved_tensors_pack_hook {saved.name}",
                aot_config,
                lambda: f"aot_saved_tensors_hooks_pack {saved.name}",
                structured_logs,
            )
            pack_out_val = pack_gm(val)

        # Install pack hook graph as epilogue of fw_module.
        # Saved tensor output becomes input of pack hook graph.

@@ -1188,15 +1233,16 @@
        # Install unpack hook graph as a prologue of backward graph
        # Saved tensors inputs are replaced with packed tensors and packed sym scalars.
        # The saved tensors inputs usages in the graph are replaced with unpack hook graph outputs.
        unpack_gm = prepare_hook_gm(aot_config, unpack_hook_gm, (pack_out_val,))
        unpack_g = unpack_gm.graph
        maybe_log_graph(
            unpack_gm,
            f"saved_tensors_unpack_hook {saved.name}",
            aot_config,
            lambda: f"aot_saved_tensors_hooks_unpack {saved.name}",
            structured_logs,
        )
        with _saved_tensor_hook_context(_get_extra_info()):
            unpack_gm = prepare_hook_gm(aot_config, unpack_hook_gm, (pack_out_val,))
            unpack_g = unpack_gm.graph
            maybe_log_graph(
                unpack_gm,
                f"saved_tensors_unpack_hook {saved.name}",
                aot_config,
                lambda: f"aot_saved_tensors_hooks_unpack {saved.name}",
                structured_logs,
            )

        def find_saved_in_bw_inputs(bw_inputs):
            for n in bw_inputs:
@@ -498,6 +498,7 @@ def generate_ttir(
        # pyrefly: ignore # missing-attribute
        codegen_fns = backend.get_codegen_implementation(*codegen_args)
        module_map = backend.get_module_map()
        # pyrefly: ignore[missing-argument,bad-argument-type]
        ttir_module = src.make_ir(options, codegen_fns, module_map, context)
    else:
        codegen_args = [options] if get_codegen_implementation_sig_params == 1 else []
@@ -546,10 +546,6 @@ max_autotune_flex_search_space: Literal["DEFAULT", "EXHAUSTIVE"] = os.environ.ge
    "TORCHINDUCTOR_MAX_AUTOTUNE_FLEX_SEARCH_SPACE", "DEFAULT"
).upper()  # type: ignore[assignment]

cutedsl_enable_autotuning: bool = (
    os.environ.get("CUTEDSL_ENABLE_AUTOTUNING", "0") == "1"
)

# DEPRECATED. This setting is ignored.
autotune_fallback_to_aten = False
@@ -1,8 +1,6 @@
# mypy: allow-untyped-defs
import logging
from collections.abc import Sequence
from functools import partial
from pathlib import Path
from typing import Any

import torch

@@ -14,7 +12,6 @@ from torch.fx.experimental.symbolic_shapes import has_free_unbacked_symbols
from .. import config
from ..codegen.wrapper import PythonWrapperCodegen
from ..ir import _IntLike, Layout, TensorBox
from ..utils import load_template


log = logging.getLogger(__name__)

@@ -257,7 +254,3 @@ def is_batch_stride_largest_or_zero(mat1, mat2, layout) -> bool:
        return False

    return True


_KERNEL_TEMPLATE_DIR = Path(__file__).parent / "templates"
load_kernel_template = partial(load_template, template_dir=_KERNEL_TEMPLATE_DIR)
@@ -1,11 +1,10 @@
# mypy: allow-untyped-defs
import logging
from dataclasses import asdict, dataclass
from dataclasses import dataclass
from typing import Any, Optional

import torch
from torch._dynamo.utils import counters
from torch._inductor.codegen.cutedsl.cutedsl_template import CuteDSLTemplate
from torch._inductor.runtime.triton_compat import tl
from torch._inductor.virtualized import V
from torch.utils._triton import has_triton

@@ -19,25 +18,19 @@ from ..select_algorithm import (
    TritonTemplate,
)
from ..utils import (
    ensure_cute_available,
    get_gpu_shared_memory,
    get_num_sms,
    has_free_symbols,
    use_aten_gemm_kernels,
    use_blackwell_cutedsl_grouped_mm,
    use_triton_template,
)
from .mm_common import (
    _is_static_problem,
    check_supported_striding,
    load_kernel_template,
    persistent_grouped_mm_grid,
)


if ensure_cute_available():
    from torch._inductor.template_heuristics.cutedsl import get_groupgemm_configs

log = logging.getLogger(__name__)
aten = torch.ops.aten

@@ -520,11 +513,6 @@ triton_scaled_grouped_mm_template = TritonTemplate(
    source=triton_grouped_mm_source,
)

cutedsl_grouped_mm_template = CuteDSLTemplate(
    name="grouped_gemm_cutedsl",
    source=load_kernel_template("cutedsl_mm_grouped"),
)


def grouped_mm_args(
    mat1: TensorBox,

@@ -726,44 +714,43 @@ def _tuned_grouped_mm_common(
    # Checking only for the equality of corresponding dims of
    # multiplicands here, relying on meta function checks for
    # everything else.
    if len(m1_size) == 2:
        if len(m2_size) == 2:
            m, k1 = m1_size
            k2, _ = m2_size
            # pyrefly: ignore [missing-attribute]
            g = offs.get_size()[0]
            V.graph.sizevars.check_equals(k1, k2)
            a_is_2d, b_is_2d = True, True
        else:
            # pyrefly: ignore [missing-attribute]
            g1 = offs.layout.size[0]
            m, k1 = m1_size
            g2, k2, _ = m2_size
            g = V.graph.sizevars.check_equals_and_simplify(g1, g2)
            V.graph.sizevars.check_equals(k1, k2)
            a_is_2d, b_is_2d = True, False
    else:
        if len(m2_size) == 2:
            # pyrefly: ignore [missing-attribute]
            g1 = offs.layout.size[0]
            g2, m, k1 = m1_size
            k2, _ = m2_size
            g = V.graph.sizevars.check_equals_and_simplify(g1, g2)
            V.graph.sizevars.check_equals(k1, k2)
            a_is_2d, b_is_2d = False, True
        else:
            g1, m, k1 = m1_size
            g2, k2, _ = m2_size
            g = V.graph.sizevars.check_equals_and_simplify(g1, g2)
            V.graph.sizevars.check_equals(k1, k2)
            a_is_2d, b_is_2d = False, False

    if (
        is_nonzero
        and use_triton_template(layout)
        and can_use_triton_kernel(mat_a, mat_b, offs, bias, scale_result)
    ):
        scaled = scale_a is not None
        if len(m1_size) == 2:
            if len(m2_size) == 2:
                m, k1 = m1_size
                k2, _ = m2_size
                # pyrefly: ignore [missing-attribute]
                g = offs.get_size()[0]
                V.graph.sizevars.check_equals(k1, k2)
                a_is_2d, b_is_2d = True, True
            else:
                # pyrefly: ignore [missing-attribute]
                g1 = offs.layout.size[0]
                m, k1 = m1_size
                g2, k2, _ = m2_size
                g = V.graph.sizevars.check_equals_and_simplify(g1, g2)
                V.graph.sizevars.check_equals(k1, k2)
                a_is_2d, b_is_2d = True, False
        else:
            if len(m2_size) == 2:
                # pyrefly: ignore [missing-attribute]
                g1 = offs.layout.size[0]
                g2, m, k1 = m1_size
                k2, _ = m2_size
                g = V.graph.sizevars.check_equals_and_simplify(g1, g2)
                V.graph.sizevars.check_equals(k1, k2)
                a_is_2d, b_is_2d = False, True
            else:
                g1, m, k1 = m1_size
                g2, k2, _ = m2_size
                g = V.graph.sizevars.check_equals_and_simplify(g1, g2)
                V.graph.sizevars.check_equals(k1, k2)
                a_is_2d, b_is_2d = False, False

    a_is_k_major = mat_a.get_stride()[-1] == 1
    b_is_k_major = mat_b.get_stride()[-2] == 1

@@ -801,22 +788,6 @@
            **config.kwargs,
        )

    if use_blackwell_cutedsl_grouped_mm(
        mat_a, mat_b, layout, a_is_2d, b_is_2d, offs, bias, scale_result
    ):
        for config in get_groupgemm_configs():
            kwargs = dict(
                ACC_DTYPE="cutlass.Float32",
            )

            cutedsl_grouped_mm_template.maybe_append_choice(
                choices,
                input_nodes=input_nodes,
                layout=layout,
                **kwargs,
                **asdict(config),
            )

    input_gen_fns = {
        4: lambda x: create_offsets(
            x, m1_size, m2_size, offs.get_size() if offs is not None else None
@@ -1,333 +0,0 @@
import functools

from torch._inductor.runtime.runtime_utils import ceildiv
from cutlass.utils import TensorMapUpdateMode

{{gen_defines()}}

# ---- Import GroupedGemm implementation, copied on PyTorch build from Cutlass repository: cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py ----
from torch._inductor.kernel.vendored_templates.cutedsl_grouped_gemm import (
    GroupedGemmKernel,
)


# Note about caching:
# Each instantiated CuTeDSL grouped GEMM kernel file generated by Inductor
# maintains its own local caching system. At this stage, all compile-time
# constexprs (e.g., TILE_M, TILE_N, CLUSTER_M/N, USE_2_CTA) and the kernel
# name itself ({{kernel_name}}) are permanently baked into the file, so they
# do not need to be included in any cache key.
#
# The caching mechanism is split into two levels:
#
# 1. prep_cache
#    Caches the compiled executor for build_group_ptrs_from_bases(). This
#    kernel depends only on the tensor shapes, strides, and dtypes of A/B/C,
#    and can therefore be safely reused across runs with different group
#    partitioning (`offs`).
#
# 2. gemm_cache
#    Caches the compiled Grouped GEMM executor. Its key extends the prep
#    cache key with hardware- and grid-specific parameters:
#    (prep_cache_key, max_active_clusters, total_num_clusters).
#    This is necessary because different `offs` tensors can change the
#    per-group problem sizes and thus alter `total_num_clusters`, which in
#    turn changes the grid shape and persistent scheduler configuration.
#    Kernels compiled for one grid cannot be safely reused for another.
#
# Additionally, note the @lru_cache decorator on get_hardware_info(). Empirically,
# hw.get_max_active_clusters() triggers significant MLIR recompilation overhead,
# despite depending only on the GPU type. We cache this function to mitigate
# redundant recompiles even when shape/stride/dtype cache misses force kernel
# regeneration. A follow-up study will investigate the root cause.

prep_cache = {}
gemm_cache = {}


@functools.lru_cache
def get_hardware_info():
    hw = cutlass.utils.HardwareInfo()
    sm_count = hw.get_max_active_clusters(1)
    max_active_clusters = hw.get_max_active_clusters(CLUSTER_M * CLUSTER_N)

    return (sm_count, max_active_clusters)


def get_prep_cache_key(input_a, input_b, output):
    """
    Returns a tuple key for caching the preprocessing kernel executor based on kernel name,
    shapes, strides, and dtypes of input/output tensors.
    """
    return (
        tuple(input_a.shape), tuple(input_a.stride()), input_a.dtype,
        tuple(input_b.shape), tuple(input_b.stride()), input_b.dtype,
        tuple(output.shape), tuple(output.stride()), output.dtype,
    )


def get_gemm_cache_key(prep_cache_key, max_active_clusters, total_num_clusters):
    """
    Returns a tuple key for caching the gemm kernel executor by extending the
    prep cache key with hardware- and grid-specific parameters.
    """
    return (prep_cache_key, max_active_clusters, total_num_clusters)


@cute.kernel
def build_group_ptrs_from_bases_kernel(
    base_A_u64: cutlass.Int64,  # device addr of input_a (bytes)
    base_B_u64: cutlass.Int64,  # device addr of input_b (bytes)
    base_C_u64: cutlass.Int64,  # device addr of Output (bytes)
    offs: cute.Tensor,  # [G], cutlass.Int32/64 cumulative
    K: cutlass.Constexpr,
    N: cutlass.Constexpr,
    sizeof_element: cutlass.Int32,  # bytes
    # -------- STRIDES (in ELEMENTS) --------
    stride_A_m_elems: cutlass.Constexpr,  # A.stride(0)
    stride_A_k_elems: cutlass.Constexpr,  # A.stride(1)
    stride_B0_elems: cutlass.Constexpr,  # B.stride(0)
    stride_Bk_elems: cutlass.Constexpr,  # B.stride(1)
    stride_Bn_elems: cutlass.Constexpr,  # B.stride(2)
    stride_C_m_elems: cutlass.Constexpr,  # C.stride(0)
    stride_C_n_elems: cutlass.Constexpr,  # C.stride(1)
    # -------- OUTPUTS --------
    out_ptrs: cute.Tensor,  # [G,3] cutlass.Int64: (A_ptr, B_ptr, C_ptr)
    out_problem: cute.Tensor,  # [G,4] cutlass.Int32: (m_g, n, k, 1)
    out_strides_abc: cute.Tensor,  # [G,3,2] cutlass.Int32 [[A_m,A_k],[B_n,B_k],[C_m,C_n]]
):
    tidx, _, _ = cute.arch.thread_idx()
    g = tidx

    m_beg_i32 = 0
    if g > 0:
        m_beg_i32 = offs[g - 1]
    m_end_i32 = offs[g]
    m_g_i32 = m_end_i32 - m_beg_i32

    a_byte_off = (
        cutlass.Int64(m_beg_i32) * stride_A_m_elems * cutlass.Int64(sizeof_element)
    )
    c_byte_off = (
        cutlass.Int64(m_beg_i32) * stride_C_m_elems * cutlass.Int64(sizeof_element)
    )
    b_byte_off = cutlass.Int64(g) * stride_B0_elems * cutlass.Int64(sizeof_element)

    # ---- pointers ----
    out_ptrs[g, 0] = base_A_u64 + a_byte_off
    out_ptrs[g, 1] = base_B_u64 + b_byte_off
    out_ptrs[g, 2] = base_C_u64 + c_byte_off

    # ---- (m, n, k, 1) ----
    out_problem[g, 0] = m_g_i32
    out_problem[g, 1] = N
    out_problem[g, 2] = K
    out_problem[g, 3] = cutlass.Int32(1)

    # ---- strides ----
    out_strides_abc[g, 0, 0] = cutlass.Int32(stride_A_m_elems)
    out_strides_abc[g, 0, 1] = cutlass.Int32(stride_A_k_elems)
    out_strides_abc[g, 1, 0] = cutlass.Int32(stride_Bn_elems)
    out_strides_abc[g, 1, 1] = cutlass.Int32(stride_Bk_elems)
    out_strides_abc[g, 2, 0] = cutlass.Int32(stride_C_m_elems)
    out_strides_abc[g, 2, 1] = cutlass.Int32(stride_C_n_elems)


@cute.jit
def launch_build_group_ptrs_from_bases(
    base_A_u64: cutlass.Int64,
    base_B_u64: cutlass.Int64,
    base_C_u64: cutlass.Int64,
    offs: cute.Tensor,
    G: cutlass.Constexpr,
    K: cutlass.Constexpr,
    N: cutlass.Constexpr,
    sizeof_element: cutlass.Constexpr,
    stride_A_m_elems: cutlass.Constexpr,
    stride_A_k_elems: cutlass.Constexpr,
    stride_B0_elems: cutlass.Constexpr,
    stride_Bk_elems: cutlass.Constexpr,
    stride_Bn_elems: cutlass.Constexpr,
    stride_C_m_elems: cutlass.Constexpr,
    stride_C_n_elems: cutlass.Constexpr,
    out_ptrs: cute.Tensor,  # [G,3] cutlass.Int64
    out_problem: cute.Tensor,  # [G,4] cutlass.Int32
    out_strides_abc: cute.Tensor,  # [3,2] cutlass.Int32
    stream: cuda.CUstream,
):
    build_group_ptrs_from_bases_kernel(
        base_A_u64, base_B_u64, base_C_u64, offs, K, N, sizeof_element,
        stride_A_m_elems, stride_A_k_elems, stride_B0_elems, stride_Bk_elems,
        stride_Bn_elems, stride_C_m_elems, stride_C_n_elems,
        out_ptrs, out_problem, out_strides_abc,
    ).launch(grid=(1, 1, 1), block=(G, 1, 1), stream=stream)


{{def_kernel("input_a", "input_b", "input_a_offs")}}
    stream = cuda.CUstream(stream)

    input_b = input_b.transpose(1, 2)

    sumM, K = input_a.shape
    G, N, Kb = input_b.shape

    dev = input_a.device

    base_A_u64 = int(input_a.data_ptr())
    base_B_u64 = int(input_b.data_ptr())
    base_C_u64 = int({{get_output()}}.data_ptr())

    ptrs_t = torch.empty((G, 3), device=dev, dtype=torch.int64)
    probs_t = torch.empty((G, 4), device=dev, dtype=torch.int32)
    strides_t = torch.empty((G, 3, 2), device=dev, dtype=torch.int32)
    ptrs = from_dlpack(ptrs_t)
    probs = from_dlpack(probs_t)
    strides = from_dlpack(strides_t)

    prep_cache_key = get_prep_cache_key(input_a, input_b, {{get_output()}})
    prep_executor = prep_cache.get(prep_cache_key)

    if prep_executor is None:
        sizeof_element = int(input_a.element_size())
        sA_m, sA_k = map(int, input_a.stride())
        sB_0, sB_n, sB_k = map(int, input_b.stride())
        sC_m, sC_n = map(int, {{get_output()}}.stride())

        prep_executor = cute.compile(
            launch_build_group_ptrs_from_bases,
            base_A_u64=base_A_u64, base_B_u64=base_B_u64, base_C_u64=base_C_u64,
            offs=from_dlpack(input_a_offs),
            G=int(G), K=int(K), N=int(N),
            sizeof_element=sizeof_element,
            stride_A_m_elems=sA_m, stride_A_k_elems=sA_k,
            stride_B0_elems=sB_0, stride_Bk_elems=sB_k, stride_Bn_elems=sB_n,
            stride_C_m_elems=sC_m, stride_C_n_elems=sC_n,
            out_ptrs=ptrs, out_problem=probs, out_strides_abc=strides,
            stream=stream,
        )

        prep_cache[prep_cache_key] = prep_executor

    prep_executor(
        base_A_u64=base_A_u64, base_B_u64=base_B_u64, base_C_u64=base_C_u64,
        offs=from_dlpack(input_a_offs),
        out_ptrs=ptrs, out_problem=probs, out_strides_abc=strides,
        stream=stream,
    )

    # --- Tensormap workspace per SM ---
    num_tensormap_buffers, max_active_clusters = get_hardware_info()
    tensormap_shape = (
        num_tensormap_buffers,
        GroupedGemmKernel.num_tensormaps,
        GroupedGemmKernel.bytes_per_tensormap // 8,
    )
    tensormap_workspace_t = torch.empty(tensormap_shape, device=dev, dtype=torch.int64)
    tensormap_workspace = from_dlpack(tensormap_workspace_t)

    # --- Total clusters ---
    def compute_total_num_clusters(
        problem_sizes_mnkl,
        cluster_tile_shape_mn,
    ):
        total_num_clusters = 0
        for m, n, _, _ in problem_sizes_mnkl:
            num_clusters_mn = tuple(
                ceildiv(x, y) for x, y in zip((m, n), cluster_tile_shape_mn)
            )
            total_num_clusters += functools.reduce(lambda x, y: x * y, num_clusters_mn)
        return total_num_clusters

    # Compute cluster tile shape
    def compute_cluster_tile_shape(
        mma_tiler_mn,
        cluster_shape_mn,
        use_2cta_instrs,
    ):
        cta_tile_shape_mn = list(mma_tiler_mn)
        if use_2cta_instrs:
            cta_tile_shape_mn[0] = cta_tile_shape_mn[0] // 2
        return tuple(x * y for x, y in zip(cta_tile_shape_mn, cluster_shape_mn))

    cluster_tile_shape_mn = compute_cluster_tile_shape(
        (TILE_M, TILE_N), (CLUSTER_M, CLUSTER_N), bool(USE_2_CTA)
    )

    total_num_clusters = int(compute_total_num_clusters(probs_t, cluster_tile_shape_mn))

    gemm_cache_key = get_gemm_cache_key(
        prep_cache_key, max_active_clusters, total_num_clusters
    )
    gemm_executor = gemm_cache.get(gemm_cache_key)

    if gemm_executor is None:
        grouped_gemm = GroupedGemmKernel(
            acc_dtype=ACC_DTYPE,
            use_2cta_instrs=USE_2_CTA,
            mma_tiler_mn=(TILE_M, TILE_N),
            cluster_shape_mn=(CLUSTER_M, CLUSTER_N),
            tensormap_update_mode=TENSORMAP_UPDATE_MODE,
        )

        gemm_executor = cute.compile(
            grouped_gemm,
            from_dlpack(input_a.unsqueeze(-1), assumed_align=16),
            from_dlpack(input_b[0].unsqueeze(-1), assumed_align=16),
            from_dlpack({{get_output()}}.unsqueeze(-1), assumed_align=16),
            G,
            probs,
            strides,
            ptrs,
            total_num_clusters,
            tensormap_workspace,
            max_active_clusters,
            stream,
        )

        gemm_cache[gemm_cache_key] = gemm_executor

    gemm_executor(
        from_dlpack(input_a.unsqueeze(-1), assumed_align=16),
        from_dlpack(input_b[0].unsqueeze(-1), assumed_align=16),
        from_dlpack({{get_output()}}.unsqueeze(-1), assumed_align=16),
        probs,
        strides,
        ptrs,
        tensormap_workspace,
        stream,
    )
@@ -5,6 +5,8 @@ import logging
from collections.abc import Callable
from typing import TYPE_CHECKING

from torch.utils._ordered_set import OrderedSet

from .hints import TRITON_MAX_BLOCK
from .runtime_utils import red_text, triton_config_to_hashable

@@ -54,6 +56,7 @@ class CoordescTuner:
        name="unknown",
        size_hints=None,
        inductor_meta=None,
        frozen_fields=None,
    ):
        self.is_mm = is_mm  # we will tune num_stages for mm

@@ -66,6 +69,9 @@ class CoordescTuner:
        self.name = name
        self.size_hints = size_hints
        self.inductor_meta = inductor_meta or {}
        self.frozen_fields: OrderedSet[str] = (
            OrderedSet(frozen_fields) if frozen_fields is not None else OrderedSet()
        )

    def get_config_max(self, prefix: str) -> int:
        max_block = TRITON_MAX_BLOCK[prefix.upper()]

@@ -117,7 +123,7 @@ class CoordescTuner:
            out.append("num_stages")
        out.remove("ZBLOCK")  # ZBLOCK=1 always in native matmul

        return out
        return [f for f in out if f not in self.frozen_fields]

    def value_too_large(self, name: str, val: int) -> bool:
        block_suffix = "BLOCK"

@@ -336,6 +336,7 @@ class CachingAutotuner(KernelInterface):
            name=self.fn.__name__,
            size_hints=size_hints,
            inductor_meta=self.inductor_meta,
            frozen_fields=self.get_coordesc_frozen_fields(),
        )
        self.filename = filename

@@ -365,6 +366,13 @@ class CachingAutotuner(KernelInterface):
        # Mode for launch grid calculation
        self.grid_mode: Literal["python", "cpp"] = "python"

    def get_coordesc_frozen_fields(self) -> OrderedSet[str]:
        out: OrderedSet[str] = OrderedSet()
        if self.inductor_meta.get("RSPLIT_SIZE"):
            # We fix XBLOCK for mix order reduction
            out.add("XBLOCK")
        return out
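A hedged illustration of how the frozen fields interact with `tunable_fields` above (the candidate field names here are illustrative, not taken from this diff):

```python
from torch.utils._ordered_set import OrderedSet

# With RSPLIT_SIZE set (mix order reduction), XBLOCK is frozen and filtered
# out of the coordinate-descent candidates, matching the tunable_fields change.
frozen = OrderedSet(["XBLOCK"])
candidates = ["XBLOCK", "R0_BLOCK", "num_warps"]
print([f for f in candidates if f not in frozen])  # ['R0_BLOCK', 'num_warps']
```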
    def is_statically_launchable(self):
        """
        Checks if every compiled kernel is statically launchable, which
@@ -1,141 +0,0 @@
from dataclasses import dataclass
from enum import auto, Enum
from itertools import product

import torch._inductor.config as config


class TensorMapUpdateMode(Enum):
    """Enum mirroring cutlass.utils.TensorMapUpdateMode to decouple this file from a cutlass dependency."""

    SMEM = auto()
    GMEM = auto()


@dataclass(frozen=True)
class CuTeGemmConfig:
    TILE_M: int = 128
    TILE_N: int = 192
    CLUSTER_M: int = 2
    CLUSTER_N: int = 1
    USE_2_CTA: bool = False
    TENSORMAP_UPDATE_MODE: TensorMapUpdateMode = TensorMapUpdateMode.SMEM


def get_exhaustive_groupgemm_configs() -> list[CuTeGemmConfig]:
    """
    Returns the exhaustive configuration set for the Blackwell CuTeDSL Grouped GEMM kernel.
    For information regarding valid config sets, see:
    https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/blackwell/grouped_gemm.py
    """

    # Tile_n is always the same regardless of 2cta
    tile_n_vals = [32, 64, 96, 128, 160, 192, 224, 256]

    # Valid clusters
    clusters_no_2cta = [
        (1, 1), (1, 2), (1, 4), (1, 8), (1, 16),
        (2, 1), (2, 2), (2, 4), (2, 8),
        (4, 1), (4, 2), (4, 4),
        (8, 1), (8, 2),
        (16, 1),
    ]
    clusters_2cta = [
        (2, 1), (2, 2), (2, 4), (2, 8),
        (4, 1), (4, 2), (4, 4),
        (8, 1), (8, 2),
        (16, 1),
    ]

    configs: list[CuTeGemmConfig] = []

    for use_2cta, cluster_set, tile_m_range in [
        (False, clusters_no_2cta, [64, 128]),
        (True, clusters_2cta, [128, 256]),
    ]:
        for tensormap_update_mode, tile_m, tile_n, (cluster_m, cluster_n) in product(
            [TensorMapUpdateMode.SMEM, TensorMapUpdateMode.GMEM],
            tile_m_range,
            tile_n_vals,
            cluster_set,
        ):
            configs.append(
                CuTeGemmConfig(
                    tile_m,
                    tile_n,
                    cluster_m,
                    cluster_n,
                    USE_2_CTA=use_2cta,
                    TENSORMAP_UPDATE_MODE=tensormap_update_mode,
                )
            )

    return configs


def get_default_groupgemm_configs() -> list[CuTeGemmConfig]:
    """
    Returns the default configuration set for the Blackwell CuTeDSL Grouped GEMM kernel.
    """

    config_tuples = [
        (128, 256, 2, 1, False, TensorMapUpdateMode.SMEM),
        (256, 160, 2, 1, True, TensorMapUpdateMode.GMEM),
        (256, 256, 2, 1, True, TensorMapUpdateMode.GMEM),
        (64, 32, 1, 1, False, TensorMapUpdateMode.GMEM),
        (64, 256, 1, 2, False, TensorMapUpdateMode.SMEM),
        (128, 256, 1, 2, False, TensorMapUpdateMode.SMEM),
        (256, 256, 2, 2, True, TensorMapUpdateMode.GMEM),
        (128, 256, 1, 2, False, TensorMapUpdateMode.GMEM),
        (64, 32, 1, 1, False, TensorMapUpdateMode.SMEM),
        (256, 256, 2, 1, True, TensorMapUpdateMode.SMEM),
        (128, 256, 1, 1, False, TensorMapUpdateMode.GMEM),
        (256, 256, 8, 1, True, TensorMapUpdateMode.GMEM),
        (64, 32, 1, 2, False, TensorMapUpdateMode.SMEM),
        (256, 192, 2, 1, True, TensorMapUpdateMode.GMEM),
        (256, 256, 2, 2, True, TensorMapUpdateMode.SMEM),
        (128, 96, 1, 2, False, TensorMapUpdateMode.SMEM),
        (64, 192, 1, 1, False, TensorMapUpdateMode.SMEM),
        (64, 64, 1, 1, False, TensorMapUpdateMode.GMEM),
        (64, 192, 1, 1, False, TensorMapUpdateMode.GMEM),
        (128, 64, 1, 1, False, TensorMapUpdateMode.GMEM),
        (64, 160, 1, 1, False, TensorMapUpdateMode.GMEM),
        (64, 256, 1, 1, False, TensorMapUpdateMode.GMEM),
    ]

    return [CuTeGemmConfig(*args) for args in config_tuples]


def get_groupgemm_configs() -> list[CuTeGemmConfig]:
    """
    Returns the configuration set for the Blackwell CuTeDSL Grouped GEMM kernel.

    Note: CuTeDSL autotuning is still experimental — enabling it may trigger kernel launch failures
    or unstable results. By default, autotuning is disabled and we return only
    a single baseline config.
    """
    if (
        config.cutedsl_enable_autotuning
        and config.max_autotune_gemm_search_space == "EXHAUSTIVE"
    ):
        return get_exhaustive_groupgemm_configs()
    elif config.cutedsl_enable_autotuning:
        return get_default_groupgemm_configs()
    else:
        return [get_default_groupgemm_configs()[0]]
@@ -1975,77 +1975,6 @@ def use_triton_blackwell_tma_template(
    return has_triton_tensor_descriptor_host_tma() and is_datacenter_blackwell_arch()


@functools.lru_cache(maxsize=1)
def ensure_cute_available() -> bool:
    """Check if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    """
    try:
        return importlib.util.find_spec("cutlass.cute") is not None
    except ImportError:
        return False
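A hedged usage note for `ensure_cute_available()` (import path as used by the deleted test in this comparison; the install step in the comment is an assumption, since no package name appears in this diff):

```python
from torch._inductor.utils import ensure_cute_available

if not ensure_cute_available():
    # After installing the CuTeDSL Python package in this same interpreter,
    # clear the memoized result so the next call re-probes the import.
    ensure_cute_available.cache_clear()
print(ensure_cute_available())
```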
def use_blackwell_cutedsl_grouped_mm(
    mat_a: Any,
    mat_b: Any,
    layout: Layout,
    a_is_2d: bool,
    b_is_2d: bool,
    offs: Optional[Any],
    bias: Optional[Any],
    scale_result: Optional[Any],
) -> bool:
    """
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL is available
        2. We are on a blackwell arch
        3. The dtype is bf16
        4. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    """
    if not ensure_cute_available():
        return False

    from .codegen.cuda.cuda_env import is_datacenter_blackwell_arch

    if not is_gpu(layout.device.type) and is_datacenter_blackwell_arch():
        return False

    layout_dtypes = [torch.bfloat16]
    if not _use_template_for_gpu(layout, layout_dtypes):
        return False

    if not (config.max_autotune or config.max_autotune_gemm):
        return False

    # Checks for 16B ptr and stride alignment
    if not can_use_tma(mat_a, mat_b, output_layout=layout):
        return False

    if any(is_dynamic(x) for x in [mat_a, mat_b]):
        return False

    if not a_is_2d or b_is_2d:
        return False

    if offs is None:
        return False

    if bias is not None or scale_result is not None:
        return False

    return True


def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
    from .virtualized import V
@@ -1228,7 +1228,7 @@ def _get_pynvml_handler(device: "Device" = None):
            "nvidia-ml-py does not seem to be installed or it can't be imported."
            # pyrefly: ignore [invalid-inheritance]
        ) from _PYNVML_ERR
    # pyrefly: ignore [import-error]
    # pyrefly: ignore [import-error,missing-module-attribute]
    from pynvml import NVMLError_DriverNotLoaded

    try:

@@ -828,7 +828,7 @@ def list_gpu_processes(device: "Device" = None) -> str:
        import pynvml  # type: ignore[import]
    except ModuleNotFoundError:
        return "pynvml module not found, please install nvidia-ml-py"
    # pyrefly: ignore [import-error]
    # pyrefly: ignore [import-error,missing-module-attribute]
    from pynvml import NVMLError_DriverNotLoaded

    try:
@@ -17,230 +17,5 @@ def is_stdlib_module(module: str) -> bool:


def _get_stdlib_modules():
    if sys.version_info.major == 3:  # noqa: UP036
        if sys.version_info.minor == 9:
            return stdlib3_9
        if sys.version_info.minor >= 10:  # noqa: YTT204
            return sys.stdlib_module_names  # type: ignore[attr-defined]
    elif sys.version_info.major > 3:  # noqa: UP036
        return sys.stdlib_module_names  # type: ignore[attr-defined]

    raise RuntimeError(f"Unsupported Python version: {sys.version_info}")


stdlib3_9 = {
    "_thread", "abc", "aifc", "argparse", "array", "ast", "asynchat", "asyncio",
    "asyncore", "atexit", "audioop", "base64", "bdb", "binascii", "binhex", "bisect",
    "builtins", "bz2", "cProfile", "calendar", "cgi", "cgitb", "chunk", "cmath",
    "cmd", "code", "codecs", "codeop", "collections", "colorsys", "compileall",
    "concurrent", "configparser", "contextlib", "contextvars", "copy", "copyreg",
    "crypt", "csv", "ctypes", "curses", "dataclasses", "datetime", "dbm", "decimal",
    "difflib", "dis", "distutils", "doctest", "email", "encodings", "ensurepip",
    "enum", "errno", "faulthandler", "fcntl", "filecmp", "fileinput", "fnmatch",
    "formatter", "fractions", "ftplib", "functools", "gc", "getopt", "getpass",
    "gettext", "glob", "graphlib", "grp", "gzip", "hashlib", "heapq", "hmac", "html",
    "http", "imaplib", "imghdr", "imp", "importlib", "inspect", "io", "ipaddress",
    "itertools", "json", "keyword", "lib2to3", "linecache", "locale", "logging",
    "lzma", "mailbox", "mailcap", "marshal", "math", "mimetypes", "mmap",
    "modulefinder", "msilib", "msvcrt", "multiprocessing", "netrc", "nis", "nntplib",
    "ntpath", "numbers", "operator", "optparse", "os", "ossaudiodev", "parser",
    "pathlib", "pdb", "pickle", "pickletools", "pipes", "pkgutil", "platform",
    "plistlib", "poplib", "posix", "posixpath", "pprint", "profile", "pstats", "pty",
    "pwd", "py_compile", "pyclbr", "pydoc", "queue", "quopri", "random", "re",
    "readline", "reprlib", "resource", "rlcompleter", "runpy", "sched", "secrets",
    "select", "selectors", "shelve", "shlex", "shutil", "signal", "site", "smtpd",
    "smtplib", "sndhdr", "socket", "socketserver", "spwd", "sqlite3", "sre",
    "sre_compile", "sre_constants", "sre_parse", "ssl", "stat", "statistics",
    "string", "stringprep", "struct", "subprocess", "sunau", "symbol", "symtable",
    "sys", "sysconfig", "syslog", "tabnanny", "tarfile", "telnetlib", "tempfile",
    "termios", "test", "textwrap", "threading", "time", "timeit", "tkinter", "token",
    "tokenize", "trace", "traceback", "tracemalloc", "tty", "turtle", "turtledemo",
    "types", "typing", "unicodedata", "unittest", "urllib", "uu", "uuid", "venv",
    "warnings", "wave", "weakref", "webbrowser", "winreg", "winsound", "wsgiref",
    "xdrlib", "xml", "xmlrpc", "zipapp", "zipfile", "zipimport", "zlib", "zoneinfo",
}

    assert sys.version_info >= (3, 10)
    return sys.stdlib_module_names