Compare commits

...

42 Commits

Author SHA1 Message Date
43077743a2 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration."
This PR is the first step toward implementing RFC #160175.
Currently, all Cutlass-related Torch Inductor configs are located in `torch._inductor.config.cuda`. This PR refactors the device-agnostic Cutlass configurations into `torch._inductor.config.cutlass`, so they can be shared and reused by XPU as well.


cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben mlazos

[ghstack-poisoned]
2025-11-07 06:18:39 +00:00
d36a95c295 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-07 06:18:39 +00:00
97771c666b Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-07 01:52:26 +00:00
632f252363 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-07 01:52:26 +00:00
0001139e52 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-06 19:34:19 +00:00
acb6919d7e Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-06 19:34:19 +00:00
2783c524ca Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-06 15:13:45 +00:00
baae28d5d4 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-06 15:13:45 +00:00
3f4c82c71b Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-06 06:08:36 +00:00
a52ab1536f Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-06 06:08:36 +00:00
662eb75553 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 23:39:44 +00:00
6061953db8 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 23:39:44 +00:00
f32b1e3a40 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 13:51:08 +00:00
42a0b1af37 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 13:51:08 +00:00
de802b98a4 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 08:19:49 +00:00
0a59b78fb3 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 08:19:49 +00:00
89b7cf0167 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 07:59:54 +00:00
af11a5d584 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 07:59:54 +00:00
d05ab9291c Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 07:52:36 +00:00
52a3d4557e Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-11-04 07:52:36 +00:00
f95cee5d71 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-10-31 03:17:49 +00:00
cd57b74df4 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-10-31 03:17:49 +00:00
f262986fbf Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-08 01:07:20 +00:00
7bfaf7aa24 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-08 01:07:20 +00:00
ea82620e9b Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-08 01:06:12 +00:00
adfb72fae8 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-08 01:06:12 +00:00
8a82056cd3 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-04 08:53:34 +00:00
25b9a8378b Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-04 08:53:34 +00:00
258447d897 Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-03 03:02:51 +00:00
e77783de56 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-03 03:02:51 +00:00
d33c6b97fc Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-02 10:58:33 +00:00
f3a64c3078 Update base for Update on "[Inductor XPU GEMM] Step 1/N: Refactor cutlass configuration." 2025-09-02 10:58:33 +00:00
ba74f00e98 Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration."
This PR is the second step toward implementing RFC #160175.
Currently, all Cutlass-related Torch Inductor configs are located in `torch._inductor.config.cuda`. This PR refactors the device-agnostic Cutlass configurations into a separate module, `torch._inductor.config.cutlass`, so they can be shared and reused by XPU as well.


[ghstack-poisoned]
2025-09-02 03:09:55 +00:00
d29de5a76f Update base for Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-09-02 03:09:55 +00:00
a4e43a0d55 Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-08-15 02:57:42 +00:00
c88acdd1c5 Update base for Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-08-15 02:57:42 +00:00
73f19c1d19 Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-08-14 23:13:33 +00:00
d023ecb0ea Update base for Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-08-14 23:13:33 +00:00
8604af4bfa Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-08-14 22:25:24 +00:00
f4ef77b220 Update base for Update on "[Inductor XPU GEMM] Step 2/N: Generalize cutlass configuration." 2025-08-14 22:25:24 +00:00
44789dc58d [Inductor Intel Cutlass] Step 2/N: Generalize cutlass configuration. 2025-08-08 06:03:45 +00:00
2ddfc76a10 [Inductor Intel Cutlass] Step 1/N: Add Intel Cutlass repro into third_party. 2025-08-08 06:03:41 +00:00
12 changed files with 141 additions and 121 deletions
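
The bulk of the diff is a mechanical rename of Inductor config keys from the `cuda.*` namespace to the new device-agnostic `cutlass.*` namespace. As a rough sketch of the user-facing change (key names taken from the hunks below; the exact set of migrated keys is defined by the PR itself):

```python
# Keys as patched in the tests before this stack (cuda.* namespace):
before = {
    "max_autotune": True,
    "max_autotune_gemm_backends": "CUTLASS",
    "cuda.cutlass_max_profiling_configs": 2,
    "cuda.cutlass_op_allowlist_regex": "stream_k",
}

# The same knobs after this stack (device-agnostic cutlass.* namespace):
after = {
    "max_autotune": True,
    "max_autotune_gemm_backends": "CUTLASS",
    "cutlass.cutlass_max_profiling_configs": 2,
    "cutlass.cutlass_op_allowlist_regex": "stream_k",
}
```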

View File

@ -125,7 +125,7 @@ class CutlassExperimentConfig(ExperimentConfig):
def to_options(self) -> dict[str, Any]:
return {
**super().to_options(),
"cuda.cutlass_instantiation_level": self.cutlass_instantiation_level,
"cutlass.cutlass_instantiation_level": self.cutlass_instantiation_level,
}
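
The hunk above only shows the changed line; a minimal reconstruction of the surrounding pattern (the class names appear in the hunk header, but any field other than `cutlass_instantiation_level` is an assumption) might look like:

```python
from dataclasses import dataclass
from typing import Any

@dataclass
class ExperimentConfig:
    # Hypothetical base field; only the override below is visible in the hunk.
    max_autotune: bool = True

    def to_options(self) -> dict[str, Any]:
        return {"max_autotune": self.max_autotune}

@dataclass
class CutlassExperimentConfig(ExperimentConfig):
    cutlass_instantiation_level: str = "0"

    def to_options(self) -> dict[str, Any]:
        # Merge the base options, then layer on the renamed cutlass.* key.
        return {
            **super().to_options(),
            "cutlass.cutlass_instantiation_level": self.cutlass_instantiation_level,
        }

print(CutlassExperimentConfig().to_options())
```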

View File

@ -133,10 +133,10 @@ use_evt_config = config.patch(
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
"benchmark_epilogue_fusion": False, # EVT doesn't support benchmark fusion yet
"cuda.cutlass_tma_only": True,
"cuda.cutlass_epilogue_fusion_enabled": True,
"cutlass.cutlass_tma_only": True,
"cutlass.cutlass_epilogue_fusion_enabled": True,
}
)
@ -144,9 +144,9 @@ fp8_config = config.patch(
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
"benchmark_epilogue_fusion": False, # EVT doesn't support benchmark fusion yet
"cuda.cutlass_tma_only": True,
"cutlass.cutlass_tma_only": True,
}
)
@ -234,8 +234,8 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_backend_min_gemm_size": 100000,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_backend_min_gemm_size": 100000,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
with mock.patch(
@ -287,7 +287,7 @@ class TestCutlassBackend(TestCase):
"autotune_in_subproc": True,
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_max_profiling_configs": 4,
"cutlass.cutlass_max_profiling_configs": 4,
}
):
Y_compiled = torch.compile(torch.mm)(a, b)
@ -324,7 +324,7 @@ class TestCutlassBackend(TestCase):
"autotune_in_subproc": True,
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_max_profiling_configs": 4,
"cutlass.cutlass_max_profiling_configs": 4,
}
):
for x_shape in x_shapes:
@ -354,7 +354,7 @@ class TestCutlassBackend(TestCase):
"autotune_in_subproc": True,
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_max_profiling_configs": 4,
"cutlass.cutlass_max_profiling_configs": 4,
}
):
Y_compiled = torch.compile(torch.bmm)(a, b)
@ -386,7 +386,7 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
}
):
from torch._inductor.utils import run_and_get_code
@ -428,8 +428,8 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 1,
"cuda.cutlass_max_profiling_swizzle_options": [
"cutlass.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_swizzle_options": [
1,
2,
4,
@ -505,7 +505,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
),
dynamo_config.patch({"error_on_recompile": dynamic}),
@ -595,9 +595,9 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
"benchmark_epilogue_fusion": False, # EVT doesn't support benchmark fusion yet
"cuda.cutlass_tma_only": True,
"cutlass.cutlass_tma_only": True,
}
),
dynamo_config.patch({"error_on_recompile": dynamic}),
@ -677,7 +677,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
),
dynamo_config.patch({"error_on_recompile": dynamic}),
@ -746,7 +746,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
expected = [model(*input) for input in inputs]
@ -775,8 +775,8 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cuda.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
"cutlass.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
}
):
for M, K, N in (
@ -819,7 +819,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
"cutlass.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
}
):
with self.assertRaisesRegex(InductorError, r".*NoValidChoicesError.*"):
@ -849,8 +849,8 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cuda.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
"cutlass.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
}
):
_ = compiled_model(a, b)
@ -884,7 +884,7 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 4,
"cutlass.cutlass_max_profiling_configs": 4,
"cuda.version": "12.2", # required to enable the Kernels we need
}
):
@ -983,7 +983,7 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
@ -1002,7 +1002,7 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": False,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
model = MyModel()
@ -1040,7 +1040,7 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": False,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
model = MyModel()
@ -1073,8 +1073,8 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": False,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
"cutlass.cutlass_max_profiling_configs": 1,
}
):
model = MyModel()
@ -1117,7 +1117,7 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
"autotune_local_cache": True,
}
):
@ -1157,9 +1157,9 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cuda.cutlass_op_allowlist_regex": "",
"cuda.cutlass_op_denylist_regex": "pingpong",
"cutlass.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_op_allowlist_regex": "",
"cutlass.cutlass_op_denylist_regex": "pingpong",
}
):
with mock.patch(
@ -1202,9 +1202,9 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cuda.cutlass_op_allowlist_regex": "pingpong",
"cuda.cutlass_op_denylist_regex": None,
"cutlass.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_op_allowlist_regex": "pingpong",
"cutlass.cutlass_op_denylist_regex": None,
}
):
with mock.patch(
@ -1273,7 +1273,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
with mock.patch(
@ -1350,7 +1350,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
),
mock.patch(
@ -1461,8 +1461,8 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cuda.generate_test_runner": True, # put standalone runner in the generated code
"cutlass.cutlass_max_profiling_configs": 2,
"cutlass.generate_test_runner": True, # put standalone runner in the generated code
}
):
from tempfile import NamedTemporaryFile
@ -1544,7 +1544,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "ATEN,TRITON,CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
# needed for log searching
"fx_graph_cache": False,
"fx_graph_remote_cache": False,
@ -1608,8 +1608,8 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": "CUTLASS",
"fx_graph_cache": False,
"fx_graph_remote_cache": False,
"cuda.enable_caching_codegen": True,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.enable_caching_codegen": True,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
compiled_model = torch.compile(model, fullgraph=True)
@ -1660,10 +1660,10 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
"fx_graph_cache": False,
"fx_graph_remote_cache": False,
"cuda.enable_caching_codegen": True,
"cutlass.enable_caching_codegen": True,
}
):
# Get expected results
@ -1721,10 +1721,10 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
"fx_graph_cache": False,
"fx_graph_remote_cache": False,
"cuda.enable_caching_codegen": True,
"cutlass.enable_caching_codegen": True,
}
):
# Get expected results
@ -1752,7 +1752,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
compiled = torch.compile(torch.mm)
@ -1771,7 +1771,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cutlass.cutlass_max_profiling_configs": 2,
}
):
compiled = torch.compile(torch.mm)
@ -1795,7 +1795,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
}
):
_ = torch.compile(model)(B)
@ -1817,7 +1817,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
}
):
_ = torch.compile(model)(B)
@ -1845,7 +1845,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
}
):
_ = torch.compile(model)(B)
@ -1871,7 +1871,7 @@ class TestCutlassBackend(TestCase):
{
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"cutlass.cutlass_max_profiling_configs": 1,
}
):
if use_aoti:
@ -1968,7 +1968,7 @@ class TestCutlassBackend(TestCase):
# baseline is cutlass kernel + triton
# matches expected casting behavior
with config.patch({"cuda.cutlass_epilogue_fusion_enabled": False}):
with config.patch({"cutlass.cutlass_epilogue_fusion_enabled": False}):
ref_result = torch.compile(model)(a, b, extra_args)
self.assertEqual(
@ -2368,7 +2368,7 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": "CUTLASS",
# needed for log searching
"force_disable_caches": True,
"cuda.cutlass_max_profiling_swizzle_options": [2],
"cutlass.cutlass_max_profiling_swizzle_options": [2],
}
):
with mock.patch(
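
Taken together, these test changes exercise the renamed keys through `torch._inductor.config.patch`. A minimal usage sketch under the post-refactor names (shapes are illustrative; this needs a CUDA build with the CUTLASS sources from `cutlass.cutlass_dir` available):

```python
import torch
from torch._inductor import config

a = torch.randn(256, 128, device="cuda", dtype=torch.float16)
b = torch.randn(128, 256, device="cuda", dtype=torch.float16)

with config.patch(
    {
        "max_autotune": True,
        "max_autotune_gemm_backends": "CUTLASS",
        "cutlass.cutlass_max_profiling_configs": 2,
    }
):
    # Autotunes torch.mm against CUTLASS GEMM candidates.
    y = torch.compile(torch.mm)(a, b)
```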

View File

@ -34,7 +34,7 @@ from pathlib import Path
from tempfile import _TemporaryFileWrapper
from time import time, time_ns
from types import ModuleType
from typing import Any, cast, Generic, NoReturn, TYPE_CHECKING, TypeVar, Union
from typing import Any, cast, Generic, NoReturn, Optional, TYPE_CHECKING, TypeVar, Union
from typing_extensions import override, Self
import torch
@ -3741,7 +3741,7 @@ def _load_triton_kernel_from_source(
return getattr(PyCodeCache.load(source_code), kernel_name)
def _cuda_compiler() -> str | None:
def _cuda_compiler() -> Optional[str]:
if cuda_env.nvcc_exist(config.cuda.cuda_cxx):
return config.cuda.cuda_cxx
if config.is_fbcode():
@ -3759,7 +3759,7 @@ def _cutlass_path() -> str:
return parutil.get_dir_path("cutlass-4-headers")
else:
return config.cuda.cutlass_dir
return config.cutlass.cutlass_dir
def _cutlass_paths() -> list[str]:
@ -3805,7 +3805,7 @@ def cutlass_key() -> bytes:
return resource_file.read().encode()
combined_hash = hashlib.sha256()
build_code_hash([config.cuda.cutlass_dir], "", combined_hash)
build_code_hash([config.cutlass.cutlass_dir], "", combined_hash)
return combined_hash.digest()
@ -3875,14 +3875,14 @@ def _nvcc_compiler_options() -> list[str]:
"-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
"-w",
f"-gencode=arch=compute_{arch},code=[{','.join(code)}]",
config.cuda.compile_opt_level,
config.cutlass.compile_opt_level,
"-std=c++17",
"--expt-relaxed-constexpr",
"-DNDEBUG",
]
if config.is_fbcode():
options.extend(["-ccbin", os.path.dirname(build_paths.gcc)])
if config.cuda.enable_debug_info:
if config.cutlass.enable_debug_info:
options.extend(["-lineinfo", "-g", "-DCUTLASS_DEBUG_TRACE_LEVEL=1"])
if config.cuda.enable_ptxas_info:
options.extend(
@ -3894,7 +3894,7 @@ def _nvcc_compiler_options() -> list[str]:
"--source-in-ptx",
]
) # Annotate the ptx file with source information
if config.cuda.use_fast_math:
if config.cutlass.use_fast_math:
options.extend(
[
"--use_fast_math",
@ -4098,7 +4098,7 @@ class CUDACodeCache:
Returns the hash key of source code, and the path to the file.
"""
if config.cuda.cutlass_hash_with_compile_cmd:
if config.cutlass.cutlass_hash_with_compile_cmd:
cuda_command = repr(
cuda_compile_command(["dummy_input"], "dummy_output", dst_file_ext)
)
@ -4149,7 +4149,7 @@ class CUDACodeCache:
output_path = input_path[: -len(cls._SOURCE_CODE_SUFFIX)] + dst_file_ext
error_path = binary_error_path(output_path)
binary_remote_cache = cls.get_kernel_binary_remote_cache(
caching_enabled=config.cuda.use_binary_remote_cache
caching_enabled=config.cutlass.use_binary_remote_cache
and not config.force_disable_caches,
caching_available=config.is_fbcode(),
)
@ -4164,13 +4164,13 @@ class CUDACodeCache:
cmd_parts, error_output = json.loads(error_json)
if (
binary_remote_cache is not None
and config.cuda.upload_to_binary_remote_cache
and config.cutlass.upload_to_binary_remote_cache
):
# This ensures that a local error is uploaded to the remote cache,
# as we make no assumptions about the remote cache having the same
# information as the local cache
binary_remote_cache.put(
error_path, config.cuda.binary_remote_cache_force_write
error_path, config.cutlass.binary_remote_cache_force_write
)
cls.cache[key_with_ext] = CUDACodeCache.CacheEntry(
input_path, output_path, error_json
@ -4234,11 +4234,11 @@ class CUDACodeCache:
# Upload to remote cache if enabled
if (
binary_remote_cache is not None
and config.cuda.upload_to_binary_remote_cache
and config.cutlass.upload_to_binary_remote_cache
):
# will log on errors, but not fail out
binary_remote_cache.put(
output_path, config.cuda.binary_remote_cache_force_write
output_path, config.cutlass.binary_remote_cache_force_write
)
cls.cache[key_with_ext] = CUDACodeCache.CacheEntry(
input_path, output_path, None
@ -4291,10 +4291,10 @@ class CUDACodeCache:
# Upload to remote cache directly from memory if enabled
if (
binary_remote_cache is not None
and config.cuda.upload_to_binary_remote_cache
and config.cutlass.upload_to_binary_remote_cache
):
binary_remote_cache.put(
error_path, config.cuda.binary_remote_cache_force_write
error_path, config.cutlass.binary_remote_cache_force_write
)

View File

@ -257,7 +257,7 @@ size: {cuda_template_buffer.get_size()}"
)
return False
elif (
not config.cuda.cutlass_epilogue_fusion_enabled
not config.cutlass.cutlass_epilogue_fusion_enabled
or not config.epilogue_fusion
):
why("cutlass epilogue fusion is not enabled")

View File

@ -110,7 +110,7 @@ class CUDATemplate(KernelTemplate):
args are different.
"""
key: Optional[str] = None
if config.cuda.enable_caching_codegen:
if config.cutlass.enable_caching_codegen:
key = self.make_key(name=name, input_key=input_key, layout_repr=layout_repr)
if key is not None and key in self.code_cache:

View File

@ -75,7 +75,7 @@ def maybe_fetch_ops() -> Optional[list[Any]]:
# get_cuda_version might return "12.4.0" or "12.4"
# but we want to use "12.4"
version: str = ".".join(get_cuda_version().split(".")[:2])
instantiation_level: str = config.cuda.cutlass_instantiation_level
instantiation_level: str = config.cutlass.cutlass_instantiation_level
# filename and filepath
request_key: str = get_config_request_key(arch, version, instantiation_level)
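
The version normalization above is self-contained enough to restate in isolation; `major_minor` is a hypothetical helper name:

```python
def major_minor(version: str) -> str:
    # "12.4.0" -> "12.4"; a two-component "12.4" passes through unchanged.
    return ".".join(version.split(".")[:2])

assert major_minor("12.4.0") == "12.4"
assert major_minor("12.4") == "12.4"
```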

View File

@ -98,7 +98,7 @@ def try_import_cutlass() -> bool:
# contains both cutlass and cutlass_library
# we need cutlass for eVT
cutlass_python_path = path_join(config.cuda.cutlass_dir, "python")
cutlass_python_path = path_join(config.cutlass.cutlass_dir, "python")
torch_root = os.path.abspath(os.path.dirname(torch.__file__))
mock_src_path = os.path.join(
torch_root,
@ -252,7 +252,7 @@ def _gen_ops_cached(arch, version) -> dict[Any, Any]:
)
return {}
arch = _normalize_cuda_arch(arch)
instantiation_level: str = config.cuda.cutlass_instantiation_level
instantiation_level: str = config.cutlass.cutlass_instantiation_level
args = CUTLASSArgs(
architectures=arch,
cuda_version=version,

View File

@ -19,7 +19,7 @@ from torch._inductor.select_algorithm import create_inputs_key
from torch._inductor.utils import clear_on_fresh_cache
from ... import ir
from ...config import cuda as inductor_cuda_config
from ...config import cutlass as inductor_cutlass_config
from ...ir import (
Buffer,
ChoiceCaller,
@ -578,7 +578,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
for name, op in ops:
for (
swizzle
) in inductor_cuda_config.cutlass_max_profiling_swizzle_options:
) in inductor_cutlass_config.cutlass_max_profiling_swizzle_options:
description = f"{name} swizzle={swizzle}"
self.maybe_append_choice(
choices,
@ -635,7 +635,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
#include "cutlass/util/tensor_view_io.h"
"""
)
if inductor_cuda_config.generate_test_runner and not is_dynamic(
if inductor_cutlass_config.generate_test_runner and not is_dynamic(
*self.input_nodes, self.output_node
):
res.splice(GEMM_STANDALONE_RUNNER_ADDITIONAL_INCLUDES)
@ -953,7 +953,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
)
return None
if inductor_cuda_config.cutlass_tma_only and not self._has_tma_epilogue(op):
if inductor_cutlass_config.cutlass_tma_only and not self._has_tma_epilogue(op):
return None
# Set epilogue.
@ -975,14 +975,16 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
return None
# Apply regex filters at the end when configuration name doesn't change anymore
if inductor_cuda_config.cutlass_op_allowlist_regex:
if inductor_cutlass_config.cutlass_op_allowlist_regex:
if not re.search(
inductor_cuda_config.cutlass_op_allowlist_regex, op.configuration_name()
inductor_cutlass_config.cutlass_op_allowlist_regex,
op.configuration_name(),
):
return None
if inductor_cuda_config.cutlass_op_denylist_regex is not None:
if inductor_cutlass_config.cutlass_op_denylist_regex is not None:
if re.search(
inductor_cuda_config.cutlass_op_denylist_regex, op.configuration_name()
inductor_cutlass_config.cutlass_op_denylist_regex,
op.configuration_name(),
):
return None
@ -1035,7 +1037,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
time.time() - start_time,
)
sorted_res = sorted(res.items())
ret_res = sorted_res[: inductor_cuda_config.cutlass_max_profiling_configs]
ret_res = sorted_res[: inductor_cutlass_config.cutlass_max_profiling_configs]
if len(self.filtered_ops_cache) < 50:
self.filtered_ops_cache[self.cache_key] = ret_res
else:
@ -1277,7 +1279,9 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
}
options.update(dict(zip(extra_names, extra_inputs)))
res = self._template_from_string(self._get_template()).render(**options)
if inductor_cuda_config.generate_test_runner and not is_dynamic(X, W, Y, Bias):
if inductor_cutlass_config.generate_test_runner and not is_dynamic(
X, W, Y, Bias
):
test_runner_code = self._template_from_string(
GEMM_STANDALONE_RUNNER_TEMPLATE
).render(**options)

View File

@ -1767,28 +1767,12 @@ class aot_inductor_mode:
compile_standalone: bool = False
class cuda:
"""Settings for cuda backend, today this consists of cutlass"""
# CUDA arch to use for CUDA template kernel compilation.
# e.g. "70", "75", "80", "90", etc.
# When arch is None, Inductor uses torch.cuda.get_device_capability(0).
arch: Optional[str] = None
# CUDA version to use for CUDA template kernel compilation.
# e.g. "11.4", "12.1", etc.
# When version is None, Inductor uses torch.version.cuda.
version: Optional[str] = None
class cutlass:
"""Settings for cutlass backend, today this consists of cutlass"""
# Optimization level for the host compiler.
compile_opt_level: Literal["-O0", "-O1", "-O2", "-O3", "-OS"] = "-O1"
# Whether to enable device LTO (link-time-optimization).
enable_cuda_lto = False
# Whether to keep intermediate files during compilation.
enable_ptxas_info = False
# Whether to enable debug info, e.g. line number, cutlass debug info.
enable_debug_info = False
@ -1800,7 +1784,10 @@ class cuda:
cutlass_dir = os.path.realpath(
os.environ.get(
"TORCHINDUCTOR_CUTLASS_DIR",
os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/"),
os.path.join(
os.path.dirname(torch.__file__),
"../third_party/cutlass/",
),
)
)
@ -1820,14 +1807,6 @@ class cuda:
# Whether to only use TMA-compatible kernels in CUTLASS
cutlass_tma_only = False
# Path to CUDA NVCC.
# NVCC search order:
# 1) cuda_cxx set in this config
# 2) CUDACXX environment variable
# 3) CUDA_HOME environment variable
# 4) default system search PATH.
cuda_cxx: Optional[str] = None
# Minimum value of M*N*K to consider the CUTLASS backend for GEMM ops.
cutlass_backend_min_gemm_size: int = 1
@ -1897,6 +1876,43 @@ class cuda:
enable_caching_codegen: bool = True
class cuda(cutlass):
"""Settings for cutlass backend, today this consists of cutlass"""
# CUDA arch to use for CUDA template kernel compilation.
# e.g. "70", "75", "80", "90", etc.
# When arch is None, Inductor uses torch.cuda.get_device_capability(0).
arch: Optional[str] = None
# CUDA version to use for CUDA template kernel compilation.
# e.g. "11.4", "12.1", etc.
# When version is None, Inductor uses torch.version.cuda.
version: Optional[str] = None
# Path to CUDA NVCC.
# NVCC search order:
# 1) cuda_cxx set in this config
# 2) CUDACXX environment variable
# 3) CUDA_HOME environment variable
# 4) default system search PATH.
cuda_cxx: Optional[str] = None
# Whether to enable device LTO (link-time-optimization).
enable_cuda_lto = False
# Whether to keep intermediate files during compilation.
enable_ptxas_info = False
class xpu(cutlass):
# Xe arch to use for SYCL template kernel compilation.
# e.g. 12 or 20, corresponding to Xe12 (PVC) and Xe20 (BMG)
arch: Optional[str] = None
# oneAPI version to use for SYCL template kernel compilation.
# e.g. "20250201".
version: Optional[str] = None
class rocm:
# Offload arch list for device code compilation, e.g. ["gfx90a", "gfx942"].
# If empty, the `native` arch is used
@ -2105,7 +2121,7 @@ _cache_config_ignore_prefix: list[str] = [
# trace functions are not relevant to config caching
"trace",
# uses absolute path
"cuda.cutlass_dir",
"cutlass.cutlass_dir",
# not relevant
"worker_start_method",
"compile_threads",

View File

@ -480,7 +480,7 @@ MODULE_DEFAULTS: dict[str, ConfigType] = {
"aot_inductor.presets": DEFAULT, # Typing
"cuda.arch": DEFAULT, # Out of Scope
"cuda.version": DEFAULT, # Out of Scope
"cuda.cutlass_dir": DEFAULT, # Out of Scope
"cutlass.cutlass_dir": DEFAULT, # Out of Scope
"cuda.cuda_cxx": DEFAULT, # Out of Scope
"rocm.arch": DEFAULT, # Out of Scope
"rocm.ck_supported_arch": DEFAULT, # Out of Scope

View File

@ -3481,8 +3481,8 @@ class AlgorithmSelectorCache(PersistentCache):
candidates = []
if (
config.cuda.cutlass_prescreening
and len(config.cuda.cutlass_max_profiling_swizzle_options) > 1
config.cutlass.cutlass_prescreening
and len(config.cutlass.cutlass_max_profiling_swizzle_options) > 1
):
candidates.extend(
[

View File

@ -1993,7 +1993,7 @@ def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
from .virtualized import V
gemm_size = V.graph.sizevars.size_hint(m * n * k, fallback=-1)
if gemm_size <= 0 or gemm_size < config.cuda.cutlass_backend_min_gemm_size:
if gemm_size <= 0 or gemm_size < config.cutlass.cutlass_backend_min_gemm_size:
return False
from .codegen.cuda.cutlass_utils import try_import_cutlass
@ -2014,9 +2014,9 @@ def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
if not try_import_cutlass():
log.warning(
"Failed to import CUTLASS lib. Please check whether "
"_inductor.config.cuda.cutlass_dir %s is set correctly. "
"_inductor.config.cutlass.cutlass_dir %s is set correctly. "
"Skipping CUTLASS backend for now.",
config.cuda.cutlass_dir,
config.cutlass.cutlass_dir,
)
return False
return res
@ -2024,7 +2024,7 @@ def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
def _use_cutlass_for_op(op_name: str) -> bool:
"""Check if CUTLASS should be used for the given operation."""
enabled_ops = config.cuda.cutlass_enabled_ops.upper()
enabled_ops = config.cutlass.cutlass_enabled_ops.upper()
if enabled_ops == "ALL":
return True
return op_name.upper() in [x.strip() for x in enabled_ops.split(",")]
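
`_use_cutlass_for_op` is small enough to restate standalone; this sketch mirrors the logic above with the config lookup replaced by an explicit parameter so the parsing is easy to test:

```python
# "ALL" enables every op; otherwise a comma-separated, case-insensitive list.
def use_cutlass_for_op(op_name: str, enabled_ops: str) -> bool:
    enabled = enabled_ops.upper()
    if enabled == "ALL":
        return True
    return op_name.upper() in [x.strip() for x in enabled.split(",")]

assert use_cutlass_for_op("mm", "ALL")
assert use_cutlass_for_op("addmm", "mm, addmm")
assert not use_cutlass_for_op("bmm", "mm,addmm")
```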