mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-02 06:24:59 +08:00)

Compare commits: ci_attn...ciflow/b20 (27 commits)
Commit SHAs in this compare (27):

ea79bcf9c5, 02eef6b8b6, 658f05ab90, 713b9e40d0, 1a8cb2c98f, 3ef674fe75,
d13fc2dd9d, a6f51e6d02, a521ffb214, b0d2b3f6a4, a99a09a945, 4479694979,
1df8fd081e, b06b512a2d, 7dc23f1fe1, 1283c15de0, d87a398e50, 0ad432492c,
6d9cbbded9, 7aad0baaac, 36fbc8b6d7, 0481049a2f, 08f5fe8139, 8604af4bfa,
f4ef77b220, 44789dc58d, 2ddfc76a10
@@ -125,7 +125,7 @@ class CutlassExperimentConfig(ExperimentConfig):
     def to_options(self) -> dict[str, Any]:
         return {
             **super().to_options(),
-            "cuda.cutlass_instantiation_level": self.cutlass_instantiation_level,
+            "cutlass.cutlass_instantiation_level": self.cutlass_instantiation_level,
         }
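The hunk above, and most of the diff that follows, is a mechanical rename: every `cuda.cutlass_*` Inductor option moves to a new `cutlass.*` namespace, and the `torch._inductor.codegen.cuda.cutlass_*` modules move under `torch._inductor.codegen.cutlass`. A minimal sketch of what the migration looks like for user code that patches these options (the tensor shapes and the instantiation-level value are illustrative, and a CUDA device with the CUTLASS backend available is assumed):

    import torch
    import torch._inductor.config as inductor_config

    # Before this change: "cuda.cutlass_instantiation_level"
    # After this change:  "cutlass.cutlass_instantiation_level"
    with inductor_config.patch(
        {
            "max_autotune": True,
            "max_autotune_gemm_backends": "CUTLASS",
            "cutlass.cutlass_instantiation_level": "0",  # value is illustrative
        }
    ):
        a = torch.randn(128, 64, device="cuda", dtype=torch.half)
        b = torch.randn(64, 128, device="cuda", dtype=torch.half)
        y = torch.compile(torch.mm)(a, b)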
@@ -14,7 +14,9 @@ from pathlib import Path
 from typing import Optional
 
 from torch._dynamo.exc import BackendCompilerFailed
-from torch._inductor.codegen.cuda.serialization import get_cutlass_operation_serializer
+from torch._inductor.codegen.cutlass.serialization import (
+    get_cutlass_operation_serializer,
+)
 from torch._inductor.utils import clear_caches
 from torch.export import Dim
 from torch.testing._internal.logging_utils import log_settings
@@ -32,11 +34,8 @@ import torch.version
 from torch._dynamo import config as dynamo_config
 from torch._dynamo.utils import counters
 from torch._inductor import config
-from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
-from torch._inductor.codegen.cuda.cutlass_utils import (
-    _gen_ops_cached,
-    get_max_alignment,
-)
+from torch._inductor.codegen.cutlass.cuda_kernel import CUDATemplateCaller
+from torch._inductor.codegen.cutlass.utils import _gen_ops_cached, get_max_alignment
 from torch._inductor.exc import InductorError
 from torch._inductor.ir import FixedLayout
 from torch._inductor.select_algorithm import NoValidChoicesError
@@ -133,10 +132,10 @@ use_evt_config = config.patch(
     {
         "max_autotune": True,
         "max_autotune_gemm_backends": "CUTLASS",
-        "cuda.cutlass_max_profiling_configs": 1,
+        "cutlass.cutlass_max_profiling_configs": 1,
         "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
-        "cuda.cutlass_tma_only": True,
-        "cuda.cutlass_epilogue_fusion_enabled": True,
+        "cutlass.cutlass_tma_only": True,
+        "cutlass.cutlass_epilogue_fusion_enabled": True,
     }
 )
 
@@ -144,9 +143,9 @@ fp8_config = config.patch(
     {
         "max_autotune": True,
         "max_autotune_gemm_backends": "CUTLASS",
-        "cuda.cutlass_max_profiling_configs": 1,
+        "cutlass.cutlass_max_profiling_configs": 1,
         "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
-        "cuda.cutlass_tma_only": True,
+        "cutlass.cutlass_tma_only": True,
     }
 )
 
@@ -206,7 +205,7 @@ class TestCutlassBackend(TestCase):
     def test_check_paths(self):
         cutlass_mock_imports_path = os.path.join(
             os.path.dirname(torch.__file__),
-            "_inductor/codegen/cuda/cutlass_lib_extensions/cutlass_mock_imports",
+            "_inductor/codegen/cutlass/lib_extensions/cutlass_mock_imports",
         )
         cutlass_mock_cuda_path = os.path.join(cutlass_mock_imports_path, "cuda")
         cutlass_mock_pydot_path = os.path.join(cutlass_mock_imports_path, "pydot")
@@ -234,8 +233,8 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
-                "cuda.cutlass_backend_min_gemm_size": 100000,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_backend_min_gemm_size": 100000,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             with mock.patch(
@@ -251,7 +250,7 @@ class TestCutlassBackend(TestCase):
 
     @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_import_cutlass(self):
-        from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
+        from torch._inductor.codegen.cutlass.utils import try_import_cutlass
 
         self.assertTrue(try_import_cutlass())
 
@@ -259,7 +258,7 @@ class TestCutlassBackend(TestCase):
         import cutlass_library  # noqa: F401
 
     def test_cutlass_key(self):
-        from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
+        from torch._inductor.codegen.cutlass.utils import try_import_cutlass
 
         self.assertTrue(try_import_cutlass())
         from torch._inductor.codecache import cutlass_key
@@ -287,7 +286,7 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
-                "cuda.cutlass_max_profiling_configs": 4,
+                "cutlass.cutlass_max_profiling_configs": 4,
             }
         ):
             Y_compiled = torch.compile(torch.mm)(a, b)
@@ -324,7 +323,7 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
-                "cuda.cutlass_max_profiling_configs": 4,
+                "cutlass.cutlass_max_profiling_configs": 4,
             }
         ):
             for x_shape in x_shapes:
@@ -354,7 +353,7 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
-                "cuda.cutlass_max_profiling_configs": 4,
+                "cutlass.cutlass_max_profiling_configs": 4,
             }
         ):
             Y_compiled = torch.compile(torch.bmm)(a, b)
@@ -386,7 +385,7 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_max_profiling_configs": 1,
             }
         ):
             from torch._inductor.utils import run_and_get_code
@@ -428,8 +427,8 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 1,
-                "cuda.cutlass_max_profiling_swizzle_options": [
+                "cutlass.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_max_profiling_swizzle_options": [
                     1,
                     2,
                     4,
@@ -505,7 +504,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ),
         dynamo_config.patch({"error_on_recompile": dynamic}),
@@ -595,9 +594,9 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
                 "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
-                "cuda.cutlass_tma_only": True,
+                "cutlass.cutlass_tma_only": True,
             }
         ),
         dynamo_config.patch({"error_on_recompile": dynamic}),
@@ -677,7 +676,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ),
         dynamo_config.patch({"error_on_recompile": dynamic}),
@@ -746,7 +745,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             expected = [model(*input) for input in inputs]
@@ -775,8 +774,8 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
-                "cuda.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
+                "cutlass.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
             }
         ):
             for M, K, N in (
@@ -819,7 +818,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
+                "cutlass.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
             }
         ):
             with self.assertRaisesRegex(InductorError, r".*NoValidChoicesError.*"):
@@ -849,8 +848,8 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 1,
-                "cuda.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
+                "cutlass.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
             }
         ):
             _ = compiled_model(a, b)
@@ -884,8 +883,8 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 4,
-                "cuda.version": "12.2",  # required to enable the Kernels we need
+                "cutlass.cutlass_max_profiling_configs": 4,
+                "cuda.cuda_version": "12.2",  # required to enable the Kernels we need
             }
         ):
             counters["inductor"]["cuda_epilogue_fusion_counter"] = 0
@@ -983,7 +982,7 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
@@ -1002,7 +1001,7 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             model = MyModel()
@@ -1040,7 +1039,7 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             model = MyModel()
@@ -1073,8 +1072,8 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
-                "cuda.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
+                "cutlass.cutlass_max_profiling_configs": 1,
             }
         ):
             model = MyModel()
@@ -1117,7 +1116,7 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
                 "autotune_local_cache": True,
             }
         ):
@@ -1157,9 +1156,9 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
-                "cuda.cutlass_op_allowlist_regex": "",
-                "cuda.cutlass_op_denylist_regex": "pingpong",
+                "cutlass.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_op_allowlist_regex": "",
+                "cutlass.cutlass_op_denylist_regex": "pingpong",
             }
         ):
             with mock.patch(
@@ -1202,9 +1201,9 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
-                "cuda.cutlass_op_allowlist_regex": "pingpong",
-                "cuda.cutlass_op_denylist_regex": None,
+                "cutlass.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_op_allowlist_regex": "pingpong",
+                "cutlass.cutlass_op_denylist_regex": None,
             }
         ):
             with mock.patch(
@@ -1273,7 +1272,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             with mock.patch(
@@ -1350,7 +1349,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ),
         mock.patch(
@@ -1461,13 +1460,13 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
-                "cuda.generate_test_runner": True,  # put standalone runner in the generated code
+                "cutlass.cutlass_max_profiling_configs": 2,
+                "cutlass.generate_test_runner": True,  # put standalone runner in the generated code
             }
         ):
             from tempfile import NamedTemporaryFile
 
-            from torch._inductor.codegen.cuda.cutlass_utils import (
+            from torch._inductor.codegen.cutlass.utils import (
                 cuda_standalone_runner_compile_command,
                 CUDACompileSourceCapturingContext,
             )
@@ -1544,7 +1543,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "ATEN,TRITON,CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
                 # needed for log searching
                 "fx_graph_cache": False,
                 "fx_graph_remote_cache": False,
@@ -1553,7 +1552,7 @@ class TestCutlassBackend(TestCase):
         with (
             log_settings("+inductor"),
             self.assertLogs(
-                logger="torch._inductor.codegen.cuda", level=logging.DEBUG
+                logger="torch._inductor.codegen.cutlass", level=logging.DEBUG
             ) as test_log,
         ):
             Y_compiled = torch.compile(mm, dynamic=False)(a, b)
@@ -1591,7 +1590,7 @@ class TestCutlassBackend(TestCase):
         expected = model(A, B)
 
         # Track render calls
-        from torch._inductor.codegen.cuda.gemm_template import CUTLASSGemmTemplate
+        from torch._inductor.codegen.cutlass.gemm_template import CUTLASSGemmTemplate
 
         original_render = CUTLASSGemmTemplate.render
         render_call_count = 0
@@ -1608,8 +1607,8 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "fx_graph_cache": False,
                 "fx_graph_remote_cache": False,
-                "cuda.enable_caching_codegen": True,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.enable_caching_codegen": True,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             compiled_model = torch.compile(model, fullgraph=True)
@@ -1645,7 +1644,7 @@ class TestCutlassBackend(TestCase):
         d = torch.randn(64, 128).cuda().half().t()
 
         # Track render calls
-        from torch._inductor.codegen.cuda.gemm_template import CUTLASSGemmTemplate
+        from torch._inductor.codegen.cutlass.gemm_template import CUTLASSGemmTemplate
 
         original_render = CUTLASSGemmTemplate.render
         render_call_count = 0
@@ -1660,10 +1659,10 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
                 "fx_graph_cache": False,
                 "fx_graph_remote_cache": False,
-                "cuda.enable_caching_codegen": True,
+                "cutlass.enable_caching_codegen": True,
             }
         ):
             # Get expected results
@@ -1706,7 +1705,7 @@ class TestCutlassBackend(TestCase):
         b = torch.randn(32, 64).cuda().half().t()
 
         # Track render calls
-        from torch._inductor.codegen.cuda.gemm_template import CUTLASSGemmTemplate
+        from torch._inductor.codegen.cutlass.gemm_template import CUTLASSGemmTemplate
 
         original_render = CUTLASSGemmTemplate.render
         render_call_count = 0
@@ -1721,10 +1720,10 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
                 "fx_graph_cache": False,
                 "fx_graph_remote_cache": False,
-                "cuda.enable_caching_codegen": True,
+                "cutlass.enable_caching_codegen": True,
             }
         ):
             # Get expected results
@@ -1752,7 +1751,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             compiled = torch.compile(torch.mm)
@@ -1771,7 +1770,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
-                "cuda.cutlass_max_profiling_configs": 2,
+                "cutlass.cutlass_max_profiling_configs": 2,
             }
         ):
             compiled = torch.compile(torch.mm)
@@ -1795,7 +1794,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_max_profiling_configs": 1,
             }
         ):
             _ = torch.compile(model)(B)
@@ -1817,7 +1816,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_max_profiling_configs": 1,
             }
         ):
             _ = torch.compile(model)(B)
@@ -1845,7 +1844,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_max_profiling_configs": 1,
             }
         ):
             _ = torch.compile(model)(B)
@@ -1871,7 +1870,7 @@ class TestCutlassBackend(TestCase):
             {
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 1,
+                "cutlass.cutlass_max_profiling_configs": 1,
             }
         ):
             if use_aoti:
@@ -1968,7 +1967,7 @@ class TestCutlassBackend(TestCase):
 
         # baseline is cutlass kernel + triton
        # matches expected casting behavior
-        with config.patch({"cuda.cutlass_epilogue_fusion_enabled": False}):
+        with config.patch({"cutlass.cutlass_epilogue_fusion_enabled": False}):
             ref_result = torch.compile(model)(a, b, extra_args)
 
         self.assertEqual(
@@ -2368,7 +2367,7 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 # needed for log searching
                 "force_disable_caches": True,
-                "cuda.cutlass_max_profiling_swizzle_options": [2],
+                "cutlass.cutlass_max_profiling_swizzle_options": [2],
             }
         ):
             with mock.patch(
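Taken together, the test hunks above only re-spell option keys; the tested behavior is unchanged. As a usage sketch under the new namespace (shapes are illustrative, and a CUDA device with CUTLASS available is assumed), autotuning can profile several swizzle values per kernel:

    import torch
    import torch._inductor.config as config

    with config.patch(
        {
            "max_autotune": True,
            "max_autotune_gemm_backends": "CUTLASS",
            "cutlass.cutlass_max_profiling_configs": 2,
            "cutlass.cutlass_max_profiling_swizzle_options": [1, 2, 4],
        }
    ):
        a = torch.randn(256, 128, device="cuda", dtype=torch.half)
        b = torch.randn(128, 256, device="cuda", dtype=torch.half)
        y = torch.compile(torch.mm)(a, b)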
@@ -5,7 +5,7 @@ import sympy
 
 import torch
 from torch._dynamo.test_case import TestCase
-from torch._inductor.codegen.cuda.cutlass_utils import (
+from torch._inductor.codegen.cutlass.utils import (
     torch_dtype_to_cutlass_type,
     try_import_cutlass,
 )
@@ -28,7 +28,7 @@ if try_import_cutlass():
     DataType = cutlass_lib.DataType
     from cutlass_cppgen.backend.evt.ir.tensor import Tensor as CutlassTensor
 
-    from torch._inductor.codegen.cuda.cutlass_lib_extensions.evt_extensions import (
+    from torch._inductor.codegen.cutlass.lib_extensions.evt_extensions import (
         _render_argument_type,
         _trace,
         trace,
@@ -107,7 +107,7 @@ class TestCutlassEVT(TestCase):
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_py_codegen_accumulator_return(self):
-        from torch._inductor.codegen.cuda.cutlass_python_evt import CutlassEVTCodegen
+        from torch._inductor.codegen.cutlass.python_evt import CutlassEVTCodegen
         from torch._inductor.virtualized import V
 
         size = (100, 300, 200)
@@ -164,7 +164,7 @@ return tmp_0, tmp_2, D""",
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_py_codegen_disjoint_read_indexing(self):
-        from torch._inductor.codegen.cuda.cutlass_python_evt import CutlassEVTCodegen
+        from torch._inductor.codegen.cutlass.python_evt import CutlassEVTCodegen
         from torch._inductor.virtualized import V
 
         size = (100, 300, 200)
@@ -213,7 +213,7 @@ index strides [200, 60000, 1], and layout stride [60000, 200, 1]""",
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_py_codegen_broadcasting(self):
-        from torch._inductor.codegen.cuda.cutlass_python_evt import CutlassEVTCodegen
+        from torch._inductor.codegen.cutlass.python_evt import CutlassEVTCodegen
         from torch._inductor.virtualized import V
 
         size = (100, 300, 200)
@@ -273,7 +273,7 @@ return tmp_0, tmp_2, D""",
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_py_codegen(self):
-        from torch._inductor.codegen.cuda.cutlass_python_evt import CutlassEVTCodegen
+        from torch._inductor.codegen.cutlass.python_evt import CutlassEVTCodegen
         from torch._inductor.virtualized import V
 
         size = (100, 300, 200)
@@ -329,7 +329,7 @@ return tmp_1, D""",
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_example_tensor_creation(self):
-        from torch._inductor.codegen.cuda.cutlass_lib_extensions.evt_extensions import (
+        from torch._inductor.codegen.cutlass.lib_extensions.evt_extensions import (
             create_example_tensors,
         )
         from torch._inductor.virtualized import V
@@ -292,7 +292,7 @@ class TestPublicBindings(TestCase):
         # do not get imported by public code.
         # DO NOT add public modules here.
         private_allowlist = {
-            "torch._inductor.codegen.cuda.cuda_kernel",
+            "torch._inductor.codegen.cutlass.kernel",
             # TODO(#133647): Remove the onnx._internal entries after
             # onnx and onnxscript are installed in CI.
             "torch.onnx._internal.exporter",
@@ -357,8 +357,8 @@ class TestPublicBindings(TestCase):
             "torch.testing._internal.distributed.rpc.rpc_test",
             "torch.testing._internal.distributed.rpc.tensorpipe_rpc_agent_test_fixture",
             "torch.testing._internal.distributed.rpc_utils",
-            "torch._inductor.codegen.cuda.cuda_template",
-            "torch._inductor.codegen.cuda.gemm_template",
+            "torch._inductor.codegen.cutlass.template",
+            "torch._inductor.codegen.cutlass.gemm_template",
             "torch._inductor.codegen.cpp_template",
             "torch._inductor.codegen.cpp_gemm_template",
             "torch._inductor.codegen.cpp_micro_gemm",
@@ -264,4 +264,4 @@
     "torch/_inductor/utils.py": {
         "class IndentedBuffer": 145
     }
 }
 }
@@ -34,7 +34,17 @@ from pathlib import Path
 from tempfile import _TemporaryFileWrapper
 from time import time, time_ns
 from types import ModuleType
-from typing import Any, Callable, cast, Generic, NoReturn, TYPE_CHECKING, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Generic,
+    NoReturn,
+    Optional,
+    TYPE_CHECKING,
+    TypeVar,
+    Union,
+)
 from typing_extensions import override, Self
 
 import torch
@@ -3686,7 +3696,7 @@ def _load_triton_kernel_from_source(
     return getattr(PyCodeCache.load(source_code), kernel_name)
 
 
-def _cuda_compiler() -> str | None:
+def _cuda_compiler() -> Optional[str]:
     if cuda_env.nvcc_exist(config.cuda.cuda_cxx):
         return config.cuda.cuda_cxx
     if config.is_fbcode():
@@ -3704,7 +3714,7 @@ def _cutlass_path() -> str:
 
         return parutil.get_dir_path("cutlass-4-headers")
     else:
-        return config.cuda.cutlass_dir
+        return config.cutlass.cutlass_dir
 
 
 def _cutlass_paths() -> list[str]:
@@ -3750,7 +3760,7 @@ def cutlass_key() -> bytes:
             return resource_file.read().encode()
 
     combined_hash = hashlib.sha256()
-    build_code_hash([config.cuda.cutlass_dir], "", combined_hash)
+    build_code_hash([config.cutlass.cutlass_dir], "", combined_hash)
     return combined_hash.digest()
 
 
@@ -3820,14 +3830,14 @@ def _nvcc_compiler_options() -> list[str]:
         "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
         "-w",
         f"-gencode=arch=compute_{arch},code=[{','.join(code)}]",
-        config.cuda.compile_opt_level,
+        config.cutlass.compile_opt_level,
         "-std=c++17",
         "--expt-relaxed-constexpr",
         "-DNDEBUG",
     ]
     if config.is_fbcode():
         options.extend(["-ccbin", os.path.dirname(build_paths.gcc)])
-    if config.cuda.enable_debug_info:
+    if config.cutlass.enable_debug_info:
         options.extend(["-lineinfo", "-g", "-DCUTLASS_DEBUG_TRACE_LEVEL=1"])
     if config.cuda.enable_ptxas_info:
         options.extend(
@@ -3839,7 +3849,7 @@ def _nvcc_compiler_options() -> list[str]:
                 "--source-in-ptx",
             ]
         )  # Annotate the ptx file with source information
-    if config.cuda.use_fast_math:
+    if config.cutlass.use_fast_math:
         options.extend(
             [
                 "--use_fast_math",
@@ -4043,7 +4053,7 @@ class CUDACodeCache:
         Returns the hash key of source code, and the path to the file.
         """
 
-        if config.cuda.cutlass_hash_with_compile_cmd:
+        if config.cutlass.cutlass_hash_with_compile_cmd:
             cuda_command = repr(
                 cuda_compile_command(["dummy_input"], "dummy_output", dst_file_ext)
             )
@@ -4094,7 +4104,7 @@ class CUDACodeCache:
         output_path = input_path[: -len(cls._SOURCE_CODE_SUFFIX)] + dst_file_ext
         error_path = binary_error_path(output_path)
         binary_remote_cache = cls.get_kernel_binary_remote_cache(
-            caching_enabled=config.cuda.use_binary_remote_cache
+            caching_enabled=config.cutlass.use_binary_remote_cache
             and not config.force_disable_caches,
             caching_available=config.is_fbcode(),
         )
@@ -4109,13 +4119,13 @@ class CUDACodeCache:
             cmd_parts, error_output = json.loads(error_json)
             if (
                 binary_remote_cache is not None
-                and config.cuda.upload_to_binary_remote_cache
+                and config.cutlass.upload_to_binary_remote_cache
             ):
                 # This ensures that a local error is uploaded to the remote cache,
                 # as we make no assumptions about the remote cache having the same
                 # information as the local cache
                 binary_remote_cache.put(
-                    error_path, config.cuda.binary_remote_cache_force_write
+                    error_path, config.cutlass.binary_remote_cache_force_write
                 )
             cls.cache[key_with_ext] = CUDACodeCache.CacheEntry(
                 input_path, output_path, error_json
@@ -4179,11 +4189,11 @@ class CUDACodeCache:
         # Upload to remote cache if enabled
         if (
             binary_remote_cache is not None
-            and config.cuda.upload_to_binary_remote_cache
+            and config.cutlass.upload_to_binary_remote_cache
         ):
             # will log on errors, but not fail out
             binary_remote_cache.put(
-                output_path, config.cuda.binary_remote_cache_force_write
+                output_path, config.cutlass.binary_remote_cache_force_write
             )
         cls.cache[key_with_ext] = CUDACodeCache.CacheEntry(
             input_path, output_path, None
@@ -4236,10 +4246,10 @@ class CUDACodeCache:
         # Upload to remote cache directly from memory if enabled
         if (
             binary_remote_cache is not None
-            and config.cuda.upload_to_binary_remote_cache
+            and config.cutlass.upload_to_binary_remote_cache
         ):
             binary_remote_cache.put(
-                error_path, config.cuda.binary_remote_cache_force_write
+                error_path, config.cutlass.binary_remote_cache_force_write
            )
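The `cutlass_key()` hunk above keeps the same invalidation scheme — hash the CUTLASS sources so cached kernel binaries are rebuilt when the library changes — and only reads the directory from `config.cutlass` instead of `config.cuda`. A simplified, self-contained sketch of that style of directory hashing (an illustration, not Inductor's actual `build_code_hash` helper):

    import hashlib
    import os

    def dir_code_hash(root: str) -> bytes:
        # Fold every Python source under `root`, in a stable order,
        # into a single SHA-256 digest.
        combined = hashlib.sha256()
        for dirpath, _dirnames, filenames in sorted(os.walk(root)):
            for fname in sorted(filenames):
                if fname.endswith(".py"):
                    with open(os.path.join(dirpath, fname), "rb") as f:
                        combined.update(f.read())
        return combined.digest()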
@@ -2618,7 +2618,7 @@ class CSEProxy(DefaultHandler):
         """
         from ..bounds import ValueRangeAnalysis
         from ..select_algorithm import TritonTemplateKernel
-        from .cuda.cuda_kernel import CUDATemplateKernel
+        from .cutlass.cuda_kernel import CUDATemplateKernel
 
         if isinstance(V.kernel, TritonTemplateKernel):
             return ValueRanges.unknown()
@@ -4,7 +4,7 @@ import logging
 from collections.abc import Sequence
 from typing import cast
 
-from torch._inductor.codegen.cuda.cutlass_python_evt import (
+from torch._inductor.codegen.cutlass.python_evt import (
     CutlassEVTCodegen,
     MockCutlassHandler,
 )
@@ -257,7 +257,7 @@ size: {cuda_template_buffer.get_size()}"
             )
             return False
         elif (
-            not config.cuda.cutlass_epilogue_fusion_enabled
+            not config.cutlass.cutlass_epilogue_fusion_enabled
             or not config.epilogue_fusion
         ):
             why("cutlass epilogue fusion is not enabled")
@@ -267,9 +267,7 @@ size: {cuda_template_buffer.get_size()}"
             return False
 
         try:
-            from torch._inductor.codegen.cuda.cutlass_python_evt import (
-                CutlassEVTCodegen,
-            )
+            from torch._inductor.codegen.cutlass.python_evt import CutlassEVTCodegen
 
             CutlassEVTCodegen.ir_to_evt_python_code(
                 cuda_template_buffer.get_name(),
@@ -10,9 +10,11 @@ from typing import Any, Optional
 
 import torch._inductor.config as config
 from torch._inductor.codecache import cutlass_key
-from torch._inductor.codegen.cuda import cutlass_utils, serialization
 from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch, get_cuda_version
-from torch._inductor.codegen.cuda.serialization import get_cutlass_operation_serializer
+from torch._inductor.codegen.cutlass import serialization, utils
+from torch._inductor.codegen.cutlass.serialization import (
+    get_cutlass_operation_serializer,
+)
 from torch._inductor.runtime.cache_dir_utils import cache_dir
 from torch._inductor.utils import clear_on_fresh_cache
@@ -39,7 +41,7 @@ def get_config_request_key(
             return hashlib.sha256(f.read()).hexdigest()
 
     serialization_hash = get_file_hash(serialization)
-    cutlass_utils_hash = get_file_hash(cutlass_utils)
+    cutlass_utils_hash = get_file_hash(utils)
 
     hash_target = "-".join(
         [
@@ -75,7 +77,7 @@ def maybe_fetch_ops() -> Optional[list[Any]]:
     # get_cuda_version might return "12.4.0" or "12.4"
     # but we want to use "12.4"
     version: str = ".".join(get_cuda_version().split(".")[:2])
-    instantiation_level: str = config.cuda.cutlass_instantiation_level
+    instantiation_level: str = config.cutlass.cutlass_instantiation_level
 
     # filename and filepath
     request_key: str = get_config_request_key(arch, version, instantiation_level)
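In `get_config_request_key` above, only the hashed module changes (`utils` replaces the old `cutlass_utils`); the request key remains a SHA-256 over the arch, CUDA version, instantiation level, and source-file hashes. A hedged sketch of that construction — the exact field order and formatting are assumptions, not read from this diff:

    import hashlib

    def config_request_key(
        arch: str,                 # e.g. "90"
        version: str,              # e.g. "12.4" (major.minor only)
        instantiation_level: str,  # e.g. "0" (illustrative)
        serialization_hash: str,
        utils_hash: str,
    ) -> str:
        hash_target = "-".join(
            [arch, version, instantiation_level, serialization_hash, utils_hash]
        )
        return hashlib.sha256(hash_target.encode()).hexdigest()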
@@ -15,7 +15,7 @@ from torch._inductor.scheduler import BaseSchedulerNode
 from torch._inductor.utils import do_bench_using_profiling, OrderedSet, Placeholder
 from torch.utils._sympy.value_ranges import ValueRanges
 
-from .cutlass_utils import DTYPE_TO_CUTLASS_TYPE
+from .utils import DTYPE_TO_CUTLASS_TYPE
 
 
 if TYPE_CHECKING:
@@ -46,7 +46,7 @@ from ..cpp_utils import CppPrinter, DTYPE_TO_CPP
 
 
 if TYPE_CHECKING:
-    from torch._inductor.codegen.cuda.cuda_template import CUDATemplate
+    from torch._inductor.codegen.cutlass.cuda_template import CUDATemplate
 
 log = logging.getLogger(__name__)
 
@@ -423,7 +423,7 @@ class CUDATemplateKernel(CUDAKernel):
         # Helper method, called into from CUTLASSGemmTemplate
         if node is None:
             return default_dtype
-        from torch._inductor.codegen.cuda.cuda_template import CUTLASSTemplate
+        from torch._inductor.codegen.cutlass.cuda_template import CUTLASSTemplate
 
         return CUTLASSTemplate._DTYPE_TO_CUTLASS[node.get_layout().dtype]
@@ -20,7 +20,7 @@ from ...utils import IndentedBuffer, unique
 from ...virtualized import V
 from ..common import KernelTemplate
 from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel
-from .cutlass_utils import DTYPE_TO_CUTLASS_TYPE
+from .utils import DTYPE_TO_CUTLASS_TYPE
 
 
 if TYPE_CHECKING:
@@ -110,7 +110,7 @@ class CUDATemplate(KernelTemplate):
         args are different.
         """
         key: Optional[str] = None
-        if config.cuda.enable_caching_codegen:
+        if config.cutlass.enable_caching_codegen:
             key = self.make_key(name=name, input_key=input_key, layout_repr=layout_repr)
 
         if key is not None and key in self.code_cache:
@@ -11,7 +11,7 @@ from typing import Any, Optional, Union
 import torch
 import torch.utils._pytree as pytree
 from torch._inductor.autotune_process import TensorMeta
-from torch._inductor.codegen.cuda.cutlass_cache import maybe_fetch_ops
+from torch._inductor.codegen.cutlass.cache import maybe_fetch_ops
 from torch._inductor.codegen.wrapper import PythonWrapperCodegen
 from torch._inductor.runtime.runtime_utils import dynamo_timed
 from torch._inductor.scheduler import BaseSchedulerNode
@@ -19,7 +19,7 @@ from torch._inductor.select_algorithm import create_inputs_key
 from torch._inductor.utils import clear_on_fresh_cache
 
 from ... import ir
-from ...config import cuda as inductor_cuda_config
+from ...config import cutlass as inductor_cutlass_config
 from ...ir import (
     Buffer,
     ChoiceCaller,
@@ -32,11 +32,13 @@ from ...ir import (
 from ...utils import is_dynamic, Placeholder
 from ...virtualized import V
 from ..common import IndentedBuffer
-from . import cutlass_utils
+from . import utils as cutlass_utils
 from .cuda_kernel import CUDATemplateKernel
 from .cuda_template import CUTLASSTemplate
-from .cutlass_python_evt import CutlassEVTCodegen, scaled_mm_evt
-from .cutlass_utils import (
+from .python_evt import CutlassEVTCodegen, scaled_mm_evt
+from .utils import (
     ACCUMULATOR_DTYPES,
     dtype_match,
     torch_dtype_to_cutlass_type,
@@ -578,7 +580,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
         for name, op in ops:
             for (
                 swizzle
-            ) in inductor_cuda_config.cutlass_max_profiling_swizzle_options:
+            ) in inductor_cutlass_config.cutlass_max_profiling_swizzle_options:
                 description = f"{name} swizzle={swizzle}"
                 self.maybe_append_choice(
                     choices,
@@ -635,7 +637,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
             #include "cutlass/util/tensor_view_io.h"
             """
         )
-        if inductor_cuda_config.generate_test_runner and not is_dynamic(
+        if inductor_cutlass_config.generate_test_runner and not is_dynamic(
             *self.input_nodes, self.output_node
         ):
             res.splice(GEMM_STANDALONE_RUNNER_ADDITIONAL_INCLUDES)
@@ -953,7 +955,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
             )
             return None
 
-        if inductor_cuda_config.cutlass_tma_only and not self._has_tma_epilogue(op):
+        if inductor_cutlass_config.cutlass_tma_only and not self._has_tma_epilogue(op):
             return None
 
         # Set epilogue.
@@ -975,14 +977,16 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
             return None
 
         # Apply regex filters at the end when configuration name doesn't change anymore
-        if inductor_cuda_config.cutlass_op_allowlist_regex:
+        if inductor_cutlass_config.cutlass_op_allowlist_regex:
             if not re.search(
-                inductor_cuda_config.cutlass_op_allowlist_regex, op.configuration_name()
+                inductor_cutlass_config.cutlass_op_allowlist_regex,
+                op.configuration_name(),
             ):
                 return None
-        if inductor_cuda_config.cutlass_op_denylist_regex is not None:
+        if inductor_cutlass_config.cutlass_op_denylist_regex is not None:
             if re.search(
-                inductor_cuda_config.cutlass_op_denylist_regex, op.configuration_name()
+                inductor_cutlass_config.cutlass_op_denylist_regex,
+                op.configuration_name(),
             ):
                 return None
 
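The two hunks above only rename the config alias and reflow the `re.search` calls; the filtering semantics are unchanged: an op survives only if it matches the allowlist regex (when one is set) and does not match the denylist regex. A standalone sketch of that logic, with made-up kernel configuration names:

    import re
    from typing import Optional

    def passes_filters(
        config_name: str,
        allowlist: Optional[str],  # e.g. "stream_k"
        denylist: Optional[str],   # e.g. "pingpong"
    ) -> bool:
        # Mirrors the allowlist/denylist checks shown in the hunks above.
        if allowlist and not re.search(allowlist, config_name):
            return False
        if denylist is not None and re.search(denylist, config_name):
            return False
        return True

    # Hypothetical configuration names, for illustration only:
    names = [
        "cutlass3x_sm90_128x256x64_stream_k_warpspecialized_cooperative_epi_nosmem",
        "cutlass3x_sm90_64x128x64_pingpong_epi_tma",
    ]
    print([n for n in names if passes_filters(n, "stream_k", "pingpong")])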
@@ -1035,7 +1039,7 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
             time.time() - start_time,
         )
         sorted_res = sorted(res.items())
-        ret_res = sorted_res[: inductor_cuda_config.cutlass_max_profiling_configs]
+        ret_res = sorted_res[: inductor_cutlass_config.cutlass_max_profiling_configs]
         if len(self.filtered_ops_cache) < 50:
             self.filtered_ops_cache[self.cache_key] = ret_res
         else:
@@ -1277,7 +1281,9 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
         }
         options.update(dict(zip(extra_names, extra_inputs)))
         res = self._template_from_string(self._get_template()).render(**options)
-        if inductor_cuda_config.generate_test_runner and not is_dynamic(X, W, Y, Bias):
+        if inductor_cutlass_config.generate_test_runner and not is_dynamic(
+            X, W, Y, Bias
+        ):
             test_runner_code = self._template_from_string(
                 GEMM_STANDALONE_RUNNER_TEMPLATE
             ).render(**options)
@@ -1483,7 +1489,7 @@ class CUTLASS3xGemmTemplate(CUTLASSGemmTemplate):
         output_dtype: torch.dtype,
         accumulator_dtype: torch.dtype,
     ) -> tuple[str, str, str, EVTArgRenames]:
-        from .cutlass_lib_extensions.evt_extensions import create_example_tensors, trace
+        from .lib_extensions.evt_extensions import create_example_tensors, trace
 
         acc_dtype = torch_dtype_to_cutlass_type(accumulator_dtype)
         output_dtype = torch_dtype_to_cutlass_type(output_dtype)
@@ -1570,7 +1576,7 @@ class CUTLASS3xGemmTemplate(CUTLASSGemmTemplate):
         assert cutlass_utils.try_import_cutlass()
         import cutlass_library.library as cutlass_lib
 
-        from .cutlass_lib_extensions import gemm_operation_extensions as gemm_extensions
+        from .lib_extensions import gemm_operation_extensions as gemm_extensions
 
         emitter = gemm_extensions.EmitGemmUniversal3xInstanceWithEVT(evt_name=evt_name)  # type: ignore[call-arg]
 
@@ -1710,6 +1716,8 @@ class CUTLASS3xGemmTemplate(CUTLASSGemmTemplate):
 
 
 class CUTLASS2xGemmTemplate(CUTLASSGemmTemplate):
     """CUTLASS 2x GEMM Template, which is used to generate CUTLASS GEMM kernels"""
 
     def __init__(
         self,
         input_nodes: list[Buffer],
@@ -10,7 +10,7 @@ from torch._inductor.ir import (
 )
 from torch.utils._ordered_set import OrderedSet
 
-from ..cutlass_utils import torch_dtype_to_cutlass_type, try_import_cutlass
+from ..utils import torch_dtype_to_cutlass_type, try_import_cutlass
 
 
 EpilogueFunctor = Any  # EpilogueFunctor local class defined in _trace
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from ..cutlass_utils import try_import_cutlass
+from ..utils import try_import_cutlass
 
 
 # copied / modified from original at
@@ -4,7 +4,7 @@ import json
 from enum import Enum
 from typing import Any, Optional
 
-from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
+from torch._inductor.codegen.cutlass.utils import try_import_cutlass
 
 
 class CUTLASSOperationSerializer:
@@ -23,7 +23,7 @@ from ...ir import Layout
 from ...runtime.runtime_utils import cache_dir
 from ...virtualized import V
 from ..cpp_utils import DTYPE_TO_CPP
-from .cuda_env import get_cuda_arch, get_cuda_version
+from ..cuda.cuda_env import get_cuda_arch, get_cuda_version
 
 
 log = logging.getLogger(__name__)
@@ -98,14 +98,14 @@ def try_import_cutlass() -> bool:
 
     # contains both cutlass and cutlass_library
     # we need cutlass for eVT
-    cutlass_python_path = path_join(config.cuda.cutlass_dir, "python")
+    cutlass_python_path = path_join(config.cutlass.cutlass_dir, "python")
     torch_root = os.path.abspath(os.path.dirname(torch.__file__))
     mock_src_path = os.path.join(
         torch_root,
         "_inductor",
         "codegen",
-        "cuda",
-        "cutlass_lib_extensions",
+        "cutlass",
+        "lib_extensions",
         "cutlass_mock_imports",
     )
@@ -252,7 +252,7 @@ def _gen_ops_cached(arch, version) -> dict[Any, Any]:
         )
         return {}
     arch = _normalize_cuda_arch(arch)
-    instantiation_level: str = config.cuda.cutlass_instantiation_level
+    instantiation_level: str = config.cutlass.cutlass_instantiation_level
     args = CUTLASSArgs(
         architectures=arch,
         cuda_version=version,
@@ -1713,28 +1713,12 @@ class aot_inductor_mode:
     compile_standalone: bool = False
 
 
-class cuda:
-    """Settings for cuda backend, today this consists of cutlass"""
-
-    # CUDA arch to use for CUDA template kernel compilation.
-    # e.g. "70", "75", "80", "90", etc.
-    # When arch is None, Inductor uses torch.cuda.get_device_capability(0).
-    arch: Optional[str] = None
-
-    # CUDA version to use for CUDA template kernel compilation.
-    # e.g. "11.4", "12.1", etc.
-    # When version is None, Inductor uses torch.version.cuda.
-    version: Optional[str] = None
+class cutlass:
+    """Settings for cutlass backend, today this consists of cutlass"""
 
     # Optimization level for the host compiler.
     compile_opt_level: Literal["-O0", "-O1", "-O2", "-O3", "-OS"] = "-O1"
 
-    # Whether to enable device LTO (link-time-optimization).
-    enable_cuda_lto = False
-
-    # Whether to keep intermediate files dring compilation.
-    enable_ptxas_info = False
-
     # Whether to enable debug info, e.g. line number, cutlass debug info.
     enable_debug_info = False
 
@@ -1746,7 +1730,10 @@ class cuda:
     cutlass_dir = os.path.realpath(
         os.environ.get(
             "TORCHINDUCTOR_CUTLASS_DIR",
-            os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/"),
+            os.path.join(
+                os.path.dirname(torch.__file__),
+                "../third_party/cutlass/",
+            ),
         )
     )
 
@@ -1766,14 +1753,6 @@ class cuda:
     # Whether to only use TMA-compatible kernels in CUTLASS
     cutlass_tma_only = False
 
-    # Path to CUDA NVCC.
-    # NVCC search order:
-    # 1) cuda_cxx set in this config
-    # 2) CUDACXX environment variable
-    # 3) CUDA_HOME environment variable
-    # 4) default system search PATH.
-    cuda_cxx: Optional[str] = None
-
     # Minimum value of M*N*K to consider the CUTLASS backend for GEMM ops.
     cutlass_backend_min_gemm_size: int = 1
 
@@ -1843,6 +1822,53 @@ class cuda:
     enable_caching_codegen: bool = True
 
 
+class cuda(cutlass):
+    """Settings for cutlass backend, today this consists of cutlass"""
+
+    # CUDA arch to use for CUDA template kernel compilation.
+    # e.g. "70", "75", "80", "90", etc.
+    # When arch is None, Inductor uses torch.cuda.get_device_capability(0).
+    arch: Optional[str] = None
+
+    # CUDA version to use for CUDA template kernel compilation.
+    # e.g. "11.4", "12.1", etc.
+    # When version is None, Inductor uses torch.version.cuda.
+    version: Optional[str] = None
+
+    # Path to CUDA NVCC.
+    # NVCC search order:
+    # 1) cuda_cxx set in this config
+    # 2) CUDACXX environment variable
+    # 3) CUDA_HOME environment variable
+    # 4) default system search PATH.
+    cuda_cxx: Optional[str] = None
+
+    # Whether to enable device LTO (link-time-optimization).
+    enable_cuda_lto = False
+
+    # Whether to keep intermediate files dring compilation.
+    enable_ptxas_info = False
+
+
+class xpu(cutlass):
+    # Xe arch to use for SYCL template kernel compilation.
+    # eg. 12, 20, which corresponding to Xe12(PVC) and Xe20 (BMG)
+    arch: Optional[str] = None
+    # oneAPI version to use for SYCL template kernel compilation.
+    # e.g. "20250201".
+    version: Optional[str] = None
+
+    cutlass_dir = os.path.realpath(
+        os.environ.get(
+            "TORCHINDUCTOR_CUTLASS_DIR",
+            os.path.join(
+                os.path.dirname(torch.__file__),
+                "../third_party/sycl-tla/",
+            ),
+        )
+    )
+
+
 class rocm:
     # Offload arch list for device code compilation, e.g. ["gfx90a", "gfx942"].
     # If empty, the `native` arch is used
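The restructuring above turns the old monolithic `cuda` options class into a `cutlass` base class, with `cuda` (and a new `xpu`) inheriting from it: shared CUTLASS knobs are declared once, while each backend keeps its own `arch`/`version` and overrides such as `cutlass_dir`. A toy sketch of why plain class-attribute inheritance gives exactly that lookup behavior (illustrative classes, not the real config module):

    class cutlass_opts:
        cutlass_dir = "/path/to/cutlass"   # shared default
        cutlass_tma_only = False

    class cuda_opts(cutlass_opts):
        arch = None                        # CUDA-specific knobs
        cuda_cxx = None

    class xpu_opts(cutlass_opts):
        arch = None
        cutlass_dir = "/path/to/sycl-tla"  # XPU overrides the shared default

    print(cuda_opts.cutlass_dir)  # inherited: /path/to/cutlass
    print(xpu_opts.cutlass_dir)   # overridden: /path/to/sycl-tla

This is also why reads like `config.cuda.enable_ptxas_info` elsewhere in the diff keep working: attributes not found on `cuda` resolve through the `cutlass` base.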
@@ -489,7 +489,7 @@ MODULE_DEFAULTS: dict[str, ConfigType] = {
     "aot_inductor.presets": DEFAULT,  # Typing
     "cuda.arch": DEFAULT,  # Out of Scope
     "cuda.version": DEFAULT,  # Out of Scope
-    "cuda.cutlass_dir": DEFAULT,  # Out of Scope
+    "cutlass.cutlass_dir": DEFAULT,  # Out of Scope
     "cuda.cuda_cxx": DEFAULT,  # Out of Scope
     "rocm.arch": DEFAULT,  # Out of Scope
     "rocm.ck_supported_arch": DEFAULT,  # Out of Scope
@@ -124,7 +124,7 @@ if TYPE_CHECKING:
     from torch.fx.experimental.symbolic_shapes import SympyBoolean
     from torch.fx.node import Argument
 
-    from .codegen.cuda.cuda_template import CUDATemplate
+    from .codegen.cutlass.cuda_template import CUDATemplate
     from .codegen.wrapper import PythonWrapperCodegen
     from .graph import GraphLowering
     from .utils import IndentedBuffer
@@ -261,7 +261,7 @@ def tuned_bmm(mat1, mat2, out_dtype=None, *, layout=None):
         and use_cutlass_template(layout, m, n, k)
         and _use_cutlass_for_op(name)
     ):
-        from ..codegen.cuda.gemm_template import CUTLASS3xGemmTemplate
+        from ..codegen.cutlass.gemm_template import CUTLASS3xGemmTemplate
 
         CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
             choices, layout, kernel_inputs.nodes()
@@ -20,7 +20,7 @@ from torch.nn.functional import ScalingType  # type: ignore[attr-defined]
 from torch.torch_version import TorchVersion
 
 from .. import config as inductor_config
-from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate
+from ..codegen.cutlass.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate
 from ..codegen.rocm.ck_tile_universal_gemm_template import CKTileGemmTemplate
 from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
 from ..codegen.subgraph import SubgraphChoiceCaller, SubgraphTemplate
@@ -2665,7 +2665,7 @@ class AlgorithmSelectorCache(PersistentCache):
         return_multi_template=False,
         best_config_future=None,
     ):
-        from .codegen.cuda.cuda_kernel import CUDATemplateCaller
+        from .codegen.cutlass.cuda_kernel import CUDATemplateCaller
 
         # Run preprocessing functions on choices
         for preprocessing_fn in self.preprocessing_fns:
@@ -3086,7 +3086,7 @@ class AlgorithmSelectorCache(PersistentCache):
                     "select_algorithm_num_precompilation_exceptions"
                 ] += 1
                 exceptions.append((futures[future], e))
-                from torch._inductor.codegen.cuda.cuda_kernel import (
+                from torch._inductor.codegen.cutlass.cuda_kernel import (
                     CUDATemplateCaller,
                 )
 
@@ -3272,8 +3272,10 @@ class AlgorithmSelectorCache(PersistentCache):
         for choice in choices:
             try:
                 timing = cls.benchmark_choice(choice, autotune_args)
-            except CUDACompileError:
-                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
+            except CUDACompileError as e:
+                from torch._inductor.codegen.cutlass.cuda_kernel import (
+                    CUDATemplateCaller,
+                )
 
                 if not isinstance(choice, CUDATemplateCaller):
                     log.exception(
@@ -3284,7 +3286,9 @@ class AlgorithmSelectorCache(PersistentCache):
                     log.warning("Not yet implemented", exc_info=True)
                 timing = float("inf")
             except RuntimeError as e:
-                from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
+                from torch._inductor.codegen.cutlass.cuda_kernel import (
+                    CUDATemplateCaller,
+                )
 
                 msg = str(e)
                 if "invalid argument" in msg:
@@ -3429,12 +3433,12 @@ class AlgorithmSelectorCache(PersistentCache):
             return prescreen_winners
 
         # prescreen cutlass
-        from .codegen.cuda.cuda_kernel import CUDATemplateCaller
+        from .codegen.cutlass.cuda_kernel import CUDATemplateCaller
 
         candidates = []
         if (
-            config.cuda.cutlass_prescreening
-            and len(config.cuda.cutlass_max_profiling_swizzle_options) > 1
+            config.cutlass.cutlass_prescreening
+            and len(config.cutlass.cutlass_max_profiling_swizzle_options) > 1
         ):
             candidates.extend(
                 [
@@ -3463,7 +3467,7 @@ class AlgorithmSelectorCache(PersistentCache):
         """
         Prune the choices after prescreening.
         """
-        from .codegen.cuda.cuda_kernel import CUDATemplateCaller
+        from .codegen.cutlass.cuda_kernel import CUDATemplateCaller
 
         prescreen_key = f"{name}:{inputs_key}"
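Besides the import-path renames, the hunks above bind the compile error (`except CUDACompileError as e:`) while keeping the existing fallback: a candidate that fails to compile or run is logged and recorded with an infinite timing so autotuning can continue with the remaining choices. A toy sketch of that pattern (not the real `AlgorithmSelectorCache` API):

    def benchmark_choices(choices, bench):
        # `bench` runs one candidate and returns its time in milliseconds.
        timings = {}
        for choice in choices:
            try:
                timings[choice] = bench(choice)
            except RuntimeError as e:
                # Failed candidates keep an infinite timing instead of
                # aborting the whole autotune pass.
                print(f"choice {choice!r} failed: {e}")
                timings[choice] = float("inf")
        return timings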
@@ -1872,9 +1872,9 @@ def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
     from .virtualized import V
 
     gemm_size = V.graph.sizevars.size_hint(m * n * k, fallback=-1)
-    if gemm_size <= 0 or gemm_size < config.cuda.cutlass_backend_min_gemm_size:
+    if gemm_size <= 0 or gemm_size < config.cutlass.cutlass_backend_min_gemm_size:
         return False
-    from .codegen.cuda.cutlass_utils import try_import_cutlass
+    from .codegen.cutlass.utils import try_import_cutlass
 
     # Do not use cutlass template on ROCm
     if torch.version.hip:
@@ -1893,9 +1893,9 @@ def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
     if not try_import_cutlass():
         log.warning(
             "Failed to import CUTLASS lib. Please check whether "
-            "_inductor.config.cuda.cutlass_dir %s is set correctly. "
+            "_inductor.config.cutlass.cutlass_dir %s is set correctly. "
             "Skipping CUTLASS backend for now.",
-            config.cuda.cutlass_dir,
+            config.cutlass.cutlass_dir,
         )
         return False
     return res
@@ -1903,7 +1903,7 @@ def use_cutlass_template(layout: Layout, m: int, n: int, k: int) -> bool:
 
 def _use_cutlass_for_op(op_name: str) -> bool:
     """Check if CUTLASS should be used for the given operation."""
-    enabled_ops = config.cuda.cutlass_enabled_ops.upper()
+    enabled_ops = config.cutlass.cutlass_enabled_ops.upper()
     if enabled_ops == "ALL":
         return True
     return op_name.upper() in [x.strip() for x in enabled_ops.split(",")]
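`_use_cutlass_for_op` above is small enough to restate in full; the rename only changes which config namespace supplies `cutlass_enabled_ops`. A runnable sketch with the option value passed in explicitly instead of read from config:

    def use_cutlass_for_op(op_name: str, enabled_ops: str) -> bool:
        # Same logic as _use_cutlass_for_op, minus the config read.
        enabled = enabled_ops.upper()
        if enabled == "ALL":
            return True
        return op_name.upper() in [x.strip() for x in enabled.split(",")]

    assert use_cutlass_for_op("mm", "ALL")
    assert use_cutlass_for_op("bmm", "mm, bmm")
    assert not use_cutlass_for_op("addmm", "mm")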