Support triton.language.dtype with torch.compile (#121690)

I'm putting this PR up as an RFC since I have resorted to some horrible hacks in order to make this work. The core of the problem is how Triton dtypes print:
```
(Pdb) p triton.language.float32
triton.language.fp32
(Pdb) p str(triton.language.float32)
'fp32'
(Pdb) p repr(triton.language.float32)
'triton.language.fp32'
```
Since `triton.language.fp32` does not actually exist as an attribute (the real attribute is `triton.language.float32`), neither `str()` nor `repr()` yields an evaluatable expression. This means that we need to "rewrite" these dtypes when they are embedded in FX graph code and in Inductor-generated code.
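
To make the failure mode concrete, here is a minimal sketch (assuming Triton is installed) of why a codegen path that falls back to `repr()`, such as the `val_to_arg_str` fallback touched below, cannot reproduce a Triton dtype:

```python
import triton.language as tl

print(str(tl.float32))   # fp32
print(repr(tl.float32))  # triton.language.fp32

# Generated code that embeds this constant via repr() would reference
# `triton.language.fp32`, which is not a real attribute (the attribute is
# `triton.language.float32`), so executing that code raises an AttributeError.
print(hasattr(tl, "fp32"))     # False
print(hasattr(tl, "float32"))  # True
```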

This PR allows Mamba2 to work with `torch.compile`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121690
Approved by: https://github.com/Skylion007
Author: Oguz Ulgen
Date: 2024-03-12 13:02:22 -07:00
Committed by: PyTorch MergeBot
Parent: 22bb24986d
Commit: 79ee6bbde3
7 changed files with 105 additions and 5 deletions

```diff
@@ -1086,6 +1086,51 @@ def forward(self, x_1, output_1):
         with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, msg):
             f(x, x)
 
+    @requires_cuda
+    @skipIfRocm
+    @common_utils.parametrize("dynamic", [False, True])
+    @common_utils.parametrize("backend", ["eager", "aot_eager", "inductor"])
+    def test_triton_kernel_triton_dtype(self, dynamic, backend):
+        @triton.jit
+        def add_kernel_with_dtype(
+            in_ptr0,
+            in_ptr1,
+            out_ptr,
+            dtype: "tl.constexpr",
+            n_elements,
+            BLOCK_SIZE: "tl.constexpr",
+        ):
+            pid = tl.program_id(axis=0)
+            block_start = pid * BLOCK_SIZE
+            offsets = block_start + tl.arange(0, BLOCK_SIZE)
+            mask = offsets < n_elements
+            x = tl.load(in_ptr0 + offsets, mask=mask).to(dtype)
+            y = tl.load(in_ptr1 + offsets, mask=mask).to(dtype)
+            output = x + y
+            tl.store(out_ptr + offsets, output, mask=mask)
+
+        def f(x, y, dtype_torch, dtype_triton):
+            output = torch.zeros_like(x).to(dtype=dtype_torch)
+            n_elements = output.numel()
+            grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+            add_kernel_with_dtype[grid](
+                x, y, output, dtype_triton, n_elements, BLOCK_SIZE=4
+            )
+            return output
+
+        x = torch.randn(4, device="cuda")
+        y = torch.randn(4, device="cuda")
+        args_list = (
+            [x, y, torch.float32, tl.float32],
+            [x, y, torch.bfloat16, tl.bfloat16],
+        )
+        for args in args_list:
+            eager_out = f(*args)
+            compiled_out = torch.compile(
+                f, fullgraph=True, backend=backend, dynamic=dynamic
+            )(*args)
+            self.assertEqual(compiled_out, eager_out)
+
 
 def make_mutation_test(fn):
     @requires_cuda
```

```diff
@@ -1222,6 +1222,13 @@ class CheckFunctionManager:
         # guard_fn.__globals__ becomes equal to builder.scope. This causes
         # guard_fn to hold a referece to f_locals sitting in builder.scope["L"]
         globals_for_guard_fn = {"G": builder.scope["G"]}
+
+        from torch.utils._triton import has_triton_package
+
+        if has_triton_package():
+            import triton
+
+            globals_for_guard_fn[triton.__name__] = triton
         try:
             exec(pycode, globals_for_guard_fn, out)
         except SyntaxError as ex:
```

```diff
@@ -98,7 +98,7 @@ from torch._utils_internal import log_compilation_event
 from torch.nn.modules.lazy import LazyModuleMixin
 from torch.utils._pytree import tree_map_only
+from torch.utils._triton import has_triton, has_triton_package
 
 counters: DefaultDict[str, Counter[str]] = collections.defaultdict(collections.Counter)
 optimus_scuba_log: Dict[str, Any] = {}
@@ -977,6 +977,10 @@ common_constant_types = {
     torch.memory_format,
     torch.layout,
 }
 
+if has_triton_package():
+    import triton
+
+    common_constant_types.add(triton.language.dtype)
 
 
 def is_safe_constant(v):
@@ -2193,8 +2197,6 @@ def is_compile_supported(device_type):
     if device_type == "cpu":
         pass
     elif device_type == "cuda" and compile_supported:
-        from torch.utils._triton import has_triton
-
         compile_supported = has_triton()
     else:
         compile_supported = False
```

```diff
@@ -1013,6 +1013,10 @@ class WrapperCodeGen(CodeGen):
         self.header.splice(f"\n\n{metadata_comment}{name} = {kernel}")
 
     def define_user_defined_triton_kernel(self, kernel, configs, kwargs):
+        from torch.utils._triton import patch_triton_dtype_repr
+
+        patch_triton_dtype_repr()
+
         original_name = kernel.__name__
 
         from .common import KernelArgType, SizeArg, TensorArg
@@ -1299,6 +1303,11 @@
         raise NotImplementedError()
 
     def val_to_arg_str(self, s):
+        from torch.utils._triton import dtype_to_string, has_triton
+
+        if has_triton():
+            import triton
+
         if isinstance(s, SymTypes):
             return pexpr(sympy.expand(repr(s)))
         elif isinstance(s, sympy.Expr):
@@ -1317,6 +1326,8 @@
             return _get_qualified_name(s)
         elif isinstance(s, (ir.Buffer, ReinterpretView)):
             return s.codegen_reference()
+        elif has_triton() and isinstance(s, triton.language.dtype):  # type: ignore[possibly-undefined]
+            return dtype_to_string(s)
         else:
             return repr(s)
```

```diff
@@ -380,11 +380,19 @@ class CodeGen:
     def _gen_python_code(
         self, nodes, root_module: str, namespace: _Namespace, *, verbose: bool = False,
     ) -> PythonCode:
+        from torch.utils._triton import has_triton
+
         free_vars: List[str] = []
         body: List[str] = []
         globals_: Dict[str, Any] = {}
         wrapped_fns: Dict[str, None] = {}
+        if has_triton():
+            import triton
+
+            globals_[triton.__name__] = triton
+            from torch.utils._triton import patch_triton_dtype_repr
+            patch_triton_dtype_repr()
 
         # Wrap string in list to pass by reference
         maybe_return_annotation : List[str] = ['']
```

```diff
@@ -245,6 +245,11 @@ class TracerBase:
         Can be override to support more trace-specific types.
         """
+        from torch.utils._triton import has_triton
+
+        if has_triton():
+            import triton
+
         if not isinstance(a, Proxy) and hasattr(a, '__fx_create_arg__'):
             return a.__fx_create_arg__(self)
         # aggregates
@@ -280,6 +285,8 @@
         elif isinstance(a, torch._ops.OpOverload):
             return a
+        elif has_triton() and isinstance(a, triton.language.dtype):
+            return a
 
         if isinstance(a, Proxy):
             # base case: we unwrap the Proxy object
```

```diff
@@ -1,8 +1,6 @@
 import functools
 import hashlib
 
-from torch._dynamo.device_interface import get_interface_for_device
-
 
 @functools.lru_cache(None)
 def has_triton_package() -> bool:
@@ -16,6 +14,8 @@ def has_triton_package() -> bool:
 @functools.lru_cache(None)
 def has_triton() -> bool:
+    from torch._dynamo.device_interface import get_interface_for_device
+
     def cuda_extra_check(device_interface):
         return device_interface.Worker.get_device_properties().major >= 7
@@ -59,3 +59,23 @@ def triton_hash_with_backend():
     backend = triton_backend()
     key = f"{triton_key()}-{backend.hash()}"
     return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+
+def dtype_to_string(dtype):
+    if dtype.name.startswith("fp"):
+        suffix = "float" + dtype.name[2:]
+    elif dtype.name.startswith("bf"):
+        suffix = "bfloat" + dtype.name[2:]
+    else:
+        suffix = dtype.name
+    return "triton.language." + suffix
+
+
+def patch_triton_dtype_repr():
+    import triton
+
+    # Hack to get triton dtype repr to produce an evaluatable expression
+    # triton.language.float32 emits triton.language.fp32 which does not
+    # exist
+    # REMOVE when https://github.com/openai/triton/pull/3342 lands
+    triton.language.dtype.__repr__ = lambda self: dtype_to_string(self)
```
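
As a quick sanity check of the new helpers, the following sketch (assuming Triton is installed) shows the strings they produce and the effect of the repr patch; the expected outputs follow directly from the code above:

```python
import triton  # imported so the patched repr evaluates below
import triton.language as tl

from torch.utils._triton import dtype_to_string, patch_triton_dtype_repr

print(dtype_to_string(tl.float32))   # triton.language.float32
print(dtype_to_string(tl.bfloat16))  # triton.language.bfloat16
print(dtype_to_string(tl.int32))     # triton.language.int32

patch_triton_dtype_repr()
# repr() is now an expression that evaluates back to the same dtype.
print(eval(repr(tl.float32)) == tl.float32)  # True
```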