## Summary

Still figuring out what actually writing a template should look like, but this lands a lot of the base infra.

<img width="1267" height="262" alt="Screenshot 2025-08-16 at 10 22 12 PM" src="https://github.com/user-attachments/assets/229f8bfa-0cb4-4fb1-8530-f535e569d350" />

Test code:

```Python
#!/usr/bin/env python3
"""
Fixed CuteDSL template test with proper def_kernel usage.
"""

import torch
import torch._inductor.config as config
from torch._inductor.lowering import lowerings
from torch._inductor.ir import TensorBox
from torch._inductor.select_algorithm import autotune_select_algorithm
from torch._inductor.codegen.cutedsl import CuteDSLTemplate


def create_fixed_cutedsl_template():
    """Create a properly structured CuteDSL template."""

    def cutedsl_grid(M, N, meta):
        return (1,)

    # Part 1: Imports and kernel definition
    template_part1 = r"""
import torch
import cutlass
import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack

@cute.kernel
def {{kernel_name}}_kernel(gA: cute.Tensor, gB: cute.Tensor, gC: cute.Tensor):
    # Get thread and block indices
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()
    bdim, _, _ = cute.arch.block_dim()

    thread_idx = bidx * bdim + tidx
    m, n = gA.shape

    if thread_idx < m * n:
        mi = thread_idx // n
        ni = thread_idx % n
        if mi < m and ni < n:
            a_val = gA[mi, ni]
            b_val = gB[mi, ni]
            gC[mi, ni] = a_val + b_val
"""

    # Part 2: JIT wrapper function
    template_part2 = r"""
@cute.jit
def {{kernel_name}}_jit(mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor):
    m, n = mA.shape
    total_threads = m * n
    threads_per_block = 256
    num_blocks = (total_threads + threads_per_block - 1) // threads_per_block

    kernel = {{kernel_name}}_kernel(mA, mB, mC)
    kernel.launch(
        grid=[num_blocks, 1, 1],
        block=[threads_per_block, 1, 1],
    )
"""

    # Part 3: Main kernel function
    template_part3 = r"""
{{def_kernel("input_a", "input_b", "output_c")}}
    cute_a = from_dlpack(input_a, assumed_align=16)
    cute_b = from_dlpack(input_b, assumed_align=16)
    cute_c = from_dlpack(output_c, assumed_align=16)

    # Launch kernel
    {{kernel_name}}_jit(cute_a, cute_b, cute_c)

    return output_c
"""

    # Combine all parts
    template = CuteDSLTemplate(
        name="fixed_add",
        grid=cutedsl_grid,
        source=template_part1 + template_part2 + template_part3,
    )
    return template


def fixed_cutedsl_lowering(a: TensorBox, b: TensorBox) -> TensorBox:
    """Fixed CuteDSL lowering."""
    print(f"[FIXED] CuteDSL lowering: {a.get_size()} + {b.get_size()}")

    template = create_fixed_cutedsl_template()
    choices = []
    error = template.maybe_append_choice(
        choices,
        input_nodes=[a.data, b.data],
        layout=a.get_layout(),
    )

    if error or not choices:
        print(f"[FIXED] Falling back: {error}")
        default_lowering = lowerings[torch.ops.aten.add.Tensor]
        return default_lowering(a, b)

    print(f"[FIXED] Using CuteDSL with {len(choices)} choices")
    result = autotune_select_algorithm(
        "fixed_cutedsl_add",
        choices,
        [a, b],
        a.get_layout(),
    )
    return result


def test_fixed_cutedsl():
    """Test the fixed CuteDSL template."""
    print("=" * 50)
    print("Fixed CuteDSL Template Test")
    print("=" * 50)

    original = lowerings.get(torch.ops.aten.add.Tensor, None)
    try:
        lowerings[torch.ops.aten.add.Tensor] = fixed_cutedsl_lowering

        def test_add(x, y):
            return x + y

        device = "cuda" if torch.cuda.is_available() else "cpu"
        x = torch.randn(128, 4, device=device, dtype=torch.float32)
        y = torch.randn(128, 4, device=device, dtype=torch.float32)
        print(f"[FIXED] Testing with {x.shape} tensors on {device}")

        compiled_fn = torch.compile(test_add, backend="inductor")
        result = compiled_fn(x, y)

        # Verify correctness
        expected = x + y
        if torch.allclose(result, expected, atol=1e-5):
            print("✅ [FIXED] Results match!")
            return True
        else:
            print("❌ [FIXED] Results don't match!")
            return False
    except Exception as e:
        print(f"❌ [FIXED] Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        # Restore the original lowering so the patch does not leak past this test
        if original:
            lowerings[torch.ops.aten.add.Tensor] = original
        else:
            lowerings.pop(torch.ops.aten.add.Tensor, None)


if __name__ == "__main__":
    success = test_fixed_cutedsl()
    print("🎉 Fixed test completed!" if success else "💥 Fixed test failed!")
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160108
Approved by: https://github.com/mlazos
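
For context on how the `{{kernel_name}}` placeholders in the template source turn into concrete code: they are filled in by Jinja-style substitution when the choice is rendered. The snippet below is a minimal standalone sketch of that substitution step only, using `jinja2` directly with a made-up kernel name; it is not Inductor's actual rendering path, and `{{def_kernel(...)}}` in particular is expanded by `CuteDSLTemplate` itself rather than by plain string substitution.

```Python
# Minimal sketch of placeholder substitution, independent of Inductor.
# "cutedsl_add_0" is an illustrative name, not the real generated kernel name.
from jinja2 import Template

source = r"""
@cute.kernel
def {{kernel_name}}_kernel(gA: cute.Tensor, gB: cute.Tensor, gC: cute.Tensor):
    ...
"""

rendered = Template(source).render(kernel_name="cutedsl_add_0")
print(rendered)  # prints a kernel definition named cutedsl_add_0_kernel
```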