Mirror of https://github.com/huggingface/kernels.git (synced 2025-10-20 12:33:46 +08:00)
Test examples in docstrings using mktestdocs (#118)
Also adjust examples so that they are correct.
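For context, a minimal sketch (not part of this commit's diff) of what the new doctest harness automates: mktestdocs pulls the fenced Python example out of an object's docstring and executes it, so a stale or broken example fails the test suite.

```python
# Illustrative only; tests/test_doctest.py (added below) does this for every
# public function and class exported by `kernels`.
import kernels
from mktestdocs import check_docstring

# Extracts the fenced python example from get_kernel's docstring and runs it.
# The updated examples download from the Hub and run on CUDA, which is why the
# real tests carry the new cuda_only marker.
check_docstring(obj=kernels.get_kernel)
```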
flake.lock (generated): 19 changed lines
@@ -58,32 +58,33 @@
         "nixpkgs": "nixpkgs"
       },
       "locked": {
-        "lastModified": 1750775451,
-        "narHash": "sha256-HiGqtwzIgUH7Xkh+wgpvHRZGooqrW0z663E6nauczA4=",
+        "lastModified": 1753693795,
+        "narHash": "sha256-bKT2Alrbo9wPMINaKVkgnnp/5dp6TpQe80OgpHOz2jI=",
         "owner": "huggingface",
         "repo": "hf-nix",
-        "rev": "5943c3169e861618a6634bc8dbdb498e413ab9b7",
+        "rev": "168df9452b3e6c961fdce0115899a1f8b0947a73",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
+        "ref": "mktestdocs-0.2.5",
         "repo": "hf-nix",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1747820358,
-        "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
-        "owner": "danieldk",
+        "lastModified": 1752785354,
+        "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
+        "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "d3c1681180717528068082103bf323147de6ab0b",
+        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       },
       "original": {
-        "owner": "danieldk",
-        "ref": "cudatoolkit-12.9-kernel-builder",
+        "owner": "nixos",
         "repo": "nixpkgs",
+        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       }
     },

@@ -1,6 +1,6 @@
 {
   inputs = {
-    hf-nix.url = "github:huggingface/hf-nix";
+    hf-nix.url = "github:huggingface/hf-nix/mktestdocs-0.2.5";
     nixpkgs.follows = "hf-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
   };

@@ -40,6 +40,7 @@
           ++ (with python3.pkgs; [
             docutils
             huggingface-hub
+            mktestdocs
             pytest
             pytest-benchmark
             pyyaml

@@ -24,11 +24,12 @@ build-backend = "setuptools.build_meta"

 [dependency-groups]
 dev = [
-  "mypy >= 1.15.0",
-  "pytest >=8",
+  "mktestdocs>=0.2.5",
+  "mypy>=1.15.0",
+  "pytest>=8",
   # Whatever version is compatible with pytest.
   "pytest-benchmark",
-  "torch >=2.5",
+  "torch>=2.5",
   "types-pyyaml"
 ]

@@ -1,4 +1,4 @@
 [pytest]
 markers =
+    cuda_only: marks tests that should only run on hosts with CUDA GPUs
     darwin_only: marks tests that should only run on macOS
     linux_only: marks tests that should only run on Linux

@@ -2,6 +2,7 @@ from kernels.layer import (
     CUDAProperties,
     Device,
     LayerRepository,
+    LockedLayerRepository,
     Mode,
     kernelize,
     register_kernel_mapping,

@@ -22,6 +23,7 @@ __all__ = [
     "CUDAProperties",
     "Device",
     "LayerRepository",
+    "LockedLayerRepository",
     "Mode",
     "get_kernel",
     "get_local_kernel",

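The practical effect of the re-export above, shown as a short illustrative import (not taken from the diff):

```python
# LockedLayerRepository is now importable from the package root, alongside the
# other names that the updated docstring examples rely on.
from kernels import LayerRepository, LockedLayerRepository, Mode, kernelize
```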
@@ -208,15 +208,15 @@ class LayerRepository:

         # Reference a specific layer by revision
         layer_repo = LayerRepository(
-            repo_id="username/my-kernel",
-            layer_name="MyLayer",
+            repo_id="kernels-community/activation",
+            layer_name="SiluAndMul",
         )

         # Reference a layer by version constraint
         layer_repo_versioned = LayerRepository(
-            repo_id="username/my-kernel",
-            layer_name="MyLayer",
-            version=">=1.0.0,<2.0.0"
+            repo_id="kernels-community/activation",
+            layer_name="SiluAndMul",
+            version=">=0.0.3,<0.1"
         )
         ```
     """

@@ -419,22 +419,36 @@ def use_kernel_mapping(

     Example:
         ```python
+        import torch
+        import torch.nn as nn
+        from torch.nn import functional as F
+
+        from kernels import use_kernel_forward_from_hub
         from kernels import use_kernel_mapping, LayerRepository, Device
+        from kernels import kernelize

         # Define a mapping
         mapping = {
-            "LayerNorm": {
+            "SiluAndMul": {
                 "cuda": LayerRepository(
-                    repo_id="username/experimental-kernels",
-                    layer_name="FastLayerNorm"
+                    repo_id="kernels-community/activation",
+                    layer_name="SiluAndMul",
                 )
             }
         }

+        @use_kernel_forward_from_hub("SiluAndMul")
+        class SiluAndMul(nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d = x.shape[-1] // 2
+                return F.silu(x[..., :d]) * x[..., d:]
+
+        model = SiluAndMul()
+
         # Use the mapping for the duration of the context.
         with use_kernel_mapping(mapping):
             # kernelize uses the temporary mapping
-            model = kernelize(model)
+            model = kernelize(model, device="cuda")

         # Outside the context, original mappings are restored
         ```

@@ -463,6 +477,7 @@ def register_kernel_mapping(
             Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]],
         ],
     ],
+    inherit_mapping: bool = True,
 ):
     """
     Register a global mapping between layer names and their corresponding kernel implementations.

@@ -474,6 +489,9 @@ def register_kernel_mapping(
         mapping (`Dict[str, Dict[Union[Device, str], Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]]]]`):
             The kernel mapping to register globally. Maps layer names to device-specific kernels.
             The mapping can specify different kernels for different modes (training, inference, etc.).
+        inherit_mapping (`bool`, *optional*, defaults to `True`):
+            When `True`, the current mapping will be extended by `mapping`. When `False`, the existing mappings
+            are erased before adding `mapping`.

     Example:
         ```python

@@ -509,6 +527,9 @@ def register_kernel_mapping(
         register_kernel_mapping(advanced_mapping)
         ```
     """
+    if not inherit_mapping:
+        _KERNEL_MAPPING.set({})
+
     # Merge with existing mappings.
     for new_kernel, new_device_repos in mapping.items():
         device_repo = _KERNEL_MAPPING.get().setdefault(new_kernel, {})

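A minimal sketch of the new `inherit_mapping` flag in use (repository values copied from the examples above; the call itself is illustrative, not from the diff):

```python
from kernels import LayerRepository, register_kernel_mapping

mapping = {
    "SiluAndMul": {
        "cuda": LayerRepository(
            repo_id="kernels-community/activation",
            layer_name="SiluAndMul",
        )
    }
}

# inherit_mapping=False erases previously registered kernels before adding this
# mapping; the default (True) extends the existing global mapping instead. The
# same reset pattern appears in tests/test_layer.py further down.
register_kernel_mapping(mapping, inherit_mapping=False)
```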
@@ -626,19 +647,23 @@ def kernelize(
     Example:
         ```python
-        from kernels import kernelize, Mode, register_kernel_mapping, LayerRepository
         import torch
         import torch.nn as nn

-        @use_kernel_forward_from_hub("LayerNorm")
-        class LayerNorm(nn.Module):
-            ...
+        from kernels import kernelize, Mode, register_kernel_mapping, LayerRepository
+        from kernels import use_kernel_forward_from_hub
+
+        @use_kernel_forward_from_hub("SiluAndMul")
+        class SiluAndMul(nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                d = x.shape[-1] // 2
+                return F.silu(x[..., :d]) * x[..., d:]

         # First register some kernel mappings
         mapping = {
             "LayerNorm": {
                 "cuda": LayerRepository(
-                    repo_id="username/fast-kernels",
-                    layer_name="FastLayerNorm"
+                    repo_id="kernels-community/activation",
+                    layer_name="SiluAndMul",
                 )
             }
         }

@@ -646,9 +671,8 @@ def kernelize(

         # Create and kernelize a model
         model = nn.Sequential(
-            nn.Linear(768, 768),
-            LayerNorm(768),
-            nn.Linear(768, 768)
+            nn.Linear(1024, 2048, device="cuda"),
+            SiluAndMul(),
         )

         # Kernelize for inference

@@ -776,22 +800,25 @@ def use_kernel_forward_from_hub(layer_name: str):

     Example:
         ```python
-        from kernels import use_kernel_forward_from_hub
         import torch
         import torch.nn as nn

+        from kernels import use_kernel_forward_from_hub, kernelize
+
         @use_kernel_forward_from_hub("MyCustomLayer")
         class MyCustomLayer(nn.Module):
             def __init__(self, hidden_size):
                 super().__init__()
                 self.hidden_size = hidden_size

-            def forward(self, x):
+            def forward(self, x: torch.Tensor):
                 # original implementation
                 return x

-        # The layer can now be kernelized
         model = MyCustomLayer(768)
-        kernelized_model = kernelize(model)
+
+        # The layer can now be kernelized:
+        # model = kernelize(model, device="cuda")
         ```
     """

@@ -221,9 +221,13 @@ def get_kernel(

     Example:
         ```python
+        import torch
         from kernels import get_kernel
-        kernel = get_kernel("username/my-kernel")
-        result = kernel.kernel_function(input_data)
+
+        activation = get_kernel("kernels-community/activation")
+        x = torch.randn(10, 20, device="cuda")
+        out = torch.empty_like(x)
+        result = activation.silu_and_mul(out, x)
         ```
     """
     revision = select_revision_or_version(repo_id, revision, version)

@@ -1,10 +1,13 @@
 import sys

 import pytest
+import torch
+
+has_cuda = torch.cuda.is_available() and torch.cuda.device_count() > 0


 def pytest_runtest_setup(item):
     if "linux_only" in item.keywords and not sys.platform.startswith("linux"):
         pytest.skip("skipping Linux-only test on non-Linux platform")
+    if "cuda_only" in item.keywords and not has_cuda:
+        pytest.skip("skipping CUDA-only test on host without CUDA")
     if "darwin_only" in item.keywords and not sys.platform.startswith("darwin"):
         pytest.skip("skipping macOS-only test on non-macOS platform")

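For reference, a hypothetical test using the new marker (not part of this commit); the pytest_runtest_setup hook above skips it on hosts without a CUDA device:

```python
import pytest
import torch


@pytest.mark.cuda_only  # skipped by conftest.pytest_runtest_setup when has_cuda is False
def test_needs_gpu():
    x = torch.ones(8, device="cuda")
    assert x.sum().item() == 8.0
```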
@@ -34,7 +34,7 @@ def device():
     return "cuda"


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_gelu_fast(kernel, device):
     x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
     y = torch.empty_like(x)

@@ -50,7 +50,7 @@ def test_gelu_fast(kernel, device):
     assert torch.allclose(y, expected)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_local_kernel(local_kernel, device):
     x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
     y = torch.empty_like(x)

@@ -74,7 +74,7 @@ def test_relu_metal(metal_kernel, dtype):
     assert torch.allclose(y, torch.relu(x))


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 @pytest.mark.parametrize(
     "kernel_exists",
     [

@@ -110,7 +110,7 @@ def test_version():
     )


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_universal_kernel(universal_kernel):
     torch.manual_seed(0)
     A = torch.randint(-10, 10, (64, 128), dtype=torch.int8, device="cuda")

@@ -16,21 +16,21 @@ def device():
     return "cuda"


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_gelu_small(kernel, device, benchmark):
     x = torch.randn(32, 32, dtype=torch.float16, device=device)
     y = torch.empty_like(x)
     benchmark(kernel.gelu_fast, y, x)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_gelu_medium(kernel, device, benchmark):
     x = torch.randn(128, 128, dtype=torch.float16, device=device)
     y = torch.empty_like(x)
     benchmark(kernel.gelu_fast, y, x)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_gelu_large(kernel, device, benchmark):
     x = torch.randn(512, 512, dtype=torch.float16, device=device)
     y = torch.empty_like(x)

tests/test_doctest.py (new file): 49 lines
@@ -0,0 +1,49 @@
+import inspect
+
+import pytest
+from mktestdocs import check_docstring, get_codeblock_members
+
+import kernels
+
+
+def all_public_functions():
+    function_list = inspect.getmembers(kernels, inspect.isfunction)
+    return [func for _, func in function_list]
+
+
+def all_public_classes():
+    class_list = inspect.getmembers(kernels, inspect.isclass)
+    return [cls for _, cls in class_list]
+
+
+def all_public_class_members():
+    members = get_codeblock_members(*all_public_classes())
+    return members
+
+
+@pytest.mark.cuda_only
+@pytest.mark.parametrize(
+    "func",
+    all_public_functions(),
+    ids=lambda d: d.__name__,
+)
+def test_func_docstring(func):
+    check_docstring(obj=func)
+
+
+@pytest.mark.cuda_only
+@pytest.mark.parametrize(
+    "cls",
+    all_public_classes(),
+    ids=lambda d: d.__name__,
+)
+def test_class_docstring(cls):
+    check_docstring(obj=cls)
+
+
+@pytest.mark.cuda_only
+@pytest.mark.parametrize(
+    "member", all_public_class_members(), ids=lambda d: d.__qualname__
+)
+def test_member_docstring(member):
+    check_docstring(member)

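One way to run just this docstring suite locally (assuming a CUDA host and the dev dependency group installed); the programmatic pytest.main call is illustrative:

```python
# Equivalent to `pytest tests/test_doctest.py -v`; the cuda_only marker means
# these tests are skipped on machines without a CUDA device.
import pytest

raise SystemExit(pytest.main(["-v", "tests/test_doctest.py"]))
```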
@@ -27,7 +27,7 @@ def test_download_all_hash_validation():
     download_kernels(DownloadArgs(all_variants=True, project_dir=project_dir))


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_load_locked():
     project_dir = Path(__file__).parent / "kernel_locking"
     # Also validates that hashing works correctly.

@@ -13,12 +13,12 @@ from kernels import (
     kernelize,
     register_kernel_mapping,
     use_kernel_forward_from_hub,
+    use_kernel_mapping,
 )
 from kernels.layer import (
     _KERNEL_MAPPING,
     CUDAProperties,
     _validate_layer,
-    use_kernel_mapping,
 )

 kernel_layer_mapping = {

@@ -102,7 +102,7 @@ def test_arg_kinds():
     assert arg_kind("foo", "bar", kwarg1="baz", kwarg2=5) == ("foo", "bar", "baz", 5)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 @pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulStringDevice])
 @pytest.mark.parametrize("device", ["cuda", "cpu"])
 def test_hub_forward(cls, device):

@@ -124,7 +124,7 @@ def test_hub_forward(cls, device):
     assert silu_and_mul_with_kernel.n_calls == 1


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_capability():
     linear = TorchLinearWithCounter(32, 32).to("cuda")
     with use_kernel_mapping(

@@ -183,7 +183,7 @@ def test_layer_fallback_works():
     kernelize(silu_and_mul, device="cuda", mode=Mode.INFERENCE)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 @pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulNoCompileKernel])
 @pytest.mark.parametrize("device", ["cuda"])
 def test_torch_compile_layer_without_fallback(cls, device):

@@ -214,7 +214,7 @@ def test_torch_compile_layer_without_fallback(cls, device):
     torch.testing.assert_close(Y_compiled, Y)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 @pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulNoCompileKernel])
 @pytest.mark.parametrize("device", ["cuda"])
 def test_torch_compile_layer_with_fallback(cls, device):

@@ -237,8 +237,11 @@ def test_torch_compile_layer_with_fallback(cls, device):
     torch.testing.assert_close(Y_compiled, Y)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_mapping_contexts():
+    # Make sure we start from scratch.
+    register_kernel_mapping(kernel_layer_mapping, inherit_mapping=False)
+
     assert set(_KERNEL_MAPPING.get().keys()) == {
         "SiluAndMul",
         "SiluAndMulStringDevice",

@@ -351,7 +354,7 @@ def test_validate_kernel_layer():
         _validate_layer(cls=BadLayer4, check_cls=SiluAndMul)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_invalid_mode_for_mapping_rejected():
     linear = TorchLinearWithCounter(32, 32).to("cuda")

@@ -371,7 +374,7 @@ def test_invalid_mode_for_mapping_rejected():
         kernelize(linear, mode=Mode.TRAINING)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_kernel_modes():
     linear = TorchLinearWithCounter(32, 32).to("cuda")

@@ -515,7 +518,7 @@ def test_kernel_modes():
     assert linear.n_calls == 2


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_fallback_used_when_training():
     linear = TorchLinearWithCounter(32, 32).to("cuda")

@@ -580,7 +583,7 @@ def test_invalid_mode_rejected():
         kernelize(torch.nn.Linear(32, 32), mode=Mode.TORCH_COMPILE)


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_kernel_modes_inference():
     """Test inference-specific fallback scenarios."""
     linear = TorchLinearWithCounter(32, 32).to("cuda")

@@ -677,7 +680,7 @@ def test_kernel_modes_inference():
     assert linear.n_calls == 4


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_kernel_modes_mixed():
     """Test mixed training and inference kernel scenarios."""
     linear = TorchLinearWithCounter(32, 32).to("cuda")

@@ -767,7 +770,7 @@ def test_kernel_modes_mixed():
     assert linear.n_calls == 2


-@pytest.mark.linux_only
+@pytest.mark.cuda_only
 def test_kernel_modes_cross_fallback():
     """Test cross-mode fallback scenarios from inference to training modes."""
     linear = TorchLinearWithCounter(32, 32).to("cuda")