Test examples in docstrings using mktestdocs (#118)

Also adjust examples so that they are correct.
Daniël de Kok
2025-07-28 17:31:34 +02:00
committed by GitHub
parent 8069e3bf0c
commit f7490bd0a9
13 changed files with 152 additions and 61 deletions
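The pattern this commit adopts: mktestdocs extracts every fenced ```python block from an object's docstring and executes it, so a broken example fails the test suite. A minimal sketch of the check used by the new tests/test_doctest.py below, run here against one public function chosen purely for illustration:

```python
# Sketch of the mktestdocs check adopted in this commit: check_docstring()
# pulls the fenced python blocks out of the object's docstring and executes
# them; any exception makes the test fail.
from mktestdocs import check_docstring

import kernels

# get_kernel is used only as an illustrative target here.
check_docstring(obj=kernels.get_kernel)
```

As in the new test module, the updated get_kernel example needs a CUDA GPU to run, which is why the doctest tests below carry the cuda_only marker.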

flake.lock (generated, 19 changed lines)
View File

@ -58,32 +58,33 @@
"nixpkgs": "nixpkgs"
},
"locked": {
"lastModified": 1750775451,
"narHash": "sha256-HiGqtwzIgUH7Xkh+wgpvHRZGooqrW0z663E6nauczA4=",
"lastModified": 1753693795,
"narHash": "sha256-bKT2Alrbo9wPMINaKVkgnnp/5dp6TpQe80OgpHOz2jI=",
"owner": "huggingface",
"repo": "hf-nix",
"rev": "5943c3169e861618a6634bc8dbdb498e413ab9b7",
"rev": "168df9452b3e6c961fdce0115899a1f8b0947a73",
"type": "github"
},
"original": {
"owner": "huggingface",
"ref": "mktestdocs-0.2.5",
"repo": "hf-nix",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1747820358,
"narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
"owner": "danieldk",
"lastModified": 1752785354,
"narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "d3c1681180717528068082103bf323147de6ab0b",
"rev": "d38025438a6ee456758dc03188ca6873a415463b",
"type": "github"
},
"original": {
"owner": "danieldk",
"ref": "cudatoolkit-12.9-kernel-builder",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "d38025438a6ee456758dc03188ca6873a415463b",
"type": "github"
}
},

View File

@ -1,6 +1,6 @@
{
inputs = {
hf-nix.url = "github:huggingface/hf-nix";
hf-nix.url = "github:huggingface/hf-nix/mktestdocs-0.2.5";
nixpkgs.follows = "hf-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
};
@ -40,6 +40,7 @@
++ (with python3.pkgs; [
docutils
huggingface-hub
mktestdocs
pytest
pytest-benchmark
pyyaml

View File

@ -24,6 +24,7 @@ build-backend = "setuptools.build_meta"
[dependency-groups]
dev = [
"mktestdocs>=0.2.5",
"mypy>=1.15.0",
"pytest>=8",
# Whatever version is compatible with pytest.

View File

@ -1,4 +1,4 @@
[pytest]
markers =
cuda_only: marks tests that should only run on hosts with CUDA GPUs
darwin_only: marks tests that should only run on macOS
linux_only: marks tests that should only run on Linux
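These markers are enforced by the pytest_runtest_setup hook in tests/conftest.py (updated further below); a test opts in by decorating itself. A small illustrative example with a hypothetical test name:

```python
import pytest
import torch


@pytest.mark.cuda_only  # skipped by the conftest.py hook on hosts without a CUDA GPU
def test_needs_gpu():  # hypothetical test, shown only to illustrate the marker
    assert torch.cuda.is_available()
```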

View File

@ -2,6 +2,7 @@ from kernels.layer import (
CUDAProperties,
Device,
LayerRepository,
LockedLayerRepository,
Mode,
kernelize,
register_kernel_mapping,
@ -22,6 +23,7 @@ __all__ = [
"CUDAProperties",
"Device",
"LayerRepository",
"LockedLayerRepository",
"Mode",
"get_kernel",
"get_local_kernel",

View File

@ -208,15 +208,15 @@ class LayerRepository:
# Reference a specific layer by revision
layer_repo = LayerRepository(
repo_id="username/my-kernel",
layer_name="MyLayer",
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
# Reference a layer by version constraint
layer_repo_versioned = LayerRepository(
repo_id="username/my-kernel",
layer_name="MyLayer",
version=">=1.0.0,<2.0.0"
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
version=">=0.0.3,<0.1"
)
```
"""
@ -419,22 +419,36 @@ def use_kernel_mapping(
Example:
```python
import torch
import torch.nn as nn
from torch.nn import functional as F
from kernels import use_kernel_forward_from_hub
from kernels import use_kernel_mapping, LayerRepository, Device
from kernels import kernelize
# Define a mapping
mapping = {
"LayerNorm": {
"SiluAndMul": {
"cuda": LayerRepository(
repo_id="username/experimental-kernels",
layer_name="FastLayerNorm"
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
}
}
@use_kernel_forward_from_hub("SiluAndMul")
class SiluAndMul(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
model = SiluAndMul()
# Use the mapping for the duration of the context.
with use_kernel_mapping(mapping):
# kernelize uses the temporary mapping
model = kernelize(model)
model = kernelize(model, device="cuda")
# Outside the context, original mappings are restored
```
@ -463,6 +477,7 @@ def register_kernel_mapping(
Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]],
],
],
inherit_mapping: bool = True,
):
"""
Register a global mapping between layer names and their corresponding kernel implementations.
@ -474,6 +489,9 @@ def register_kernel_mapping(
mapping (`Dict[str, Dict[Union[Device, str], Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]]]]`):
The kernel mapping to register globally. Maps layer names to device-specific kernels.
The mapping can specify different kernels for different modes (training, inference, etc.).
inherit_mapping (`bool`, *optional*, defaults to `True`):
When `True`, the current mapping will be extended by `mapping`. When `False`, the existing mappings
are erased before adding `mapping`.
Example:
```python
@ -509,6 +527,9 @@ def register_kernel_mapping(
register_kernel_mapping(advanced_mapping)
```
"""
if not inherit_mapping:
_KERNEL_MAPPING.set({})
# Merge with existing mappings.
for new_kernel, new_device_repos in mapping.items():
device_repo = _KERNEL_MAPPING.get().setdefault(new_kernel, {})
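As an illustration of the new inherit_mapping flag, a short sketch that reuses the kernels-community/activation layer from the docstring examples above: the default call extends the currently registered mappings, while inherit_mapping=False clears them first.

```python
from kernels import LayerRepository, register_kernel_mapping

mapping = {
    "SiluAndMul": {
        "cuda": LayerRepository(
            repo_id="kernels-community/activation",
            layer_name="SiluAndMul",
        )
    }
}

# Default behaviour: merge into whatever mappings are already registered.
register_kernel_mapping(mapping)

# Start from a clean slate: existing mappings are erased before adding this one.
register_kernel_mapping(mapping, inherit_mapping=False)
```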
@ -626,19 +647,23 @@ def kernelize(
Example:
```python
from kernels import kernelize, Mode, register_kernel_mapping, LayerRepository
import torch
import torch.nn as nn
from torch.nn import functional as F
@use_kernel_forward_from_hub("LayerNorm")
class LayerNorm(nn.Module):
...
from kernels import kernelize, Mode, register_kernel_mapping, LayerRepository
from kernels import use_kernel_forward_from_hub
@use_kernel_forward_from_hub("SiluAndMul")
class SiluAndMul(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
# First register some kernel mappings
mapping = {
"LayerNorm": {
"cuda": LayerRepository(
repo_id="username/fast-kernels",
layer_name="FastLayerNorm"
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
}
}
@ -646,9 +671,8 @@ def kernelize(
# Create and kernelize a model
model = nn.Sequential(
nn.Linear(768, 768),
LayerNorm(768),
nn.Linear(768, 768)
nn.Linear(1024, 2048, device="cuda"),
SiluAndMul(),
)
# Kernelize for inference
@ -776,22 +800,25 @@ def use_kernel_forward_from_hub(layer_name: str):
Example:
```python
from kernels import use_kernel_forward_from_hub
import torch
import torch.nn as nn
from kernels import use_kernel_forward_from_hub, kernelize
@use_kernel_forward_from_hub("MyCustomLayer")
class MyCustomLayer(nn.Module):
def __init__(self, hidden_size):
super().__init__()
self.hidden_size = hidden_size
def forward(self, x):
def forward(self, x: torch.Tensor):
# original implementation
return x
# The layer can now be kernelized
model = MyCustomLayer(768)
kernelized_model = kernelize(model)
# The layer can now be kernelized:
# model = kernelize(model, device="cuda")
```
"""

View File

@ -221,9 +221,13 @@ def get_kernel(
Example:
```python
import torch
from kernels import get_kernel
kernel = get_kernel("username/my-kernel")
result = kernel.kernel_function(input_data)
activation = get_kernel("kernels-community/activation")
x = torch.randn(10, 20, device="cuda")
out = torch.empty_like(x)
result = activation.silu_and_mul(out, x)
```
"""
revision = select_revision_or_version(repo_id, revision, version)

View File

@ -1,10 +1,13 @@
import sys
import pytest
import torch
has_cuda = torch.cuda.is_available() and torch.cuda.device_count() > 0
def pytest_runtest_setup(item):
if "linux_only" in item.keywords and not sys.platform.startswith("linux"):
pytest.skip("skipping Linux-only test on non-Linux platform")
if "cuda_only" in item.keywords and not has_cuda:
pytest.skip("skipping CUDA-only test on host without CUDA")
if "darwin_only" in item.keywords and not sys.platform.startswith("darwin"):
pytest.skip("skipping macOS-only test on non-macOS platform")

View File

@ -34,7 +34,7 @@ def device():
return "cuda"
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_fast(kernel, device):
x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
y = torch.empty_like(x)
@ -50,7 +50,7 @@ def test_gelu_fast(kernel, device):
assert torch.allclose(y, expected)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_local_kernel(local_kernel, device):
x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
y = torch.empty_like(x)
@ -74,7 +74,7 @@ def test_relu_metal(metal_kernel, dtype):
assert torch.allclose(y, torch.relu(x))
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"kernel_exists",
[
@ -110,7 +110,7 @@ def test_version():
)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_universal_kernel(universal_kernel):
torch.manual_seed(0)
A = torch.randint(-10, 10, (64, 128), dtype=torch.int8, device="cuda")

View File

@ -16,21 +16,21 @@ def device():
return "cuda"
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_small(kernel, device, benchmark):
x = torch.randn(32, 32, dtype=torch.float16, device=device)
y = torch.empty_like(x)
benchmark(kernel.gelu_fast, y, x)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_medium(kernel, device, benchmark):
x = torch.randn(128, 128, dtype=torch.float16, device=device)
y = torch.empty_like(x)
benchmark(kernel.gelu_fast, y, x)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_large(kernel, device, benchmark):
x = torch.randn(512, 512, dtype=torch.float16, device=device)
y = torch.empty_like(x)

tests/test_doctest.py (new file, 49 added lines)
View File

@ -0,0 +1,49 @@
import inspect
import pytest
from mktestdocs import check_docstring, get_codeblock_members
import kernels
def all_public_functions():
function_list = inspect.getmembers(kernels, inspect.isfunction)
return [func for _, func in function_list]
def all_public_classes():
class_list = inspect.getmembers(kernels, inspect.isclass)
return [cls for _, cls in class_list]
def all_public_class_members():
members = get_codeblock_members(*all_public_classes())
return members
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"func",
all_public_functions(),
ids=lambda d: d.__name__,
)
def test_func_docstring(func):
check_docstring(obj=func)
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"cls",
all_public_classes(),
ids=lambda d: d.__name__,
)
def test_class_docstring(cls):
check_docstring(obj=cls)
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"member", all_public_class_members(), ids=lambda d: d.__qualname__
)
def test_member_docstring(member):
check_docstring(member)

View File

@ -27,7 +27,7 @@ def test_download_all_hash_validation():
download_kernels(DownloadArgs(all_variants=True, project_dir=project_dir))
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_load_locked():
project_dir = Path(__file__).parent / "kernel_locking"
# Also validates that hashing works correctly.

View File

@ -13,12 +13,12 @@ from kernels import (
kernelize,
register_kernel_mapping,
use_kernel_forward_from_hub,
use_kernel_mapping,
)
from kernels.layer import (
_KERNEL_MAPPING,
CUDAProperties,
_validate_layer,
use_kernel_mapping,
)
kernel_layer_mapping = {
@ -102,7 +102,7 @@ def test_arg_kinds():
assert arg_kind("foo", "bar", kwarg1="baz", kwarg2=5) == ("foo", "bar", "baz", 5)
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulStringDevice])
@pytest.mark.parametrize("device", ["cuda", "cpu"])
def test_hub_forward(cls, device):
@ -124,7 +124,7 @@ def test_hub_forward(cls, device):
assert silu_and_mul_with_kernel.n_calls == 1
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_capability():
linear = TorchLinearWithCounter(32, 32).to("cuda")
with use_kernel_mapping(
@ -183,7 +183,7 @@ def test_layer_fallback_works():
kernelize(silu_and_mul, device="cuda", mode=Mode.INFERENCE)
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulNoCompileKernel])
@pytest.mark.parametrize("device", ["cuda"])
def test_torch_compile_layer_without_fallback(cls, device):
@ -214,7 +214,7 @@ def test_torch_compile_layer_without_fallback(cls, device):
torch.testing.assert_close(Y_compiled, Y)
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulNoCompileKernel])
@pytest.mark.parametrize("device", ["cuda"])
def test_torch_compile_layer_with_fallback(cls, device):
@ -237,8 +237,11 @@ def test_torch_compile_layer_with_fallback(cls, device):
torch.testing.assert_close(Y_compiled, Y)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_mapping_contexts():
# Make sure we start from scratch.
register_kernel_mapping(kernel_layer_mapping, inherit_mapping=False)
assert set(_KERNEL_MAPPING.get().keys()) == {
"SiluAndMul",
"SiluAndMulStringDevice",
@ -351,7 +354,7 @@ def test_validate_kernel_layer():
_validate_layer(cls=BadLayer4, check_cls=SiluAndMul)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_invalid_mode_for_mapping_rejected():
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -371,7 +374,7 @@ def test_invalid_mode_for_mapping_rejected():
kernelize(linear, mode=Mode.TRAINING)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes():
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -515,7 +518,7 @@ def test_kernel_modes():
assert linear.n_calls == 2
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_fallback_used_when_training():
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -580,7 +583,7 @@ def test_invalid_mode_rejected():
kernelize(torch.nn.Linear(32, 32), mode=Mode.TORCH_COMPILE)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes_inference():
"""Test inference-specific fallback scenarios."""
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -677,7 +680,7 @@ def test_kernel_modes_inference():
assert linear.n_calls == 4
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes_mixed():
"""Test mixed training and inference kernel scenarios."""
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -767,7 +770,7 @@ def test_kernel_modes_mixed():
assert linear.n_calls == 2
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes_cross_fallback():
"""Test cross-mode fallback scenarios from inference to training modes."""
linear = TorchLinearWithCounter(32, 32).to("cuda")