Compare commits


1 Commit

Author  SHA1        Message               Date
        43327978d6  Set version to 0.9.0  2025-08-01 14:38:18 +00:00
7 changed files with 63 additions and 92 deletions

View File

@@ -84,6 +84,12 @@ model = kernelize(model, mode=Mode.INFERENCE | Mode.TORCH_COMPILE)
model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
```
+When the `mode` argument is not specified,
+`Mode.TRAINING | Mode.TORCH_COMPILE` is used as the default. This mode
+aligns most closely with pure PyTorch layers, which also support training
+and `torch.compile`. However, to select the most performant kernels, it
+is often best to make the mode as specific as possible.
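A minimal sketch of the difference, using only the `kernelize` and `Mode` API shown above (the toy model is illustrative):

```python
import torch.nn as nn

from kernels import Mode, kernelize

model = nn.Sequential(nn.Linear(32, 32))

# Equivalent to kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE),
# since that combination is now the default.
model = kernelize(model)

# Inference-only deployment: a more specific mode lets kernelize select
# kernels that need not support backward or torch.compile.
model = kernelize(model, mode=Mode.INFERENCE)
```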
### Kernel device
Kernels can be registered per device type. For instance, separate `cuda` and
`rocm` kernels can be registered for the same layer.
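A minimal sketch of such a per-device mapping (the second repository id is hypothetical; `LayerRepository` and `use_kernel_mapping` are imported from `kernels` as elsewhere in this diff):

```python
from kernels import LayerRepository, kernelize, use_kernel_mapping

# One kernel repository per device type, registered under the same layer name.
kernel_layer_mapping = {
    "SiluAndMul": {
        "cuda": LayerRepository(
            repo_id="kernels-community/activation",
            layer_name="SiluAndMul",
        ),
        "rocm": LayerRepository(
            repo_id="kernels-community/activation-rocm",  # hypothetical repo
            layer_name="SiluAndMul",
        ),
    }
}

with use_kernel_mapping(kernel_layer_mapping):
    model = kernelize(model)
```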
@@ -151,7 +157,7 @@ used with the `use_kernel_mapping` context manager:
```python
with use_kernel_mapping(kernel_layer_mapping):
    # Use the layer for which the mapping is applied.
-    model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
+    model = kernelize(model)
```
This ensures that the mapping is not active anymore outside the `with` scope.
@@ -279,11 +285,12 @@ a kernel to a range of ROCm capabilities.
The `LocalLayerRepository` class is provided to load a repository from
a local directory. For example:
```python
with use_kernel_mapping(
    {
        "SiluAndMul": {
            "cuda": LocalLayerRepository(
                # install_kernel will give the fully-resolved path.
                repo_path="/home/daniel/kernels/activation",
                package_name="activation",
                layer_name="SiluAndMul",
            )
        }
    }
):
    model = kernelize(model)
```
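The comment above refers to `install_kernel`, which resolves a Hub kernel to a local path; a short sketch, assuming it is importable from the top-level `kernels` package as in the tests later in this diff:

```python
from kernels import install_kernel

# Returns the package name and the fully-resolved build-variant path,
# which can then be used as a LocalLayerRepository repo_path.
package_name, path = install_kernel("kernels-community/activation", "main")
```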

View File

@@ -1,6 +1,6 @@
[project]
name = "kernels"
version = "0.10.1"
version = "0.9.0"
description = "Download compute kernels"
authors = [
{ name = "OlivierDehaene", email = "olivier@huggingface.co" },

View File

@@ -1,7 +1,3 @@
-import importlib.metadata
-__version__ = importlib.metadata.version("kernels")
from kernels.layer import (
CUDAProperties,
Device,
@@ -25,7 +21,6 @@ from kernels.utils import (
)
__all__ = [
"__version__",
"CUDAProperties",
"Device",
"LayerRepository",

View File

@@ -87,7 +87,7 @@ class Device:
Args:
type (`str`):
The device type (e.g., "cuda", "mps", "rocm").
The device type (e.g., "cuda", "mps", "cpu").
properties ([`CUDAProperties`], *optional*):
Device-specific properties. Currently only [`CUDAProperties`] is supported for CUDA devices.
@@ -531,7 +531,7 @@ class _ROCMRepos(_DeviceRepos):
def _validate_device_type(device_type: str) -> None:
"""Validate that the device type is supported."""
supported_devices = {"cuda", "rocm", "mps"}
supported_devices = {"cuda", "rocm", "mps", "cpu"}
if device_type not in supported_devices:
    raise ValueError(
        f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}"
    )
@@ -578,7 +578,7 @@ def use_kernel_mapping(
from kernels import use_kernel_forward_from_hub
from kernels import use_kernel_mapping, LayerRepository, Device
-from kernels import Mode, kernelize
+from kernels import kernelize
# Define a mapping
mapping = {
@@ -601,7 +601,7 @@ def use_kernel_mapping(
# Use the mapping for the duration of the context.
with use_kernel_mapping(mapping):
# kernelize uses the temporary mapping
model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE, device="cuda")
model = kernelize(model, device="cuda")
# Outside the context, original mappings are restored
```
@@ -772,7 +772,7 @@ def _select_repository(
def kernelize(
model: "nn.Module",
*,
-    mode: Mode,
+    mode: Mode = Mode.TRAINING | Mode.TORCH_COMPILE,
device: Optional[Union[str, "torch.device"]] = None,
use_fallback: bool = True,
):
@@ -785,11 +785,11 @@
Args:
model (`nn.Module`):
The PyTorch model to kernelize.
-mode ([`Mode`]): The mode that the kernel is going to be used in. For example,
-`Mode.TRAINING | Mode.TORCH_COMPILE` kernelizes the model for training with
-`torch.compile`.
+mode ([`Mode`], *optional*, defaults to `Mode.TRAINING | Mode.TORCH_COMPILE`):
+The mode that the kernel is going to be used in. For example, `Mode.TRAINING | Mode.TORCH_COMPILE`
+kernelizes the model for training with `torch.compile`.
device (`Union[str, torch.device]`, *optional*):
The device type to load kernels for. Supported device types are: "cuda", "mps", "rocm".
The device type to load kernels for. Supported device types are: "cuda", "rocm", "mps", "cpu".
The device type will be inferred from the model parameters when not provided.
use_fallback (`bool`, *optional*, defaults to `True`):
Whether to use the original forward method of modules when no compatible kernel could be found.
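A short sketch of the strict behaviour described for `use_fallback` (assuming a model with kernel-annotated layers, as in the examples below):

```python
from kernels import Mode, kernelize

# With use_fallback=False, kernelize raises when no compatible kernel is
# found for a layer, instead of silently keeping the original forward.
model = kernelize(model, mode=Mode.INFERENCE, use_fallback=False)
```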
@@ -829,7 +829,7 @@
)
# Kernelize for inference
-kernelized_model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
+kernelized_model = kernelize(model)
```
"""
@@ -954,8 +954,7 @@ def use_kernel_forward_from_hub(layer_name: str):
import torch
import torch.nn as nn
-from kernels import use_kernel_forward_from_hub
-from kernels import Mode, kernelize
+from kernels import use_kernel_forward_from_hub, kernelize
@use_kernel_forward_from_hub("MyCustomLayer")
class MyCustomLayer(nn.Module):
@@ -970,7 +969,7 @@ def use_kernel_forward_from_hub(layer_name: str):
model = MyCustomLayer(768)
# The layer can now be kernelized:
# model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE, device="cuda")
# model = kernelize(model, device="cuda")
```
"""

View File

@@ -46,9 +46,8 @@ def build_variant() -> str:
compute_framework = f"rocm{rocm_version.major}{rocm_version.minor}"
elif torch.backends.mps.is_available():
compute_framework = "metal"
-elif torch.version.xpu is not None:
-    version = torch.version.xpu
-    compute_framework = f"xpu{version[0:4]}{version[5:6]}"
+elif hasattr(torch, "xpu") and torch.xpu.is_available():
+    compute_framework = "xpu"
else:
raise AssertionError(
"Torch was not compiled with CUDA, Metal, XPU, or ROCm enabled."
@@ -249,24 +248,8 @@ def get_local_kernel(repo_path: Path, package_name: str) -> ModuleType:
Returns:
`ModuleType`: The imported kernel module.
"""
-    variant = build_variant()
-    universal_variant = universal_build_variant()
-    # Presume we were given the top level path of the kernel repository.
-    for base_path in [repo_path, repo_path / "build"]:
-        # Prefer the universal variant if it exists.
-        for v in [universal_variant, variant]:
-            package_path = base_path / v / package_name / "__init__.py"
-            if package_path.exists():
-                return import_from_path(package_name, package_path)
-    # If we didn't find the package in the repo we may have an explicit
-    # package path.
-    package_path = repo_path / package_name / "__init__.py"
-    if package_path.exists():
-        return import_from_path(package_name, package_path)
-    raise FileNotFoundError(f"Could not find package '{package_name}' in {repo_path}")
+    package_name, package_path = _load_kernel_from_path(repo_path, package_name)
+    return import_from_path(package_name, package_path / package_name / "__init__.py")
def has_kernel(
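A usage sketch for the refactored helper (assuming `install_kernel` and `get_local_kernel` are importable from the top-level package, as the tests below suggest):

```python
from kernels import get_local_kernel, install_kernel

# Resolve a kernel checkout, then import its package from the repo root;
# path is the build-variant directory, so the grandparent is the repo root.
package_name, path = install_kernel("kernels-community/activation", "main")
kernel = get_local_kernel(path.parent.parent, package_name)
```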

View File

@@ -10,16 +10,10 @@ def kernel():
@pytest.fixture
-def local_kernel_path():
+def local_kernel():
package_name, path = install_kernel("kernels-community/activation", "main")
+# Path is the build variant path (build/torch-<...>), so the grandparent
+# is the kernel repository path.
-return package_name, path
-@pytest.fixture
-def local_kernel(local_kernel_path):
-    package_name, path = local_kernel_path
return get_local_kernel(path.parent.parent, package_name)
@@ -72,39 +66,6 @@ def test_local_kernel(local_kernel, device):
assert torch.allclose(y, expected)
-@pytest.mark.cuda_only
-def test_local_kernel_path_types(local_kernel_path, device):
-    package_name, path = local_kernel_path
-    # Top-level repo path
-    # ie: /home/ubuntu/.cache/huggingface/hub/models--kernels-community--activation/snapshots/2fafa6a3a38ccb57a1a98419047cf7816ecbc071
-    kernel = get_local_kernel(path.parent.parent, package_name)
-    x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
-    y = torch.empty_like(x)
-    kernel.gelu_fast(y, x)
-    expected = torch.tensor(
-        [[0.8408, 1.9551, 2.9961], [4.0000, 5.0000, 6.0000], [7.0000, 8.0000, 9.0000]],
-        device=device,
-        dtype=torch.float16,
-    )
-    assert torch.allclose(y, expected)
-    # Build directory path
-    # ie: /home/ubuntu/.cache/huggingface/hub/models--kernels-community--activation/snapshots/2fafa6a3a38ccb57a1a98419047cf7816ecbc071/build
-    kernel = get_local_kernel(path.parent.parent / "build", package_name)
-    y = torch.empty_like(x)
-    kernel.gelu_fast(y, x)
-    assert torch.allclose(y, expected)
-    # Explicit package path
-    # ie: /home/ubuntu/.cache/huggingface/hub/models--kernels-community--activation/snapshots/2fafa6a3a38ccb57a1a98419047cf7816ecbc071/build/torch28-cxx11-cu128-x86_64-linux
-    kernel = get_local_kernel(path, package_name)
-    y = torch.empty_like(x)
-    kernel.gelu_fast(y, x)
-    assert torch.allclose(y, expected)
@pytest.mark.darwin_only
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
def test_relu_metal(metal_kernel, dtype):

View File

@@ -110,20 +110,24 @@ def test_arg_kinds():
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulStringDevice])
-def test_hub_forward(cls):
+@pytest.mark.parametrize("device", ["cuda", "cpu"])
+def test_hub_forward(cls, device):
torch.random.manual_seed(0)
silu_and_mul = SiluAndMul()
X = torch.randn((32, 64), device="cuda")
X = torch.randn((32, 64), device=device)
Y = silu_and_mul(X)
silu_and_mul_with_kernel = kernelize(cls(), device="cuda", mode=Mode.INFERENCE)
silu_and_mul_with_kernel = kernelize(cls(), device=device, mode=Mode.INFERENCE)
Y_kernel = silu_and_mul_with_kernel(X)
torch.testing.assert_close(Y_kernel, Y)
assert silu_and_mul.n_calls == 1
-assert silu_and_mul_with_kernel.n_calls == 0
+if device == "cuda":
+    assert silu_and_mul_with_kernel.n_calls == 0
+else:
+    assert silu_and_mul_with_kernel.n_calls == 1
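(`n_calls` counts invocations of the layer's original Python `forward`: on `cuda` the mapped kernel handles the call, so the counter stays at zero, while on `cpu` no kernel is registered in this mapping and the fallback forward runs once.)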
@pytest.mark.rocm_only
@@ -484,6 +488,11 @@ def test_kernel_modes():
linear(X)
assert linear.n_calls == 0
+# Same as previous, since TRAINING | TORCH_COMPILE is the default.
+kernelize(linear)
+linear(X)
+assert linear.n_calls == 0
# Case 2: register a kernel just for training. If no base kernel
# layer is registered, we fall back to the original layer.
with use_kernel_mapping(
@@ -513,6 +522,12 @@ def test_kernel_modes():
# TRAINING | TORCH_COMPILE cannot fall back to TRAINING kernel, so uses original.
assert linear.n_calls == 1
+# Same as previous, since TRAINING | TORCH_COMPILE is the default.
+kernelize(linear)
+linear(X)
+# TRAINING | TORCH_COMPILE cannot fall back to TRAINING kernel, so uses original.
+assert linear.n_calls == 2
# Case 3: register a kernel just for training and one for fallback.
with use_kernel_mapping(
{
@@ -534,17 +549,23 @@ def test_kernel_modes():
X = torch.randn(10, 32, device="cuda")
linear(X)
# Falls back to TRAINING.
-assert linear.n_calls == 1
+assert linear.n_calls == 2
kernelize(linear, mode=Mode.TRAINING)
linear(X)
# Falls back to the TRAINING kernel.
-assert linear.n_calls == 1
+assert linear.n_calls == 2
kernelize(linear, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
linear(X)
# TRAINING | TORCH_COMPILE falls back to FALLBACK kernel.
-assert linear.n_calls == 1
+assert linear.n_calls == 2
+# Same as previous, since TRAINING | TORCH_COMPILE is the default.
+kernelize(linear)
+linear(X)
+# TRAINING | TORCH_COMPILE falls back to FALLBACK kernel.
+assert linear.n_calls == 2
# Case 4: register a kernel with two preferences.
with use_kernel_mapping(
@@ -564,17 +585,22 @@ def test_kernel_modes():
X = torch.randn(10, 32, device="cuda")
linear(X)
# Falls back to the TRAINING | TORCH_COMPILE kernel.
-assert linear.n_calls == 1
+assert linear.n_calls == 2
kernelize(linear, mode=Mode.TRAINING)
linear(X)
# TRAINING can fall back to TRAINING | TORCH_COMPILE kernel.
-assert linear.n_calls == 1
+assert linear.n_calls == 2
kernelize(linear, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
linear(X)
# Uses TRAINING | TORCH_COMPILE kernel.
-assert linear.n_calls == 1
+assert linear.n_calls == 2
+kernelize(linear)
+linear(X)
+# Same as previous, since TRAINING | TORCH_COMPILE is the default.
+assert linear.n_calls == 2
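The four cases above all turn on how mode-specific registrations fall back. A condensed sketch of a mapping with per-mode preferences, of the kind these tests exercise (repository ids are hypothetical; the `Mode`-keyed dict form is an assumption based on these tests):

```python
from kernels import LayerRepository, Mode, use_kernel_mapping

mapping = {
    "Linear": {
        "cuda": {
            # Preferred kernel when kernelizing for training.
            Mode.TRAINING: LayerRepository(
                repo_id="kernels-test/linear-training",  # hypothetical
                layer_name="Linear",
            ),
            # Used for any mode without a more specific registration.
            Mode.FALLBACK: LayerRepository(
                repo_id="kernels-test/linear-fallback",  # hypothetical
                layer_name="Linear",
            ),
        }
    }
}
```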
@pytest.mark.cuda_only