Compare commits

...

35 Commits

Author SHA1 Message Date
81fb5d34bb Use token for upload test 2025-09-12 14:08:07 +00:00
fd237b04bd up 2025-09-12 17:25:41 +05:30
0eb07f198c up 2025-09-12 17:24:48 +05:30
620bf75864 up 2025-09-12 17:23:54 +05:30
8f78116b87 command to format all files at once would be nice. 2025-09-12 14:30:16 +05:30
f1782d1914 up 2025-09-12 14:27:27 +05:30
f6c901205c up 2025-09-12 14:26:05 +05:30
6899e4bfe1 up 2025-09-12 14:10:01 +05:30
ad9cba28f7 Apply suggestions from code review
Co-authored-by: Daniël de Kok <me@danieldk.eu>
2025-09-12 14:05:06 +05:30
2f1986e01a remove duplicate imports. 2025-09-12 13:05:23 +05:30
ab607022c0 propagate. 2025-09-12 12:58:13 +05:30
02cbff1d0f add a test 2025-09-11 14:31:16 +05:30
d2d8f77d97 up 2025-09-11 13:59:01 +05:30
421f09e08a up 2025-09-11 13:48:51 +05:30
e2d43815c1 sorted imports. 2025-09-10 13:57:28 +05:30
7ee9660d2c black format 2025-09-10 13:55:53 +05:30
b56106966e remove stale files. 2025-09-10 13:49:26 +05:30
1720baac7d format 2025-09-08 14:48:27 +05:30
a6dc55ddb1 add an uploading utility. 2025-09-08 14:43:15 +05:30
e801ebf332 Set version to v0.10.0.dev0 (#137) 2025-09-05 10:48:41 +02:00
0ae07f05fc Remove default for mode argument of kernelize (#136) 2025-08-29 17:44:20 +02:00
7611021100 cpu is not (yet) a supported device type (#132)
Fixes #131.
2025-08-25 16:25:58 +02:00
767e7ccf13 fix: add get local tests (#134)
* fix: add tests for get local kernel

* fix: update test and add path example comments

* fix: run black linter
2025-08-21 13:01:48 -04:00
1caa4c1393 feat: improve get local kernel importing (#129)
* feat: improve get local kernel importing

* fix: adjust for linter
2025-08-08 10:22:29 -04:00
da701bf58a Small markup fixes of the local kernel repo example (#127) 2025-08-06 08:02:28 +02:00
703664ed31 Set version to 0.9.0.dev0 (#126) 2025-08-01 16:37:30 +02:00
a8a6564fa7 Add ROCm device discovery (#122)
* Add ROCm device discovery

* Ruff

* Address review comments

* Ruff

* Reorg torch import

* Remove redundant import

* Apply suggestions from code review

Co-authored-by: Daniël de Kok <me@danieldk.eu>

* Address review comments

* Validat device type

* Clean diff

* black

* Sync test with repo changes

* black again

---------

Co-authored-by: Daniël de Kok <me@danieldk.eu>
2025-08-01 16:09:45 +02:00
c89e0fa9b9 Nix: go back to hf-nix main (#125) 2025-08-01 15:56:02 +02:00
176a601178 Run black check (#124) 2025-08-01 15:42:38 +02:00
cfa0c76ddc Add LocalLayerRepository to load from a local repo (#123) 2025-08-01 14:03:11 +02:00
bcc29915f9 Log when using fallback layer (#121) 2025-07-31 17:18:00 +02:00
6fbff7a9cb Add doc build to CI (#119)
* Add doc build to CI

* Trigger doc build

* No path scoping
2025-07-29 16:01:05 +02:00
f7490bd0a9 Test examples in docstrings using mktestdocs (#118)
Also adjust examples so that they are correct.
2025-07-28 17:31:34 +02:00
8069e3bf0c Update documentation for compatibility with doc-builder (#117) 2025-07-24 16:21:54 +02:00
c540d1e1d6 Fix typo in layers documentation (#116) 2025-07-23 17:13:14 +02:00
33 changed files with 1269 additions and 247 deletions

View File

@ -0,0 +1,17 @@
name: Build documentation
on:
push:
branches:
- main
- doc-builder*
- v*-release
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
with:
commit_sha: ${{ github.sha }}
package: kernels
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@ -0,0 +1,15 @@
name: Build PR Documentation
on: pull_request
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: kernels

View File

@ -8,3 +8,24 @@ jobs:
- uses: actions/checkout@v4
- name: Run ruff
uses: astral-sh/ruff-action@v3
black:
name: Run black check
runs-on: ubuntu-latest
env:
UV_PYTHON_PREFERENCE: only-managed
steps:
- uses: actions/checkout@v4
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: 3.12
- name: Install black
run: uv pip install black
- name: Check formatting
run: |
uv run black --check src
uv run black --check tests

View File

@ -51,7 +51,9 @@ jobs:
run: uv run mypy src/kernels
- name: Run tests
run: uv run pytest tests
run: |
export HF_TOKEN=${{ secrets.HF_TOKEN }}
uv run pytest tests
- name: Check kernel conversion
run: |

View File

@ -0,0 +1,16 @@
name: Upload PR Documentation
on:
workflow_run:
workflows: ["Build PR Documentation"]
types:
- completed
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
with:
package_name: kernels
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

View File

@ -56,10 +56,13 @@ the Hub.
## 📚 Documentation
- [Using layers](docs/layers.md)
- [Locking kernel/layer versions](docs/locking.md)
- [Environment variables](docs/env.md)
- [Using kernels in a Docker container](docs/docker.md)
- [Kernel requirements](docs/kernel-requirements.md)
- [Frequently Asked Questions](docs/faq.md)
- [Introduction](docs/source/index.md)
- [Installation](docs/source/installation.md)
- [Basic usage](docs/source/basic-usage.md)
- [Using layers](docs/source/layers.md)
- [Locking kernel/layer versions](docs/source/locking.md)
- [Environment variables](docs/source/env.md)
- [Using kernels in a Docker container](docs/source/docker.md)
- [Kernel requirements](docs/source/kernel-requirements.md)
- [Frequently Asked Questions](docs/source/faq.md)
- [Writing kernels](https://github.com/huggingface/kernel-builder/blob/main/docs/writing-kernels.md) using [kernel-builder](https://github.com/huggingface/kernel-builder/)

View File

@ -1,8 +0,0 @@
# Using kernels in a Docker container
build and run the reference [examples/basic.py](examples/basic.py) in a Docker container with the following commands:
```bash
docker build --platform linux/amd64 -t kernels-reference -f docker/Dockerfile.reference .
docker run --gpus all -it --rm -e HF_TOKEN=$HF_TOKEN kernels-reference
```

docs/source/_toctree.yml (new file, 28 lines added)
View File

@ -0,0 +1,28 @@
- sections:
- local: index
title: Introduction
- local: installation
title: Installation
title: Getting started
- sections:
- local: basic-usage
title: Basic Usage
- local: layers
title: Using Layers
- local: locking
title: Locking Kernel Versions
- local: env
title: Environment Variables
- local: faq
title: FAQ
title: Usage Guide
- sections:
- local: api/kernels
title: Kernels
- local: api/layers
title: Layers
title: API Reference
- sections:
- local: kernel-requirements
title: Kernel Requirements
title: Developer Guide

View File

@ -0,0 +1,21 @@
# Kernels API Reference
## Main Functions
### get_kernel
[[autodoc]] kernels.get_kernel
### has_kernel
[[autodoc]] kernels.has_kernel
## Loading locked kernels
### load_kernel
[[autodoc]] kernels.load_kernel
### get_locked_kernel
[[autodoc]] kernels.get_locked_kernel

docs/source/api/layers.md (new file, 41 lines added)
View File

@ -0,0 +1,41 @@
# Layers API Reference
## Making layers kernel-aware
### use_kernel_forward_from_hub
[[autodoc]] kernels.use_kernel_forward_from_hub
### replace_kernel_forward_from_hub
[[autodoc]] kernels.replace_kernel_forward_from_hub
## Registering kernel mappings
### use_kernel_mapping
[[autodoc]] kernels.use_kernel_mapping
### register_kernel_mapping
[[autodoc]] kernels.register_kernel_mapping
## Kernelizing a model
### kernelize
[[autodoc]] kernels.kernelize
## Classes
### Device
[[autodoc]] kernels.Device
### Mode
[[autodoc]] kernels.Mode
### LayerRepository
[[autodoc]] kernels.LayerRepository

View File

@ -0,0 +1,34 @@
# Basic Usage
## Loading Kernels
Here is how you would use the [activation](https://huggingface.co/kernels-community/activation) kernels from the Hugging Face Hub:
```python
import torch
from kernels import get_kernel
# Download optimized kernels from the Hugging Face hub
activation = get_kernel("kernels-community/activation")
# Create a random tensor
x = torch.randn((10, 10), dtype=torch.float16, device="cuda")
# Run the kernel
y = torch.empty_like(x)
activation.gelu_fast(y, x)
print(y)
```
## Checking Kernel Availability
You can check if a specific kernel is available for your environment:
```python
from kernels import has_kernel
# Check if kernel is available for current environment
is_available = has_kernel("kernels-community/activation")
print(f"Kernel available: {is_available}")
```

View File

@ -2,9 +2,9 @@
## Why is the kernelization step needed?
In earlier versions of `kernels`, a layer's `forward` was replaced by
`use_kernel_forward_from_hub` and `replace_kernel_forward_from_hub`. The
new `forward` would dispatch to a kernel based on the device type,
In earlier versions of `kernels`, a layer's `forward` method was replaced
by `use_kernel_forward_from_hub` and `replace_kernel_forward_from_hub`.
The new `forward` would dispatch to a kernel based on the device type,
whether a model was training, etc. However, this approach was
fundamentally incompatible with `torch.compile` since it relied
on data-dependent branching.
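For illustration, here is a minimal sketch (hypothetical, not taken from this repository) of the dispatch style described above:

```python
import torch
import torch.nn.functional as F

class DispatchingSilu(torch.nn.Module):
    # Pre-kernelize style: the replaced `forward` picks an implementation
    # at call time based on device type, training state, etc.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.device.type == "cuda" and not self.training:
            return F.silu(x)  # stand-in for a dispatched Hub kernel
        return x * torch.sigmoid(x)  # reference implementation
```

It is this kind of branching inside `forward` that the kernelization step removes by resolving the kernel ahead of time.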

docs/source/index.md (new file, 20 lines added)
View File

@ -0,0 +1,20 @@
# Kernels
<div align="center">
<img src="https://github.com/user-attachments/assets/64a652f3-0cd3-4829-b3c1-df13f7933569" width="450" height="450" alt="kernel-builder logo">
</div>
The Kernel Hub allows Python libraries and applications to load compute
kernels directly from the [Hub](https://hf.co/). To support this kind
of dynamic loading, Hub kernels differ from traditional Python kernel
packages in that they are made to be:
- **Portable**: a kernel can be loaded from paths outside `PYTHONPATH`.
- **Unique**: multiple versions of the same kernel can be loaded in the
same Python process.
- **Compatible**: kernels must support all recent versions of Python and
the different PyTorch build configurations (various CUDA versions
and C++ ABIs). Furthermore, older C library versions must be supported.
You can [search for kernels](https://huggingface.co/models?other=kernel) on
the Hub.
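As a sketch of the "unique" property (the version specifiers below are illustrative and may not exist for this repository):

```python
from kernels import get_kernel

# Two versions of the same kernel loaded side by side in one process.
activation_old = get_kernel("kernels-community/activation", version="<0.1.0")
activation_new = get_kernel("kernels-community/activation", version=">=0.1.0")
```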

View File

@ -0,0 +1,16 @@
# Installation
Install the `kernels` package with `pip` (requires `torch>=2.5` and CUDA):
```bash
pip install kernels
```
# Using kernels in a Docker container
Build and run the reference `examples/basic.py` in a Docker container with the following commands:
```bash
docker build --platform linux/amd64 -t kernels-reference -f docker/Dockerfile.reference .
docker run --gpus all -it --rm -e HF_TOKEN=$HF_TOKEN kernels-reference
```

View File

@ -84,12 +84,6 @@ model = kernelize(model, mode=Mode.INFERENCE | Mode.TORCH_COMPILE)
model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
```
When the `mode` argument is not specified,
`Mode.TRAINING | Mode.TORCH_COMPILE` is used as the default. This mode
aligns most closely with pure PyTorch layers which also support training
and `torch.compile`. However, to select the most performant kernels, it
is often good to make the mode specific as possible.
### Kernel device
Kernels can be registered per device type. For instance, separate `cuda` and
@ -107,7 +101,7 @@ model = kernelize(model, device="cuda", mode=Mode.INFERENCE)
If the `TRAINING` and/or `TORCH_COMPILE` modes are used, but a registered
kernel does not support backward passes or `torch.compile` respectively,
`kernenize` will fall back to the original, non-kernelized, layer. You
`kernelize` will fall back to the original, non-kernelized, layer. You
can let `kernelize` raise an exception instead by using `use_fallback=False`:
```python
@ -135,6 +129,10 @@ kernel_layer_mapping = {
"cuda": LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
),
"rocm": LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
}
}
@ -153,7 +151,7 @@ used with the `use_kernel_mapping` context manager:
```python
with use_kernel_mapping(kernel_layer_mapping):
# Use the layer for which the mapping is applied.
model = kernelize(model)
model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
```
This ensures that the mapping is not active anymore outside the
@ -261,7 +259,6 @@ Capabilities behave as follows:
an existing kernel, the new kernel will replace the old kernel.
- When there are multiple kernels that support a capability, the kernel
with the smaller capability interval will be used. E.g. given:
- `KernelA` with `min_capability=80` and `max_capability=89`;
- `KernelB` with `min_capability=75` and `max_capability=89`;
- `kernelize` runs on a system with capability 8.6.
@ -270,3 +267,30 @@ Capabilities behave as follows:
than 75..89. The motivation is that kernels with smaller ranges
tend to be more optimized for a specific set of GPUs. **This behavior
might still change in the future.**
### Registering kernels for specific ROCm capabilities
Registering kernels for the ROCm architecture follows the exact same
pattern as CUDA kernels, using `min_capability` and `max_capability` to restrict
a kernel to a range of ROCm capabilities.
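For example, a sketch using the `ROCMProperties` class added in this change (the import path and the capability values are assumptions for illustration):

```python
from kernels import Device, LayerRepository, register_kernel_mapping
from kernels.layer import ROCMProperties  # import location assumed

register_kernel_mapping(
    {
        "SiluAndMul": {
            Device(
                type="rocm",
                properties=ROCMProperties(min_capability=90, max_capability=94),
            ): LayerRepository(
                repo_id="kernels-community/activation",
                layer_name="SiluAndMul",
            )
        }
    }
)
```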
### Loading from a local repository for testing
The `LocalLayerRepository` class is provided to load a repository from
a local directory. For example:
```python
with use_kernel_mapping(
{
"SiluAndMul": {
"cuda": LocalLayerRepository(
repo_path="/home/daniel/kernels/activation",
package_name="activation",
layer_name="SiluAndMul",
)
}
},
inherit_mapping=False,
):
kernelize(linear, mode=Mode.INFERENCE)
```

flake.lock (generated, 18 lines changed)
View File

@ -58,11 +58,11 @@
"nixpkgs": "nixpkgs"
},
"locked": {
"lastModified": 1750775451,
"narHash": "sha256-HiGqtwzIgUH7Xkh+wgpvHRZGooqrW0z663E6nauczA4=",
"lastModified": 1754038838,
"narHash": "sha256-oHigCT4z0ayyLyEuxdZooSXRAZP8lfOkZHzY1lx1U50=",
"owner": "huggingface",
"repo": "hf-nix",
"rev": "5943c3169e861618a6634bc8dbdb498e413ab9b7",
"rev": "336f781fa284e193baa3d4c3ce3f95fb34e9ffad",
"type": "github"
},
"original": {
@ -73,17 +73,17 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1747820358,
"narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
"owner": "danieldk",
"lastModified": 1752785354,
"narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "d3c1681180717528068082103bf323147de6ab0b",
"rev": "d38025438a6ee456758dc03188ca6873a415463b",
"type": "github"
},
"original": {
"owner": "danieldk",
"ref": "cudatoolkit-12.9-kernel-builder",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "d38025438a6ee456758dc03188ca6873a415463b",
"type": "github"
}
},

View File

@ -26,6 +26,10 @@
formatter = pkgs.nixfmt-tree;
devShells = with pkgs; rec {
default = mkShell {
nativeBuildInputs = [
# For hf-doc-builder.
nodejs
];
buildInputs =
[
black
@ -36,6 +40,7 @@
++ (with python3.pkgs; [
docutils
huggingface-hub
mktestdocs
pytest
pytest-benchmark
pyyaml

View File

@ -1,6 +1,6 @@
[project]
name = "kernels"
version = "0.8.1.dev0"
version = "0.10.0.dev0"
description = "Download compute kernels"
authors = [
{ name = "OlivierDehaene", email = "olivier@huggingface.co" },
@ -24,16 +24,20 @@ build-backend = "setuptools.build_meta"
[dependency-groups]
dev = [
"mypy >= 1.15.0",
"pytest >=8",
"mktestdocs>=0.2.5",
"mypy>=1.15.0",
"pytest>=8",
# Whatever version is compatible with pytest.
"pytest-benchmark",
"torch >=2.5",
"torch>=2.5",
"types-pyyaml"
]
[project.optional-dependencies]
torch = ["torch"]
docs = [
"hf-doc-builder",
]
[project.scripts]
kernels = "kernels.cli:main"

View File

@ -1,4 +1,5 @@
[pytest]
markers =
cuda_only: marks tests that should only run on hosts with CUDA GPUs
rocm_only: marks tests that should only run on hosts with ROCm GPUs
darwin_only: marks tests that should only run on macOS
linux_only: marks tests that should only run on Linux

View File

@ -2,6 +2,8 @@ from kernels.layer import (
CUDAProperties,
Device,
LayerRepository,
LocalLayerRepository,
LockedLayerRepository,
Mode,
kernelize,
register_kernel_mapping,
@ -22,6 +24,8 @@ __all__ = [
"CUDAProperties",
"Device",
"LayerRepository",
"LocalLayerRepository",
"LockedLayerRepository",
"Mode",
"get_kernel",
"get_local_kernel",

View File

@ -4,6 +4,8 @@ import json
import sys
from pathlib import Path
from huggingface_hub import create_repo, upload_folder
from kernels.compat import tomllib
from kernels.lockfile import KernelLock, get_kernel_locks
from kernels.utils import install_kernel, install_kernel_all_variants
@ -31,6 +33,24 @@ def main():
)
download_parser.set_defaults(func=download_kernels)
upload_parser = subparsers.add_parser("upload", help="Upload kernels to the Hub")
upload_parser.add_argument(
"kernel_dir",
type=Path,
help="Directory of the kernel build",
)
upload_parser.add_argument(
"--repo_id",
type=str,
help="Repository ID to use to upload to the Hugging Face Hub",
)
upload_parser.add_argument(
"--private",
action="store_true",
help="If the repository should be private.",
)
upload_parser.set_defaults(func=upload_kernels)
lock_parser = subparsers.add_parser("lock", help="Lock kernel revisions")
lock_parser.add_argument(
"project_dir",
@ -153,6 +173,33 @@ def lock_kernels(args):
json.dump(all_locks, f, cls=_JSONEncoder, indent=2)
def upload_kernels(args):
kernel_dir = Path(args.kernel_dir).resolve()
build_dir = kernel_dir / "build"
if not kernel_dir.is_dir():
raise ValueError(f"{kernel_dir} is not a directory")
if not build_dir.is_dir():
raise ValueError("Couldn't find `build` directory inside `kernel_dir`")
repo_id = create_repo(
repo_id=args.repo_id, private=args.private, exist_ok=True
).repo_id
delete_patterns: set[str] = set()
for build_variant in build_dir.iterdir():
if build_variant.is_dir():
delete_patterns.add(f"{build_variant.name}/**")
upload_folder(
repo_id=repo_id,
folder_path=build_dir,
path_in_repo="build",
delete_patterns=list(delete_patterns),
commit_message="Build uploaded using `kernels`.",
)
print(f"✅ Kernel upload successful. Find the kernel in https://hf.co/{repo_id}.")
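A hypothetical programmatic invocation, mirroring the `UploadArgs` pattern used by the test suite (the kernel directory and repo id are illustrative; `kernel_dir` must contain a `build/` subdirectory):

```python
from dataclasses import dataclass
from pathlib import Path

from kernels.cli import upload_kernels

@dataclass
class Args:
    kernel_dir: Path
    repo_id: str
    private: bool

# Uploads the contents of ./my-kernel/build to the given Hub repository.
upload_kernels(Args(Path("./my-kernel"), "username/my-kernel", False))
```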
class _JSONEncoder(json.JSONEncoder):
def default(self, o):
if dataclasses.is_dataclass(o):

View File

@ -13,7 +13,7 @@ from dataclasses import dataclass
from enum import Flag, auto
from functools import lru_cache
from pathlib import Path
from types import MethodType
from types import MethodType, ModuleType
from typing import (
TYPE_CHECKING,
Dict,
@ -26,13 +26,17 @@ from typing import (
from ._interval_tree import IntervalTree
from ._versions import select_revision_or_version
from .utils import _get_caller_locked_kernel, _get_locked_kernel, get_kernel
from .utils import (
_get_caller_locked_kernel,
_get_locked_kernel,
get_kernel,
get_local_kernel,
)
if TYPE_CHECKING:
import torch
from torch import nn
_DISABLE_KERNEL_MAPPING: bool = bool(int(os.environ.get("DISABLE_KERNEL_MAPPING", "0")))
@ -40,17 +44,19 @@ class Mode(Flag):
"""
Kernelize mode
The `Mode` flag is used by `kernelize` to select kernels for the given
mode. Mappings can be registered for specific modes.
The `Mode` flag is used by [`kernelize`] to select kernels for the given mode. Mappings can be registered for
specific modes.
* `INFERENCE`: The kernel is used for inference.
* `TRAINING`: The kernel is used for training.
* `TORCH_COMPILE`: The kernel is used with `torch.compile`.
* `FALLBACK`: In a kernel mapping, this kernel is used when no other mode
matches.
Attributes:
INFERENCE: The kernel is used for inference.
TRAINING: The kernel is used for training.
TORCH_COMPILE: The kernel is used with `torch.compile`.
FALLBACK: In a kernel mapping, this kernel is used when no other mode matches.
Note:
Different modes can be combined. For instance, `INFERENCE | TORCH_COMPILE` should be used for layers that
are used for inference *with* `torch.compile`.
Different modes can be combined. For instance, `INFERENCE | TORCH_COMPILE`
should be used for layers that are used for inference *with* `torch.compile`.
"""
_NONE = 0
@ -73,6 +79,36 @@ class Mode(Flag):
@dataclass(frozen=True)
class Device:
"""
Represents a compute device with optional properties.
This class encapsulates device information including device type and optional device-specific properties
like CUDA capabilities.
Args:
type (`str`):
The device type (e.g., "cuda", "mps", "rocm").
properties ([`CUDAProperties`], *optional*):
Device-specific properties. Currently only [`CUDAProperties`] is supported for CUDA devices.
Example:
```python
from kernels import Device, CUDAProperties
# Basic CUDA device
cuda_device = Device(type="cuda")
# CUDA device with specific capability requirements
cuda_device_with_props = Device(
type="cuda",
properties=CUDAProperties(min_capability=75, max_capability=90)
)
# MPS device for Apple Silicon
mps_device = Device(type="mps")
```
"""
type: str
properties: Optional[CUDAProperties] = None
@ -85,6 +121,8 @@ class Device:
"""Create an appropriate repository set for this device type."""
if self.type == "cuda":
return _CUDARepos()
elif self.type == "rocm":
return _ROCMRepos()
elif self.type == "mps":
return _MPSRepos()
else:
@ -101,6 +139,34 @@ class Device:
@dataclass(frozen=True)
class CUDAProperties:
"""
CUDA-specific device properties for capability-based kernel selection.
This class defines CUDA compute capability constraints for kernel selection, allowing kernels to specify
minimum and maximum CUDA compute capabilities they support.
Args:
min_capability (`int`):
Minimum CUDA compute capability required (e.g., 75 for compute capability 7.5).
max_capability (`int`):
Maximum CUDA compute capability supported (e.g., 90 for compute capability 9.0).
Example:
```python
from kernels import CUDAProperties, Device
# Define CUDA properties for modern GPUs (compute capability 7.5 to 9.0)
cuda_props = CUDAProperties(min_capability=75, max_capability=90)
# Create a device with these properties
device = Device(type="cuda", properties=cuda_props)
```
Note:
CUDA compute capabilities are represented as integers where the major and minor versions are concatenated.
For example, compute capability 7.5 is represented as 75, and 8.6 is represented as 86.
"""
min_capability: int
max_capability: int
@ -116,20 +182,90 @@ class CUDAProperties:
return hash((self.min_capability, self.max_capability))
@dataclass(frozen=True)
class ROCMProperties:
"""
ROCM-specific device properties for capability-based kernel selection.
This class defines ROCM compute capability constraints for kernel selection, allowing kernels to specify
minimum and maximum ROCM compute capabilities they support.
Args:
min_capability (`int`):
Minimum ROCM compute capability required (e.g., 75 for compute capability 7.5).
max_capability (`int`):
Maximum ROCM compute capability supported (e.g., 90 for compute capability 9.0).
Example:
```python
from kernels import ROCMProperties, Device
# Define ROCM properties for modern GPUs (compute capability 7.5 to 9.0)
rocm_props = ROCMProperties(min_capability=75, max_capability=90)
# Create a device with these properties
device = Device(type="rocm", properties=rocm_props)
```
Note:
ROCM compute capabilities are represented as integers where the major and minor versions are concatenated.
For example, compute capability 7.5 is represented as 75, and 8.6 is represented as 86.
"""
min_capability: int
max_capability: int
def __eq__(self, other):
if not isinstance(other, ROCMProperties):
return NotImplemented
return (
self.min_capability == other.min_capability
and self.max_capability == other.max_capability
)
def __hash__(self):
return hash((self.min_capability, self.max_capability))
class LayerRepositoryProtocol(Protocol):
@property
def layer_name(self) -> str: ...
@property
def repo_id(self) -> str: ...
@property
def revision(self) -> str: ...
def load(self) -> ModuleType: ...
class LayerRepository:
"""
Repository and name of a layer.
Repository and name of a layer for kernel mapping.
Args:
repo_id (`str`):
The Hub repository containing the layer.
layer_name (`str`):
The name of the layer within the kernel repository.
revision (`str`, *optional*, defaults to `"main"`):
The specific revision (branch, tag, or commit) to download. Cannot be used together with `version`.
version (`str`, *optional*):
The kernel version to download. This can be a Python version specifier, such as `">=1.0.0,<2.0.0"`.
Cannot be used together with `revision`.
Example:
```python
from kernels import LayerRepository
# Reference a specific layer by revision
layer_repo = LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
# Reference a layer by version constraint
layer_repo_versioned = LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
version=">=0.0.3,<0.1"
)
```
"""
def __init__(
@ -140,25 +276,12 @@ class LayerRepository:
revision: Optional[str] = None,
version: Optional[str] = None,
):
"""
Construct a layer repository.
Args:
repo_id (`str`): The Hub repository containing the layer.
revision (`str`, *optional*, defaults to `"main"`): The specific
revision (branch, tag, or commit) to download.
Cannot be used together with `version`.
version (`str`, *optional*): The kernel version to download. This
can be a Python version specifier, such as `">=1.0.0,<2.0.0"`.
Cannot be used together with `revision`.
"""
if revision is not None and version is not None:
raise ValueError(
"Either a revision or a version must be specified, not both."
)
self.repo_id = repo_id
self._repo_id = repo_id
self.layer_name = layer_name
# We are going to resolve these lazily, since we do not want
@ -166,24 +289,85 @@ class LayerRepository:
self._revision = revision
self._version = version
@property
@functools.lru_cache()
def revision(self) -> str:
def _resolve_revision(self) -> str:
return select_revision_or_version(
repo_id=self.repo_id, revision=self._revision, version=self._version
repo_id=self._repo_id, revision=self._revision, version=self._version
)
def load(self) -> ModuleType:
return get_kernel(self._repo_id, revision=self._resolve_revision())
def __eq__(self, other):
return (
isinstance(other, LayerRepository)
and self.layer_name == other.layer_name
and self.repo_id == other.repo_id
and self._repo_id == other._repo_id
and self._revision == other._revision
and self._version == other._version
)
def __hash__(self):
return hash((self.layer_name, self.repo_id, self._revision, self._version))
return hash((self.layer_name, self._repo_id, self._revision, self._version))
def __str__(self) -> str:
return f"`{self._repo_id}` (revision: {self._resolve_revision()}) for layer `{self.layer_name}`"
class LocalLayerRepository:
"""
Repository from a local directory for kernel mapping.
Args:
repo_path (`Path`):
The local repository containing the layer.
package_name (`str`):
Package name of the kernel.
layer_name (`str`):
The name of the layer within the kernel repository.
Example:
```python
from pathlib import Path
from kernels import LocalLayerRepository
# Reference a specific layer by revision
layer_repo = LocalLayerRepository(
repo_path=Path("/home/daniel/kernels/activation"),
package_name="activation",
layer_name="SiluAndMul",
)
```
"""
def __init__(
self,
repo_path: Path,
*,
package_name: str,
layer_name: str,
):
self._repo_path = repo_path
self._package_name = package_name
self.layer_name = layer_name
def load(self) -> ModuleType:
return get_local_kernel(self._repo_path, self._package_name)
def __eq__(self, other):
return (
isinstance(other, LocalLayerRepository)
and self.layer_name == other.layer_name
and self._repo_path == other._repo_path
and self._package_name == other._package_name
)
def __hash__(self):
return hash((self.layer_name, self._repo_path, self._package_name))
def __str__(self) -> str:
return f"`{self._repo_path}` (package: {self._package_name}) for layer `{self.layer_name}`"
class LockedLayerRepository:
@ -207,33 +391,38 @@ class LockedLayerRepository:
Args:
repo_id (`str`): The Hub repository containing the layer.
"""
self.repo_id = repo_id
self.lockfile = lockfile
self._repo_id = repo_id
self._lockfile = lockfile
self.layer_name = layer_name
@property
@functools.lru_cache()
def revision(self) -> str:
if self.lockfile is None:
locked_sha = _get_caller_locked_kernel(self.repo_id)
def _resolve_revision(self) -> str:
if self._lockfile is None:
locked_sha = _get_caller_locked_kernel(self._repo_id)
else:
with open(self.lockfile, "r") as f:
locked_sha = _get_locked_kernel(self.repo_id, f.read())
with open(self._lockfile, "r") as f:
locked_sha = _get_locked_kernel(self._repo_id, f.read())
if locked_sha is None:
raise ValueError(f"Kernel `{self.repo_id}` is not locked")
raise ValueError(f"Kernel `{self._repo_id}` is not locked")
return locked_sha
def load(self) -> ModuleType:
return get_kernel(repo_id=self._repo_id, revision=self._resolve_revision())
def __eq__(self, other):
return (
isinstance(other, LockedLayerRepository)
and self.layer_name == other.layer_name
and self.repo_id == other.repo_id
and self._repo_id == other._repo_id
)
def __hash__(self):
return hash((self.layer_name, self.repo_id))
return hash((self.layer_name, self._repo_id))
def __str__(self) -> str:
return f"`{self._repo_id}` (revision: {self._resolve_revision()}) for layer `{self.layer_name}`"
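A short usage sketch (constructor keywords assumed from the attributes set above; the revision itself is resolved from the caller's lockfile):

```python
from kernels import LockedLayerRepository

repo = LockedLayerRepository(
    repo_id="kernels-community/activation",
    layer_name="SiluAndMul",
)
```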
_CACHED_LAYER: Dict[LayerRepositoryProtocol, Type["nn.Module"]] = {}
@ -309,6 +498,46 @@ class _CUDARepos(_DeviceRepos):
self.repos_by_capability.insert(min_capability, max_capability, repos)
class _ROCMRepos(_DeviceRepos):
_repos: IntervalTree[Dict[Mode, LayerRepositoryProtocol]]
def __init__(self):
super().__init__()
self.repos_by_capability = IntervalTree()
@property
def repos(
self,
) -> Optional[Dict[Mode, LayerRepositoryProtocol]]:
capability = _find_capability()
return self.repos_by_capability.find_smallest_interval(capability)
def insert(self, device: Device, repos: Dict[Mode, LayerRepositoryProtocol]):
assert device.properties is None or isinstance(
device.properties, ROCMProperties
)
min_capability = (
0 if device.properties is None else device.properties.min_capability
)
max_capability = (
sys.maxsize
if device.properties is None
else device.properties.max_capability
)
self.repos_by_capability.insert(min_capability, max_capability, repos)
def _validate_device_type(device_type: str) -> None:
"""Validate that the device type is supported."""
supported_devices = {"cuda", "rocm", "mps"}
if device_type not in supported_devices:
raise ValueError(
f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}"
)
_KERNEL_MAPPING: ContextVar[Dict[str, Dict[str, _DeviceRepos]]] = ContextVar(
"_KERNEL_MAPPING", default={}
)
@ -326,11 +555,56 @@ def use_kernel_mapping(
inherit_mapping: bool = True,
):
"""
Context manager that sets a mapping for a duration of the context.
Context manager that sets a kernel mapping for the duration of the context.
When `inherit_mapping` is set to `True` the current mapping will be
extended by `mapping` inside the context. If it is `False`, only
`mapping` is used inside the context.
This function allows temporary kernel mappings to be applied within a specific context, enabling different
kernel configurations for different parts of your code.
Args:
mapping (`Dict[str, Dict[Union[Device, str], Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]]]]`):
The kernel mapping to apply. Maps layer names to device-specific kernel configurations.
inherit_mapping (`bool`, *optional*, defaults to `True`):
When `True`, the current mapping will be extended by `mapping` inside the context. When `False`,
only `mapping` is used inside the context.
Returns:
Context manager that handles the temporary kernel mapping.
Example:
```python
import torch
import torch.nn as nn
from torch.nn import functional as F
from kernels import use_kernel_forward_from_hub
from kernels import use_kernel_mapping, LayerRepository, Device
from kernels import Mode, kernelize
# Define a mapping
mapping = {
"SiluAndMul": {
"cuda": LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
}
}
@use_kernel_forward_from_hub("SiluAndMul")
class SiluAndMul(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
model = SiluAndMul()
# Use the mapping for the duration of the context.
with use_kernel_mapping(mapping):
# kernelize uses the temporary mapping
model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE, device="cuda")
# Outside the context, original mappings are restored
```
"""
class ContextManager:
@ -356,29 +630,59 @@ def register_kernel_mapping(
Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]],
],
],
inherit_mapping: bool = True,
):
"""
Allows one to register a mapping between a layer name and the corresponding
kernel(s) to use, depending on the device. This should be used in conjunction
with `kernelize`.
Register a global mapping between layer names and their corresponding kernel implementations.
Example usage:
This function allows you to register a mapping between a layer name and the corresponding kernel(s) to use,
depending on the device and mode. This should be used in conjunction with [`kernelize`].
```python
from kernels import LayerRepository, register_kernel_mapping
Args:
mapping (`Dict[str, Dict[Union[Device, str], Union[LayerRepositoryProtocol, Dict[Mode, LayerRepositoryProtocol]]]]`):
The kernel mapping to register globally. Maps layer names to device-specific kernels.
The mapping can specify different kernels for different modes (training, inference, etc.).
inherit_mapping (`bool`, *optional*, defaults to `True`):
When `True`, the current mapping will be extended by `mapping`. When `False`, the existing mappings
are erased before adding `mapping`.
kernel_layer_mapping = {
"LlamaRMSNorm": {
"cuda": LayerRepository(
repo_id="kernels-community/activation",
layer_name="RmsNorm",
revision="layers",
),
},
}
register_kernel_mapping(kernel_layer_mapping)
```
Example:
```python
from kernels import LayerRepository, register_kernel_mapping, Mode
# Simple mapping for a single kernel per device
kernel_layer_mapping = {
"LlamaRMSNorm": {
"cuda": LayerRepository(
repo_id="kernels-community/activation",
layer_name="RmsNorm",
revision="layers",
),
},
}
register_kernel_mapping(kernel_layer_mapping)
# Advanced mapping with mode-specific kernels
advanced_mapping = {
"MultiHeadAttention": {
"cuda": {
Mode.TRAINING: LayerRepository(
repo_id="username/training-kernels",
layer_name="TrainingAttention"
),
Mode.INFERENCE: LayerRepository(
repo_id="username/inference-kernels",
layer_name="FastAttention"
),
}
}
}
register_kernel_mapping(advanced_mapping)
```
"""
if not inherit_mapping:
_KERNEL_MAPPING.set({})
# Merge with existing mappings.
for new_kernel, new_device_repos in mapping.items():
device_repo = _KERNEL_MAPPING.get().setdefault(new_kernel, {})
@ -401,15 +705,20 @@ def replace_kernel_forward_from_hub(
layer_name: str,
):
"""
Decorator that prepares a layer class to use a kernel from the Hugging Face Hub.
Function that prepares a layer class to use kernels from the Hugging Face Hub.
This decorator stores the layer name and original forward method, which will be used
by the kernelize function to replace the forward implementation with the appropriate
kernel from the hub.
It is recommended to use the [`use_kernel_forward_from_hub`] decorator instead.
This function should only be used as a last resort to extend third-party layers;
it is inherently fragile, since the member variables and `forward` signature
of such a layer can change.
Args:
cls: The layer class to decorate
layer_name: The name of the layer to use for kernel lookup
Example:
```python
from kernels import replace_kernel_forward_from_hub
import torch.nn as nn
replace_kernel_forward_from_hub(nn.LayerNorm, "LayerNorm")
```
"""
cls.kernel_layer_name = layer_name
@ -463,30 +772,66 @@ def _select_repository(
def kernelize(
model: "nn.Module",
*,
mode: Mode = Mode.TRAINING | Mode.TORCH_COMPILE,
mode: Mode,
device: Optional[Union[str, "torch.device"]] = None,
use_fallback: bool = True,
):
"""
Iterate over all modules in the model and replace the `forward` method of
extensible layers for which kernels are registered using `register_kernel_mapping`
or `use_kernel_mapping`.
Replace layer forward methods with optimized kernel implementations.
This function iterates over all modules in the model and replaces the `forward` method of extensible layers
for which kernels are registered using [`register_kernel_mapping`] or [`use_kernel_mapping`].
Args:
model: The PyTorch model to kernelize
mode: the mode that the kernel is going to be used in (e.g.
`Mode.TRAINING | Mode.TORCH_COMPILE` kernelizes the model for training
and `torch.compile`).
device: The device type to load kernels for. The device type will be inferred
from the parameters of the model when not provided.
use_fallback: Whether to use the original forward method of modules when no
compatible kernel could be found. If set to `False`, an exception will
be raised in such cases.
model (`nn.Module`):
The PyTorch model to kernelize.
mode ([`Mode`]): The mode that the kernel is going to be used in. For example,
`Mode.TRAINING | Mode.TORCH_COMPILE` kernelizes the model for training with
`torch.compile`.
device (`Union[str, torch.device]`, *optional*):
The device type to load kernels for. Supported device types are: "cuda", "mps", "rocm".
The device type will be inferred from the model parameters when not provided.
use_fallback (`bool`, *optional*, defaults to `True`):
Whether to use the original forward method of modules when no compatible kernel could be found.
If set to `False`, an exception will be raised in such cases.
Returns:
The kernelized model
`nn.Module`: The kernelized model with optimized kernel implementations.
Example:
```python
import torch
import torch.nn as nn
from kernels import kernelize, Mode, register_kernel_mapping, LayerRepository
from kernels import use_kernel_forward_from_hub
@use_kernel_forward_from_hub("SiluAndMul")
class SiluAndMul(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
mapping = {
"LayerNorm": {
"cuda": LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
}
}
register_kernel_mapping(mapping)
# Create and kernelize a model
model = nn.Sequential(
nn.Linear(1024, 2048, device="cuda"),
SiluAndMul(),
)
# Kernelize for inference
kernelized_model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
```
"""
import torch
if mode == Mode.FALLBACK:
raise ValueError("Mode.FALLBACK can only be used to register kernel mappings.")
@ -500,7 +845,8 @@ def kernelize(
if device is None:
device_type = _find_device(model)
elif isinstance(device, str):
device_type = Device(type=torch.device(device).type)
_validate_device_type(device)
device_type = Device(type=device)
else:
device_type = Device(device.type)
@ -566,9 +912,7 @@ def kernelize(
repo, repo_mode = repo_with_mode
logging.info(
f"Using layer `{repo.layer_name}` from repo `{repo.repo_id}` (revision: {repo.revision}) for layer `{layer_name}`"
)
logging.info(f"Using layer `{repo.layer_name}` from repo {repo}")
logging.debug(f"kernelize mode: {mode}, repo mode: {repo_mode}")
layer = _get_layer_memoize(repo, module_class)
@ -593,7 +937,41 @@ def kernelize(
def use_kernel_forward_from_hub(layer_name: str):
"""
Make a layer extensible using the name `layer_name`.
Decorator factory that makes a layer extensible using the specified layer name.
This is a decorator factory that returns a decorator which prepares a layer class to use kernels from the
Hugging Face Hub.
Args:
layer_name (`str`):
The name of the layer to use for kernel lookup in registered mappings.
Returns:
`Callable`: A decorator function that can be applied to layer classes.
Example:
```python
import torch
import torch.nn as nn
from kernels import use_kernel_forward_from_hub
from kernels import Mode, kernelize
@use_kernel_forward_from_hub("MyCustomLayer")
class MyCustomLayer(nn.Module):
def __init__(self, hidden_size):
super().__init__()
self.hidden_size = hidden_size
def forward(self, x: torch.Tensor):
# original implementation
return x
model = MyCustomLayer(768)
# The layer can now be kernelized:
# model = kernelize(model, mode=Mode.TRAINING | Mode.TORCH_COMPILE, device="cuda")
```
"""
def decorator(cls):
@ -603,21 +981,17 @@ def use_kernel_forward_from_hub(layer_name: str):
return decorator
def _get_kernel_layer(
*, repo_id: str, layer_name: str, revision: str
) -> Type["nn.Module"]:
def _get_kernel_layer(repo: LayerRepositoryProtocol) -> Type["nn.Module"]:
"""Get a layer from a kernel."""
kernel = get_kernel(repo_id, revision=revision)
kernel = repo.load()
if getattr(kernel, "layers", None) is None:
raise ValueError(
f"Kernel `{repo_id}` at revision `{revision}` does not define any layers."
)
raise ValueError(f"Kernel repo {repo} does not define any layers.")
layer = getattr(kernel.layers, layer_name, None)
layer = getattr(kernel.layers, repo.layer_name, None)
if layer is None:
raise ValueError(f"Layer `{layer_name}` not found in kernel `{repo_id}`.")
raise ValueError(f"Layer `{repo.layer_name}` not found in kernel repo {repo}.")
return layer
@ -661,6 +1035,18 @@ def _validate_layer(*, check_cls, cls):
)
def _is_cuda_platform():
import torch
return torch.version.cuda is not None
def _is_rocm_platform():
import torch
return torch.version.hip is not None
def _find_device(model: "nn.Module") -> Device:
try:
param = next(model.parameters())
@ -669,7 +1055,15 @@ def _find_device(model: "nn.Module") -> Device:
"Cannot determine model device, provide as `device` argument to `kernelize`."
)
return Device(type=param.device.type)
dev_type = param.device.type
if dev_type == "cuda":
# Refine based on actual platform
if _is_rocm_platform():
return Device(type="rocm")
elif _is_cuda_platform():
return Device(type="cuda")
return Device(type=dev_type)
@lru_cache
@ -694,13 +1088,19 @@ def _conditionally_replace_forward(
# layers registered with the FALLBACK mode never get rejected by
# _validate_layer_has_mode. For such layers, we want to fall back in
# case the layer does not support the given mode.
needs_fallback = Mode.TORCH_COMPILE in mode and not getattr(
needs_fallback_for_compile = Mode.TORCH_COMPILE in mode and not getattr(
layer, "can_torch_compile", False
)
needs_fallback |= Mode.TRAINING in mode and not getattr(layer, "has_backward", True)
needs_fallback_for_backward = Mode.TRAINING in mode and not getattr(
layer, "has_backward", True
)
if needs_fallback:
if needs_fallback_for_compile or needs_fallback_for_backward:
if use_fallback:
if needs_fallback_for_compile:
logging.info("Layer does not support torch.compile, using fallback")
if needs_fallback_for_backward:
logging.info("Layer does not support backward, using fallback")
_replace_forward(module, module_class)
else:
raise ValueError(f"Available kernel does not support mode: {mode}")
@ -725,7 +1125,7 @@ def _validate_layer_has_mode(
if Mode.TRAINING in repo_mode and not getattr(module, "has_backward", True):
raise ValueError(
f"Layer `{repo.layer_name}` ({repo.repo_id}, revision: {repo.revision}) does not support backward.\n"
f"Layer `{repo.layer_name}` from repo {repo} does not support backward.\n"
f"Was registered for `{layer_name}` with mode `{repo_mode}`"
)
@ -733,7 +1133,7 @@ def _validate_layer_has_mode(
module, "can_torch_compile", False
):
raise ValueError(
f"Layer `{repo.layer_name}` ({repo.repo_id}, revision: {repo.revision}) does not support torch.compile.\n"
f"Layer `{repo.layer_name}` from repo {repo} does not support torch.compile.\n"
f"Was registered for `{layer_name}` with mode `{repo_mode}`"
)
@ -747,11 +1147,7 @@ def _get_layer_memoize(
if layer is not None:
return layer
layer = _get_kernel_layer(
repo_id=repo.repo_id,
layer_name=repo.layer_name,
revision=repo.revision,
)
layer = _get_kernel_layer(repo)
_validate_layer(check_cls=module_class, cls=layer)
_CACHED_LAYER[repo] = layer

View File

@ -98,7 +98,20 @@ def install_kernel(
"""
Download a kernel for the current environment to the cache.
The output path is validated againt `hash` when set.
The output path is validated against the hashes in `variant_locks` when provided.
Args:
repo_id (`str`):
The Hub repository containing the kernel.
revision (`str`):
The specific revision (branch, tag, or commit) to download.
local_files_only (`bool`, *optional*, defaults to `False`):
Whether to only use local files and not download from the Hub.
variant_locks (`Dict[str, VariantLock]`, *optional*):
Optional dictionary of variant locks for validation.
Returns:
`Tuple[str, Path]`: A tuple containing the package name and the path to the variant directory.
"""
package_name = package_name_from_repo_id(repo_id)
variant = build_variant()
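For reference, a minimal usage sketch, mirroring how the test suite calls this function:

```python
from kernels.utils import install_kernel

# Download (or reuse from the cache) the build variant for this environment.
package_name, variant_path = install_kernel("kernels-community/activation", "main")
```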
@ -190,23 +203,31 @@ def get_kernel(
) -> ModuleType:
"""
Load a kernel from the kernel hub.
This function downloads a kernel to the local Hugging Face Hub cache
directory (if it was not downloaded before) and then loads the kernel.
This function downloads a kernel to the local Hugging Face Hub cache directory (if it was not downloaded before)
and then loads the kernel.
Args:
repo_id (`str`): The Hub repository containing the kernel.
revision (`str`, *optional*, defaults to `"main"`): The specific
revision (branch, tag, or commit) to download.
Cannot be used together with `version`.
version (`str`, *optional*): The kernel version to download. This
can be a Python version specifier, such as `">=1.0.0,<2.0.0"`.
repo_id (`str`):
The Hub repository containing the kernel.
revision (`str`, *optional*, defaults to `"main"`):
The specific revision (branch, tag, or commit) to download. Cannot be used together with `version`.
version (`str`, *optional*):
The kernel version to download. This can be a Python version specifier, such as `">=1.0.0,<2.0.0"`.
Cannot be used together with `revision`.
Returns:
`ModuleType`: The imported kernel module.
Example:
```python
import torch
from kernels import get_kernel
kernel = get_kernel("username/my-kernel")
result = kernel.kernel_function(input_data)
activation = get_kernel("kernels-community/activation")
x = torch.randn(10, 20, device="cuda")
out = torch.empty_like(x)
result = activation.silu_and_mul(out, x)
```
"""
revision = select_revision_or_version(repo_id, revision, version)
@ -217,28 +238,53 @@ def get_kernel(
def get_local_kernel(repo_path: Path, package_name: str) -> ModuleType:
"""
Import a kernel from a local kernel repository path.
Args:
repo_path (`Path`):
The local path to the kernel repository.
package_name (`str`):
The name of the package to import from the repository.
Returns:
`ModuleType`: The imported kernel module.
"""
package_name, package_path = _load_kernel_from_path(repo_path, package_name)
return import_from_path(package_name, package_path / package_name / "__init__.py")
variant = build_variant()
universal_variant = universal_build_variant()
# Presume we were given the top level path of the kernel repository.
for base_path in [repo_path, repo_path / "build"]:
# Prefer the universal variant if it exists.
for v in [universal_variant, variant]:
package_path = base_path / v / package_name / "__init__.py"
if package_path.exists():
return import_from_path(package_name, package_path)
# If we didn't find the package in the repo, we may have an explicit
# package path.
package_path = repo_path / package_name / "__init__.py"
if package_path.exists():
return import_from_path(package_name, package_path)
raise FileNotFoundError(f"Could not find package '{package_name}' in {repo_path}")
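A short usage sketch (the path is illustrative):

```python
from pathlib import Path

from kernels import get_local_kernel

# Accepts the repository root, its build/ directory, or an explicit
# build-variant directory, as implemented above.
activation = get_local_kernel(Path("/path/to/activation"), "activation")
```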
def has_kernel(
repo_id: str, revision: Optional[str] = None, version: Optional[str] = None
) -> bool:
"""
Check whether a kernel build exists for the current environment
(Torch version and compute framework).
Check whether a kernel build exists for the current environment (Torch version and compute framework).
Args:
repo_id (`str`): The Hub repository containing the kernel.
revision (`str`, *optional*, defaults to `"main"`): The specific
revision (branch, tag, or commit) to download.
Cannot be used together with `version`.
version (`str`, *optional*): The kernel version to download. This
can be a Python version specifier, such as `">=1.0.0,<2.0.0"`.
repo_id (`str`):
The Hub repository containing the kernel.
revision (`str`, *optional*, defaults to `"main"`):
The specific revision (branch, tag, or commit) to download. Cannot be used together with `version`.
version (`str`, *optional*):
The kernel version to download. This can be a Python version specifier, such as `">=1.0.0,<2.0.0"`.
Cannot be used together with `revision`.
Returns:
`bool`: `true` if a kernel is avaialble for the current environment.
`bool`: `True` if a kernel is available for the current environment.
"""
revision = select_revision_or_version(repo_id, revision, version)
@ -264,8 +310,16 @@ def load_kernel(repo_id: str, *, lockfile: Optional[Path] = None) -> ModuleType:
"""
Get a pre-downloaded, locked kernel.
If `lockfile` is not specified, the lockfile will be loaded from the
caller's package metadata.
If `lockfile` is not specified, the lockfile will be loaded from the caller's package metadata.
Args:
repo_id (`str`):
The Hub repository containing the kernel.
lockfile (`Path`, *optional*):
Path to the lockfile. If not provided, the lockfile will be loaded from the caller's package metadata.
Returns:
`ModuleType`: The imported kernel module.
"""
if lockfile is None:
locked_sha = _get_caller_locked_kernel(repo_id)
@ -310,7 +364,18 @@ def load_kernel(repo_id: str, *, lockfile: Optional[Path] = None) -> ModuleType:
def get_locked_kernel(repo_id: str, local_files_only: bool = False) -> ModuleType:
"""Get a kernel using a lock file."""
"""
Get a kernel using a lock file.
Args:
repo_id (`str`):
The Hub repository containing the kernel.
local_files_only (`bool`, *optional*, defaults to `False`):
Whether to only use local files and not download from the Hub.
Returns:
`ModuleType`: The imported kernel module.
"""
locked_sha = _get_caller_locked_kernel(repo_id)
if locked_sha is None:

View File

@ -1,10 +1,24 @@
import sys
import pytest
import torch
has_cuda = (
hasattr(torch.version, "cuda")
and torch.version.cuda is not None
and torch.cuda.device_count() > 0
)
has_rocm = (
hasattr(torch.version, "hip")
and torch.version.hip is not None
and torch.cuda.device_count() > 0
)
def pytest_runtest_setup(item):
if "linux_only" in item.keywords and not sys.platform.startswith("linux"):
pytest.skip("skipping Linux-only test on non-Linux platform")
if "cuda_only" in item.keywords and not has_cuda:
pytest.skip("skipping CUDA-only test on host without CUDA")
if "rocm_only" in item.keywords and not has_rocm:
pytest.skip("skipping ROCm-only test on host without ROCm")
if "darwin_only" in item.keywords and not sys.platform.startswith("darwin"):
pytest.skip("skipping macOS-only test on non-macOS platform")

View File

@ -10,10 +10,16 @@ def kernel():
@pytest.fixture
def local_kernel():
def local_kernel_path():
package_name, path = install_kernel("kernels-community/activation", "main")
# Path is the build variant path (build/torch-<...>), so the grandparent
# is the kernel repository path.
return package_name, path
@pytest.fixture
def local_kernel(local_kernel_path):
package_name, path = local_kernel_path
return get_local_kernel(path.parent.parent, package_name)
@ -34,7 +40,7 @@ def device():
return "cuda"
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_fast(kernel, device):
x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
y = torch.empty_like(x)
@ -50,7 +56,7 @@ def test_gelu_fast(kernel, device):
assert torch.allclose(y, expected)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_local_kernel(local_kernel, device):
x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
y = torch.empty_like(x)
@ -66,6 +72,39 @@ def test_local_kernel(local_kernel, device):
assert torch.allclose(y, expected)
@pytest.mark.cuda_only
def test_local_kernel_path_types(local_kernel_path, device):
package_name, path = local_kernel_path
# Top-level repo path
# ie: /home/ubuntu/.cache/huggingface/hub/models--kernels-community--activation/snapshots/2fafa6a3a38ccb57a1a98419047cf7816ecbc071
kernel = get_local_kernel(path.parent.parent, package_name)
x = torch.arange(1, 10, dtype=torch.float16, device=device).view(3, 3)
y = torch.empty_like(x)
kernel.gelu_fast(y, x)
expected = torch.tensor(
[[0.8408, 1.9551, 2.9961], [4.0000, 5.0000, 6.0000], [7.0000, 8.0000, 9.0000]],
device=device,
dtype=torch.float16,
)
assert torch.allclose(y, expected)
# Build directory path
# ie: /home/ubuntu/.cache/huggingface/hub/models--kernels-community--activation/snapshots/2fafa6a3a38ccb57a1a98419047cf7816ecbc071/build
kernel = get_local_kernel(path.parent.parent / "build", package_name)
y = torch.empty_like(x)
kernel.gelu_fast(y, x)
assert torch.allclose(y, expected)
# Explicit package path
# ie: /home/ubuntu/.cache/huggingface/hub/models--kernels-community--activation/snapshots/2fafa6a3a38ccb57a1a98419047cf7816ecbc071/build/torch28-cxx11-cu128-x86_64-linux
kernel = get_local_kernel(path, package_name)
y = torch.empty_like(x)
kernel.gelu_fast(y, x)
assert torch.allclose(y, expected)
@pytest.mark.darwin_only
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
def test_relu_metal(metal_kernel, dtype):
@ -74,7 +113,7 @@ def test_relu_metal(metal_kernel, dtype):
assert torch.allclose(y, torch.relu(x))
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"kernel_exists",
[
@ -110,7 +149,7 @@ def test_version():
)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_universal_kernel(universal_kernel):
torch.manual_seed(0)
A = torch.randint(-10, 10, (64, 128), dtype=torch.int8, device="cuda")

View File

@ -16,21 +16,21 @@ def device():
return "cuda"
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_small(kernel, device, benchmark):
x = torch.randn(32, 32, dtype=torch.float16, device=device)
y = torch.empty_like(x)
benchmark(kernel.gelu_fast, y, x)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_medium(kernel, device, benchmark):
x = torch.randn(128, 128, dtype=torch.float16, device=device)
y = torch.empty_like(x)
benchmark(kernel.gelu_fast, y, x)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_gelu_large(kernel, device, benchmark):
x = torch.randn(512, 512, dtype=torch.float16, device=device)
y = torch.empty_like(x)

tests/test_doctest.py (new file, 49 lines added)
View File

@ -0,0 +1,49 @@
import inspect
import pytest
from mktestdocs import check_docstring, get_codeblock_members
import kernels
def all_public_functions():
function_list = inspect.getmembers(kernels, inspect.isfunction)
return [func for _, func in function_list]
def all_public_classes():
class_list = inspect.getmembers(kernels, inspect.isclass)
return [cls for _, cls in class_list]
def all_public_class_members():
members = get_codeblock_members(*all_public_classes())
return members
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"func",
all_public_functions(),
ids=lambda d: d.__name__,
)
def test_func_docstring(func):
check_docstring(obj=func)
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"cls",
all_public_classes(),
ids=lambda d: d.__name__,
)
def test_class_docstring(cls):
check_docstring(obj=cls)
@pytest.mark.cuda_only
@pytest.mark.parametrize(
"member", all_public_class_members(), ids=lambda d: d.__qualname__
)
def test_member_docstring(member):
check_docstring(member)

View File

@ -27,7 +27,7 @@ def test_download_all_hash_validation():
download_kernels(DownloadArgs(all_variants=True, project_dir=project_dir))
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_load_locked():
project_dir = Path(__file__).parent / "kernel_locking"
# Also validates that hashing works correctly.

View File

@ -0,0 +1,86 @@
import logging
import os
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import List
from huggingface_hub import model_info
from kernels.cli import upload_kernels
REPO_ID = "kernels-test/kernels-upload-test"
PY_CONTENT = """\
#!/usr/bin/env python3
def main():
print("Hello from torch-universal!")
if __name__ == "__main__":
main()
"""
@dataclass
class UploadArgs:
kernel_dir: Path
repo_id: str
private: bool
def next_filename(path: Path) -> Path:
"""
Given a path like foo_2050.py, return foo_2051.py.
"""
m = re.match(r"^(.*?)(\d+)(\.py)$", path.name)
if not m:
raise ValueError(
f"Filename {path.name!r} does not match pattern <prefix>_<number>.py"
)
prefix, number, suffix = m.groups()
new_number = str(int(number) + 1).zfill(len(number))
return path.with_name(f"{prefix}{new_number}{suffix}")
def get_filename_to_change(repo_filenames):
for f in repo_filenames:
if "foo" in f and f.endswith(".py"):
filename_to_change = os.path.basename(f)
break
assert filename_to_change
return filename_to_change
def get_filenames_from_a_repo(repo_id: str) -> List[str]:
    try:
        repo_info = model_info(repo_id=repo_id, files_metadata=True)
        repo_siblings = repo_info.siblings
        if repo_siblings is not None:
            return [f.rfilename for f in repo_siblings]
        else:
            raise ValueError("No repo siblings found.")
    except Exception as e:
        logging.error(f"Error connecting to the Hub: {e}.")
        raise
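# Names returned by the Hub are repo-relative, e.g. something like
# "build/torch-universal/upload_test/foo_0001.py" (hypothetical), which is why
# the assertions below use substring matching instead of equality.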
def test_kernel_upload_deletes_as_expected():
    repo_filenames = get_filenames_from_a_repo(REPO_ID)
    filename_to_change = get_filename_to_change(repo_filenames)

    with tempfile.TemporaryDirectory() as tmpdir:
        path = f"{tmpdir}/build/torch-universal/upload_test"
        build_dir = Path(path)
        build_dir.mkdir(parents=True, exist_ok=True)
        changed_filename = next_filename(Path(filename_to_change))
        script_path = build_dir / changed_filename
        script_path.write_text(PY_CONTENT)
        upload_kernels(UploadArgs(tmpdir, REPO_ID, False))

    repo_filenames = get_filenames_from_a_repo(REPO_ID)
    assert any(str(changed_filename) in k for k in repo_filenames), f"{repo_filenames=}"
    assert not any(
        str(filename_to_change) in k for k in repo_filenames
    ), f"{repo_filenames=}"

View File

@ -7,19 +7,21 @@ import torch.nn as nn
from torch.nn import functional as F
from kernels import (
CUDAProperties,
Device,
LayerRepository,
LocalLayerRepository,
Mode,
kernelize,
register_kernel_mapping,
use_kernel_forward_from_hub,
use_kernel_mapping,
)
from kernels.layer import (
_KERNEL_MAPPING,
CUDAProperties,
_validate_layer,
use_kernel_mapping,
)
from kernels.utils import install_kernel
kernel_layer_mapping = {
"SiluAndMul": {
@ -32,7 +34,11 @@ kernel_layer_mapping = {
"cuda": LayerRepository(
repo_id="kernels-test/op-without-fake-test",
layer_name="SiluAndMul",
)
),
"rocm": LayerRepository(
repo_id="kernels-test/op-without-fake-test",
layer_name="SiluAndMul",
),
},
"SiluAndMulStringDevice": {
"cuda": LayerRepository(
@ -102,29 +108,74 @@ def test_arg_kinds():
assert arg_kind("foo", "bar", kwarg1="baz", kwarg2=5) == ("foo", "bar", "baz", 5)
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulStringDevice])
@pytest.mark.parametrize("device", ["cuda", "cpu"])
def test_hub_forward(cls, device):
def test_hub_forward(cls):
torch.random.manual_seed(0)
silu_and_mul = SiluAndMul()
X = torch.randn((32, 64), device=device)
X = torch.randn((32, 64), device="cuda")
Y = silu_and_mul(X)
silu_and_mul_with_kernel = kernelize(cls(), device=device, mode=Mode.INFERENCE)
silu_and_mul_with_kernel = kernelize(cls(), device="cuda", mode=Mode.INFERENCE)
Y_kernel = silu_and_mul_with_kernel(X)
torch.testing.assert_close(Y_kernel, Y)
assert silu_and_mul.n_calls == 1
if device == "cuda":
assert silu_and_mul_with_kernel.n_calls == 0
else:
assert silu_and_mul_with_kernel.n_calls == 1
assert silu_and_mul_with_kernel.n_calls == 0
@pytest.mark.linux_only
@pytest.mark.rocm_only
def test_hub_forward_rocm():
torch.manual_seed(0)
silu_and_mul = SiluAndMul()
X = torch.randn((32, 64))
Y = silu_and_mul(X)
silu_and_mul_with_kernel = kernelize(
SiluAndMulNoCompileKernel(), device="rocm", mode=Mode.INFERENCE
)
Y_kernel = silu_and_mul_with_kernel(X)
torch.testing.assert_close(Y_kernel, Y)
assert silu_and_mul.n_calls == 1
# The kernel is used (n_calls == 0) if a ROCm build of the test kernel is
# available; otherwise the original forward runs as fallback (n_calls == 1).
assert silu_and_mul_with_kernel.n_calls in [0, 1]
def test_rocm_kernel_mapping():
"""Test that ROCm shorthand device mapping works correctly."""
kernel_layer_mapping = {
"SiluAndMul": {
"rocm": LayerRepository(
repo_id="kernels-community/activation",
layer_name="SiluAndMul",
)
}
}
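# "rocm" is the string shorthand for the ROCm device type; it is expected to
# be normalized the same way as an explicit Device(type="rocm") mapping key.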
# Test that the mapping is processed correctly
with use_kernel_mapping(kernel_layer_mapping, inherit_mapping=False):
mapping = _KERNEL_MAPPING.get()
# Verify the mapping exists
assert "SiluAndMul" in mapping
assert "rocm" in mapping["SiluAndMul"]
# Verify the repository is correctly stored
rocm_repos = mapping["SiluAndMul"]["rocm"]
assert rocm_repos is not None
assert (
rocm_repos.repos[Mode.FALLBACK]._repo_id == "kernels-community/activation"
)
assert rocm_repos.repos[Mode.FALLBACK].layer_name == "SiluAndMul"
@pytest.mark.cuda_only
def test_capability():
linear = TorchLinearWithCounter(32, 32).to("cuda")
with use_kernel_mapping(
@ -183,7 +234,33 @@ def test_layer_fallback_works():
kernelize(silu_and_mul, device="cuda", mode=Mode.INFERENCE)
@pytest.mark.linux_only
def test_local_layer_repo():
# Fetch a kernel to the local cache.
package_name, path = install_kernel("kernels-test/backward-marker-test", "main")
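# install_kernel is assumed to return (package_name, variant_path) with
# variant_path shaped like <repo_root>/build/<variant>, so path.parent.parent
# below resolves to the directory that contains `build/`.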
linear = TorchLinearWithCounter(32, 32).to("cuda")
with use_kernel_mapping(
{
"Linear": {
"cuda": LocalLayerRepository(
# install_kernel will give the fully-resolved path.
repo_path=path.parent.parent,
package_name=package_name,
layer_name="LinearBackward",
)
}
},
inherit_mapping=False,
):
kernelize(linear, mode=Mode.INFERENCE)
X = torch.randn(10, 32, device="cuda")
linear(X)
assert linear.n_calls == 0
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulNoCompileKernel])
@pytest.mark.parametrize("device", ["cuda"])
def test_torch_compile_layer_without_fallback(cls, device):
@ -214,7 +291,7 @@ def test_torch_compile_layer_without_fallback(cls, device):
torch.testing.assert_close(Y_compiled, Y)
@pytest.mark.linux_only
@pytest.mark.cuda_only
@pytest.mark.parametrize("cls", [SiluAndMulWithKernel, SiluAndMulNoCompileKernel])
@pytest.mark.parametrize("device", ["cuda"])
def test_torch_compile_layer_with_fallback(cls, device):
@ -237,8 +314,11 @@ def test_torch_compile_layer_with_fallback(cls, device):
torch.testing.assert_close(Y_compiled, Y)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_mapping_contexts():
# Make sure we start from scratch.
register_kernel_mapping(kernel_layer_mapping, inherit_mapping=False)
assert set(_KERNEL_MAPPING.get().keys()) == {
"SiluAndMul",
"SiluAndMulStringDevice",
@ -281,7 +361,9 @@ def test_mapping_contexts():
"TestKernel",
}
assert (
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"].repos[Mode.FALLBACK].repo_id
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"]
.repos[Mode.FALLBACK]
._repo_id
== "kernels-community/non-existing"
)
@ -292,7 +374,7 @@ def test_mapping_contexts():
"TestKernel",
}
assert (
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"].repos[Mode.FALLBACK].repo_id
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"].repos[Mode.FALLBACK]._repo_id
== "kernels-community/activation"
)
@ -301,7 +383,9 @@ def test_mapping_contexts():
"SiluAndMul",
}
assert (
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"].repos[Mode.FALLBACK].repo_id
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"]
.repos[Mode.FALLBACK]
._repo_id
== "kernels-community/non-existing"
)
@ -312,7 +396,7 @@ def test_mapping_contexts():
"TestKernel",
}
assert (
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"].repos[Mode.FALLBACK].repo_id
_KERNEL_MAPPING.get()["SiluAndMul"]["cuda"].repos[Mode.FALLBACK]._repo_id
== "kernels-community/activation"
)
@ -351,7 +435,7 @@ def test_validate_kernel_layer():
_validate_layer(cls=BadLayer4, check_cls=SiluAndMul)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_invalid_mode_for_mapping_rejected():
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -371,7 +455,7 @@ def test_invalid_mode_for_mapping_rejected():
kernelize(linear, mode=Mode.TRAINING)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes():
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -400,11 +484,6 @@ def test_kernel_modes():
linear(X)
assert linear.n_calls == 0
# Same as previous, since TRAINING | TORCH_COMPILE is the default.
kernelize(linear)
linear(X)
assert linear.n_calls == 0
# Case 2: register a kernel just for training. If no base kernel
# layer is registered, we fall back to the original layer.
with use_kernel_mapping(
@ -434,12 +513,6 @@ def test_kernel_modes():
# TRAINING | TORCH_COMPILE cannot fall back to TRAINING kernel, so uses original.
assert linear.n_calls == 1
# Same as previous, since TRAINING | TORCH_COMPILE is the default.
kernelize(linear)
linear(X)
# TRAINING | TORCH_COMPILE cannot fall back to TRAINING kernel, so uses original.
assert linear.n_calls == 2
# Case 3: register a kernel just for training and one for fallback.
with use_kernel_mapping(
{
@ -461,23 +534,17 @@ def test_kernel_modes():
X = torch.randn(10, 32, device="cuda")
linear(X)
# Falls back to TRAINING.
assert linear.n_calls == 2
assert linear.n_calls == 1
kernelize(linear, mode=Mode.TRAINING)
linear(X)
# Falls back to the TRAINING kernel.
assert linear.n_calls == 2
assert linear.n_calls == 1
kernelize(linear, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
linear(X)
# TRAINING | TORCH_COMPILE falls back to FALLBACK kernel.
assert linear.n_calls == 2
# Same as previous, since TRAINING | TORCH_COMPILE is the default.
kernelize(linear)
linear(X)
# TRAINING | TORCH_COMPILE falls back to FALLBACK kernel.
assert linear.n_calls == 2
assert linear.n_calls == 1
# Case 4: register a kernel with two preferences.
with use_kernel_mapping(
@ -497,25 +564,20 @@ def test_kernel_modes():
X = torch.randn(10, 32, device="cuda")
linear(X)
# Falls back to the TRAINING | TORCH_COMPILE kernel.
assert linear.n_calls == 2
assert linear.n_calls == 1
kernelize(linear, mode=Mode.TRAINING)
linear(X)
# TRAINING can fall back to TRAINING | TORCH_COMPILE kernel.
assert linear.n_calls == 2
assert linear.n_calls == 1
kernelize(linear, mode=Mode.TRAINING | Mode.TORCH_COMPILE)
linear(X)
# Uses TRAINING | TORCH_COMPILE kernel.
assert linear.n_calls == 2
kernelize(linear)
linear(X)
# Same as previous, since TRAINING | TORCH_COMPILE is the default.
assert linear.n_calls == 2
assert linear.n_calls == 1
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_fallback_used_when_training():
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -580,7 +642,7 @@ def test_invalid_mode_rejected():
kernelize(torch.nn.Linear(32, 32), mode=Mode.TORCH_COMPILE)
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes_inference():
"""Test inference-specific fallback scenarios."""
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -677,7 +739,7 @@ def test_kernel_modes_inference():
assert linear.n_calls == 4
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes_mixed():
"""Test mixed training and inference kernel scenarios."""
linear = TorchLinearWithCounter(32, 32).to("cuda")
@ -767,7 +829,7 @@ def test_kernel_modes_mixed():
assert linear.n_calls == 2
@pytest.mark.linux_only
@pytest.mark.cuda_only
def test_kernel_modes_cross_fallback():
"""Test cross-mode fallback scenarios from inference to training modes."""
linear = TorchLinearWithCounter(32, 32).to("cuda")