mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[nativert] libtorch kernel registry (#157150)
Summary: as titled. Test Plan: CI. Rollback Plan: none. Differential Revision: D77451703. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157150. Approved by: https://github.com/georgiaphillips, https://github.com/henryoier
This commit is contained in:
@ -625,6 +625,10 @@ libtorch_nativert_sources = [
|
||||
"torch/nativert/executor/memory/AliasAnalyzer.cpp",
|
||||
"torch/nativert/executor/memory/LayoutPlanner.cpp",
|
||||
"torch/nativert/executor/memory/LayoutManager.cpp",
|
||||
"torch/nativert/kernels/KernelRegistry.cpp",
|
||||
"torch/nativert/kernels/NativeKernels.cpp",
|
||||
"torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp",
|
||||
"torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp",
|
||||
]
|
||||
|
||||
torch_mobile_tracer_sources = [
|
||||
|
@ -64,6 +64,10 @@ class C10_API SizesAndStrides {
|
||||
storageBytes(size_)));
|
||||
}
|
||||
|
||||
// Inequality, defined as the negation of operator==.
bool operator!=(const SizesAndStrides& other) const {
  const bool equal = (*this == other);
  return !equal;
}
|
||||
|
||||
SizesAndStrides& operator=(const SizesAndStrides& rhs) {
|
||||
if (this == &rhs) {
|
||||
return *this;
|
||||
|
1380
torch/nativert/kernels/KernelRegistry.cpp
Normal file
1380
torch/nativert/kernels/KernelRegistry.cpp
Normal file
File diff suppressed because it is too large
Load Diff
122
torch/nativert/kernels/KernelRegistry.h
Normal file
122
torch/nativert/kernels/KernelRegistry.h
Normal file
@ -0,0 +1,122 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/nativert/executor/OpKernel.h>
|
||||
#include <torch/nativert/graph/Graph.h>
|
||||
#include <torch/nativert/kernels/PrimKernelRegistry.h>
|
||||
|
||||
namespace torch::nativert {
|
||||
|
||||
// Registry of statically-dispatched CPU kernels, keyed by op name.
// Each registered factory takes the graph Node plus the target device and
// produces an OpKernel (see the REGISTER_*_CPU_KERNEL macros below).
TORCH_DECLARE_REGISTRY(
    StaticallyDispatchedCPUKernelRegistry,
    OpKernel,
    const Node*,
    c10::Device);
|
||||
|
||||
// Defines a statically-dispatched CPU kernel class OpKernel_<id> for op
// `name` and registers it in StaticallyDispatchedCPUKernelRegistry.
// The trailing __VA_ARGS__ becomes the body of computeInternal().
#define REGISTER_CPU_KERNEL(name, id, ...)                                \
  class OpKernel_##id : public C10Kernel {                                \
   public:                                                                \
    OpKernel_##id(const Node* node, c10::Device device)                   \
        : C10Kernel(                                                      \
              node,                                                       \
              device,                                                     \
              torch::nativert::OpKernelKind::kStaticDispatchKernel) {}    \
    void computeInternal(torch::nativert::ExecutionFrame& executionFrame) \
        const override final {                                            \
      __VA_ARGS__;                                                        \
    }                                                                     \
  };                                                                      \
  C10_REGISTER_TYPED_CLASS(                                               \
      StaticallyDispatchedCPUKernelRegistry, name, OpKernel_##id)
|
||||
|
||||
// Pass-through wrapper so an aliasing spec containing commas can be handed
// to REGISTER_ALIASING_CPU_KERNEL as a single macro argument.
#define ALIASING_SPEC(...) __VA_ARGS__
|
||||
|
||||
// Like REGISTER_CPU_KERNEL, but for statically-dispatched CPU kernels whose
// outputs may alias their inputs, as described by `aliasing_spec` (wrap the
// spec in ALIASING_SPEC so embedded commas survive macro expansion).
// Fix: tag these kernels kStaticDispatchKernel, matching REGISTER_CPU_KERNEL.
// The previous kNativeStaticDispatchKernel tag misclassified them — it made
// this macro's kernels indistinguishable in kind from those produced by
// REGISTER_NATIVE_CPU_KERNEL even though they register a C10 static-dispatch
// kernel with an explicit aliasing spec.
#define REGISTER_ALIASING_CPU_KERNEL(name, id, aliasing_spec, ...)        \
  class OpKernel_##id : public C10Kernel {                                \
   public:                                                                \
    OpKernel_##id(const Node* node, c10::Device device)                   \
        : C10Kernel(                                                      \
              node,                                                       \
              device,                                                     \
              torch::nativert::OpKernelKind::kStaticDispatchKernel,       \
              aliasing_spec) {}                                           \
    void computeInternal(torch::nativert::ExecutionFrame& executionFrame) \
        const override final {                                            \
      __VA_ARGS__;                                                        \
    }                                                                     \
  };                                                                      \
  C10_REGISTER_TYPED_CLASS(                                               \
      StaticallyDispatchedCPUKernelRegistry, name, OpKernel_##id)
|
||||
|
||||
// Defines and registers a CPU kernel class OpKernel_<id> for op `name`,
// tagged kNativeStaticDispatchKernel (the kernel bodies in
// NativeKernels.cpp call at::native::* functions directly).
// The trailing __VA_ARGS__ becomes the body of computeInternal().
#define REGISTER_NATIVE_CPU_KERNEL(name, id, ...)                         \
  class OpKernel_##id : public C10Kernel {                                \
   public:                                                                \
    OpKernel_##id(const Node* node, c10::Device device)                   \
        : C10Kernel(                                                      \
              node,                                                       \
              device,                                                     \
              torch::nativert::OpKernelKind::kNativeStaticDispatchKernel) {} \
    void computeInternal(torch::nativert::ExecutionFrame& executionFrame) \
        const override final {                                            \
      __VA_ARGS__;                                                        \
    }                                                                     \
  };                                                                      \
  C10_REGISTER_TYPED_CLASS(                                               \
      StaticallyDispatchedCPUKernelRegistry, name, OpKernel_##id)
|
||||
|
||||
// 0-element CPU tensor inheriting t's dtype, layout, and device; the last
// two empty_cpu arguments are left defaulted (std::nullopt).
inline at::Tensor create_empty_from(const at::Tensor& t) {
  const auto scalar_type = c10::typeMetaToScalarType(t.dtype());
  return at::detail::empty_cpu(
      {0}, scalar_type, t.layout(), t.device(), std::nullopt, std::nullopt);
}
|
||||
|
||||
// 0-element CPU tensor with the given dtype; layout/device copied from t.
inline at::Tensor create_empty_from(
    const at::Tensor& t,
    c10::ScalarType dtype) {
  return at::detail::empty_cpu(
      {0},
      dtype,
      t.layout(),
      t.device(),
      std::nullopt,
      std::nullopt);
}
|
||||
|
||||
// 0-element tensor targeted at `device`; dtype/layout copied from t.
// NOTE(review): this still routes through at::detail::empty_cpu regardless of
// `device` — presumably callers only pass CPU devices here; confirm.
inline at::Tensor create_empty_from(const at::Tensor& t, c10::Device device) {
  const auto scalar_type = c10::typeMetaToScalarType(t.dtype());
  return at::detail::empty_cpu(
      {0}, scalar_type, t.layout(), device, std::nullopt, std::nullopt);
}
|
||||
// 0-element CPU tensor with the given layout; dtype/device copied from t.
inline at::Tensor create_empty_from(const at::Tensor& t, c10::Layout layout) {
  const auto scalar_type = c10::typeMetaToScalarType(t.dtype());
  return at::detail::empty_cpu(
      {0}, scalar_type, layout, t.device(), std::nullopt, std::nullopt);
}
|
||||
|
||||
// 0-element CPU tensor with the given memory format; dtype/layout/device
// copied from t.
inline at::Tensor create_empty_from(
    const at::Tensor& t,
    c10::MemoryFormat memory_format) {
  const auto scalar_type = c10::typeMetaToScalarType(t.dtype());
  return at::detail::empty_cpu(
      {0}, scalar_type, t.layout(), t.device(), std::nullopt, memory_format);
}
|
||||
|
||||
// 0-element CPU tensor with the given dtype and memory format; layout/device
// copied from t.
inline at::Tensor create_empty_from(
    const at::Tensor& t,
    c10::ScalarType dtype,
    c10::MemoryFormat memory_format) {
  return at::detail::empty_cpu(
      {0},
      dtype,
      t.layout(),
      t.device(),
      std::nullopt,
      memory_format);
}
|
||||
|
||||
} // namespace torch::nativert
|
113
torch/nativert/kernels/NativeKernels.cpp
Normal file
113
torch/nativert/kernels/NativeKernels.cpp
Normal file
@ -0,0 +1,113 @@
|
||||
#include <torch/nativert/kernels/KernelRegistry.h>
|
||||
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#include <ATen/native/IndexingUtils.h>
|
||||
#include <ATen/native/NonSymbolicBC.h>
|
||||
|
||||
namespace torch::nativert {
|
||||
|
||||
// aten.slice.Tensor: dispatches straight to at::native::slice with the
// dim/start/end/step taken from the node inputs.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.slice.Tensor", aten_slice_Tensor, {
  const auto& input = KernelInput(0).toTensor();
  const auto slice_dim = KernelInput(1).toInt();
  const auto slice_start = KernelInput(2).toOptional<int64_t>();
  const auto slice_end = KernelInput(3).toOptional<int64_t>();
  const auto slice_step = KernelInput(4).toInt();
  KernelOutput(0) =
      at::native::slice(input, slice_dim, slice_start, slice_end, slice_step);
});
|
||||
|
||||
// aten.sym_size.int: returns the (symbolic) size of `self` along `dim`.
// Fix: accept Python-style negative dims, matching ATen size()/sym_size()
// semantics; the old TORCH_CHECK(dim >= 0 && ...) rejected e.g. dim == -1.
// maybe_wrap_dim both wraps negatives and range-checks, so invalid dims
// still throw.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.sym_size.int", aten_sym_size_int, {
  const auto& self = KernelInput(0).toTensor();
  const auto dim = KernelInput(1).toInt();
  auto& out = KernelOutput(0);
  const auto wrapped_dim = at::maybe_wrap_dim(dim, self.dim());
  out = self.sym_size(wrapped_dim);
});
|
||||
|
||||
// aten.reshape.default: dispatches to at::native::reshape.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.reshape.default", aten_reshape, {
  const auto& input = KernelInput(0).toTensor();
  const auto target_shape = KernelInput(1).toIntVector();
  KernelOutput(0) = at::native::reshape(input, target_shape);
});
|
||||
|
||||
// aten.view.default: dispatches to at::native::view.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.view.default", aten_view, {
  const auto& input = KernelInput(0).toTensor();
  const auto target_size = KernelInput(1).toIntVector();
  KernelOutput(0) = at::native::view(input, target_size);
});
|
||||
|
||||
// aten.permute.default: dispatches to at::native::permute.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.permute.default", aten_permute, {
  const auto& input = KernelInput(0).toTensor();
  const auto perm = KernelInput(1).toDimVector();
  KernelOutput(0) = at::native::permute(input, perm);
});
|
||||
|
||||
// aten.select.int: dispatches to at::native::select.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.select.int", aten_select, {
  const auto& input = KernelInput(0).toTensor();
  const auto select_dim = KernelInput(1).toInt();
  const auto select_index = KernelInput(2).toInt();
  KernelOutput(0) = at::native::select(input, select_dim, select_index);
});
|
||||
|
||||
// aten.split.Tensor: dispatches to at::native::split (fixed chunk size).
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.split.Tensor", aten_split_Tensor, {
  const auto& input = KernelInput(0).toTensor();
  const auto chunk_size = KernelInput(1).toInt();
  const auto split_dim = KernelInput(2).toInt();
  KernelOutput(0) = at::native::split(input, chunk_size, split_dim);
});
|
||||
|
||||
// aten.split_with_sizes.default: dispatches to at::native::split_with_sizes.
REGISTER_NATIVE_CPU_KERNEL(
    "torch.ops.aten.split_with_sizes.default",
    aten_split_with_sizes,
    {
      const auto& input = KernelInput(0).toTensor();
      const auto sizes_list = KernelInput(1).toIntList();
      const auto split_dim = KernelInput(2).toInt();
      // Materialize the c10::List as a std::vector for the ATen call.
      KernelOutput(0) =
          at::native::split_with_sizes(input, sizes_list.vec(), split_dim);
    });
|
||||
|
||||
// aten.tensor_split.sections: dispatches to
// at::native::tensor_split_sections_symint.
REGISTER_NATIVE_CPU_KERNEL(
    "torch.ops.aten.tensor_split.sections",
    aten_tensor_split_sections,
    {
      const auto& input = KernelInput(0).toTensor();
      const auto num_sections = KernelInput(1).toInt();
      const auto split_dim = KernelInput(2).toInt();
      KernelOutput(0) = at::native::tensor_split_sections_symint(
          input, num_sections, split_dim);
    });
|
||||
|
||||
// aten.item.default: dispatches to at::native::item.
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.item.default", aten_item, {
  const auto& input = KernelInput(0).toTensor();
  KernelOutput(0) = at::native::item(input);
});
|
||||
|
||||
// aten.narrow.default: returns a length-`length` slice of `self` along
// `dim`, starting at `start` — implemented as slice(dim, start, start+length).
REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.narrow.default", aten_narrow, {
  const auto& self = KernelInput(0).toTensor();
  const auto dim = KernelInput(1).toInt();
  // `start` may arrive as either a scalar or a tensor; normalize to int64_t.
  int64_t start = 0;
  if (KernelInput(2).isScalar()) {
    start = KernelInput(2).toInt();
  } else {
    auto& t = KernelInput(2).toTensor();
    start = t.item<int64_t>();
  }
  const auto length = KernelInput(3).toInt();
  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
  // NOTE(review): `dim` is used to index sizes() without wrapping —
  // presumably the graph only supplies non-negative dims here; confirm.
  auto cur_size = self.sizes()[dim];
  // Wrap a negative `start` into range (maybe_wrap_dim also range-checks).
  // NOTE(review): the `start != cur_size` conjunct looks redundant — a
  // negative `start` can never equal a non-negative dimension size.
  if (start != cur_size && start < 0) {
    start = at::maybe_wrap_dim(start, cur_size);
  }
  TORCH_CHECK(
      length >= 0 && start <= cur_size - length,
      "start (",
      start,
      ") + length (",
      length,
      ") exceeds dimension size (",
      cur_size,
      ").");
  KernelOutput(0) = at::native::slice(self, dim, start, start + length, 1);
});
|
||||
|
||||
} // namespace torch::nativert
|
@ -57,7 +57,7 @@ class OpKernel_prim_listpack : public OpKernel {
|
||||
C10_REGISTER_TYPED_CLASS(
|
||||
PrimKernelRegistry,
|
||||
"prim.ListPack",
|
||||
OpKernel_prim_listpack);
|
||||
OpKernel_prim_listpack)
|
||||
|
||||
REGISTER_PRIM_KERNEL("prim.ListUnpack", prim_listunpack, {
|
||||
RECORD_USER_SCOPE("nativert::OpKernel_prim_listunpack");
|
||||
@ -114,7 +114,7 @@ class OpKernel_variadic_concat : public OpKernel {
|
||||
C10_REGISTER_TYPED_CLASS(
|
||||
PrimKernelRegistry,
|
||||
"prim.VarConcat",
|
||||
OpKernel_variadic_concat);
|
||||
OpKernel_variadic_concat)
|
||||
|
||||
namespace {
|
||||
|
||||
@ -158,6 +158,6 @@ class OpKernel_variadic_stack : public OpKernel {
|
||||
C10_REGISTER_TYPED_CLASS(
|
||||
PrimKernelRegistry,
|
||||
"prim.VarStack",
|
||||
OpKernel_variadic_stack);
|
||||
OpKernel_variadic_stack)
|
||||
|
||||
} // namespace torch::nativert
|
||||
|
Reference in New Issue
Block a user