	[nativert] libtorch kernel registry (#157150)
Summary: att

Test Plan: ci

Rollback Plan:

Differential Revision: D77451703

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157150
Approved by: https://github.com/georgiaphillips, https://github.com/henryoier
build_variables.bzl

@@ -625,6 +625,10 @@ libtorch_nativert_sources = [
     "torch/nativert/executor/memory/AliasAnalyzer.cpp",
     "torch/nativert/executor/memory/LayoutPlanner.cpp",
     "torch/nativert/executor/memory/LayoutManager.cpp",
+    "torch/nativert/kernels/KernelRegistry.cpp",
+    "torch/nativert/kernels/NativeKernels.cpp",
+    "torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp",
+    "torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp",
 ]
 
 torch_mobile_tracer_sources = [
c10/core/impl/SizesAndStrides.h

@@ -64,6 +64,10 @@ class C10_API SizesAndStrides {
                   storageBytes(size_)));
   }
 
+  bool operator!=(const SizesAndStrides& other) const {
+    return !(*this == other);
+  }
+
   SizesAndStrides& operator=(const SizesAndStrides& rhs) {
     if (this == &rhs) {
       return *this;
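The new operator!= just negates the existing operator==, giving callers a direct inequality check on a tensor's size/stride layout. A minimal usage sketch (illustrative only; the shapes and helper function are made up, not from the patch):

#include <c10/core/impl/SizesAndStrides.h>

// Illustrative only: the added operator!= delegates to !(a == b).
bool layouts_differ() {
  c10::impl::SizesAndStrides a;
  c10::impl::SizesAndStrides b;
  a.set_sizes({2, 3});  // example shapes for the sketch
  b.set_sizes({2, 4});
  return a != b;        // true: sizes differ
}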
							
								
								
									
torch/nativert/kernels/KernelRegistry.cpp  (new file, 1380 lines)
File diff suppressed because it is too large
							
								
								
									
torch/nativert/kernels/KernelRegistry.h  (new file, 122 lines)

@@ -0,0 +1,122 @@
+#pragma once
+
+#include <torch/nativert/executor/OpKernel.h>
+#include <torch/nativert/graph/Graph.h>
+#include <torch/nativert/kernels/PrimKernelRegistry.h>
+
+namespace torch::nativert {
+
+TORCH_DECLARE_REGISTRY(
+    StaticallyDispatchedCPUKernelRegistry,
+    OpKernel,
+    const Node*,
+    c10::Device);
+
+#define REGISTER_CPU_KERNEL(name, id, ...)                                \
+  class OpKernel_##id : public C10Kernel {                                \
+   public:                                                                \
+    OpKernel_##id(const Node* node, c10::Device device)                   \
+        : C10Kernel(                                                      \
+              node,                                                       \
+              device,                                                     \
+              torch::nativert::OpKernelKind::kStaticDispatchKernel) {}    \
+    void computeInternal(torch::nativert::ExecutionFrame& executionFrame) \
+        const override final {                                            \
+      __VA_ARGS__;                                                        \
+    }                                                                     \
+  };                                                                      \
+  C10_REGISTER_TYPED_CLASS(                                               \
+      StaticallyDispatchedCPUKernelRegistry, name, OpKernel_##id)
+
+#define ALIASING_SPEC(...) __VA_ARGS__
+
+#define REGISTER_ALIASING_CPU_KERNEL(name, id, aliasing_spec, ...)        \
+  class OpKernel_##id : public C10Kernel {                                \
+   public:                                                                \
+    OpKernel_##id(const Node* node, c10::Device device)                   \
+        : C10Kernel(                                                      \
+              node,                                                       \
+              device,                                                     \
+              torch::nativert::OpKernelKind::kNativeStaticDispatchKernel, \
+              aliasing_spec) {}                                           \
+    void computeInternal(torch::nativert::ExecutionFrame& executionFrame) \
+        const override final {                                            \
+      __VA_ARGS__;                                                        \
+    }                                                                     \
+  };                                                                      \
+  C10_REGISTER_TYPED_CLASS(                                               \
+      StaticallyDispatchedCPUKernelRegistry, name, OpKernel_##id)
+
+#define REGISTER_NATIVE_CPU_KERNEL(name, id, ...)                            \
+  class OpKernel_##id : public C10Kernel {                                   \
+   public:                                                                   \
+    OpKernel_##id(const Node* node, c10::Device device)                      \
+        : C10Kernel(                                                         \
+              node,                                                          \
+              device,                                                        \
+              torch::nativert::OpKernelKind::kNativeStaticDispatchKernel) {} \
+    void computeInternal(torch::nativert::ExecutionFrame& executionFrame)    \
+        const override final {                                               \
+      __VA_ARGS__;                                                           \
+    }                                                                        \
+  };                                                                         \
+  C10_REGISTER_TYPED_CLASS(                                                  \
+      StaticallyDispatchedCPUKernelRegistry, name, OpKernel_##id)
+
+inline at::Tensor create_empty_from(const at::Tensor& t) {
+  return at::detail::empty_cpu(
+      {0},
+      c10::typeMetaToScalarType(t.dtype()),
+      t.layout(),
+      t.device(),
+      std::nullopt,
+      std::nullopt);
+}
+
+inline at::Tensor create_empty_from(
+    const at::Tensor& t,
+    c10::ScalarType dtype) {
+  return at::detail::empty_cpu(
+      {0}, dtype, t.layout(), t.device(), std::nullopt, std::nullopt);
+}
+
+inline at::Tensor create_empty_from(const at::Tensor& t, c10::Device device) {
+  return at::detail::empty_cpu(
+      {0},
+      c10::typeMetaToScalarType(t.dtype()),
+      t.layout(),
+      device,
+      std::nullopt,
+      std::nullopt);
+}
+inline at::Tensor create_empty_from(const at::Tensor& t, c10::Layout layout) {
+  return at::detail::empty_cpu(
+      {0},
+      c10::typeMetaToScalarType(t.dtype()),
+      layout,
+      t.device(),
+      std::nullopt,
+      std::nullopt);
+}
+
+inline at::Tensor create_empty_from(
+    const at::Tensor& t,
+    c10::MemoryFormat memory_format) {
+  return at::detail::empty_cpu(
+      {0},
+      c10::typeMetaToScalarType(t.dtype()),
+      t.layout(),
+      t.device(),
+      std::nullopt,
+      memory_format);
+}
+
+inline at::Tensor create_empty_from(
+    const at::Tensor& t,
+    c10::ScalarType dtype,
+    c10::MemoryFormat memory_format) {
+  return at::detail::empty_cpu(
+      {0}, dtype, t.layout(), t.device(), std::nullopt, memory_format);
+}
+
+} // namespace torch::nativert
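Each of these macros stamps out an OpKernel_<id> subclass of C10Kernel whose computeInternal body is the macro's trailing argument, then registers it under the op's string name in StaticallyDispatchedCPUKernelRegistry. The 1380-line KernelRegistry.cpp diff is suppressed above, so the following is only a sketch of what one registration there might look like, modeled on the REGISTER_NATIVE_CPU_KERNEL uses in NativeKernels.cpp below; the chosen op (aten.abs) and the kernel body are illustrative assumptions, not lines from the suppressed file.

// Hypothetical registration (not taken from the suppressed KernelRegistry.cpp):
// allocate the output lazily with create_empty_from(), then call the ATen
// out-variant so repeated executions can reuse the same output storage.
REGISTER_CPU_KERNEL("torch.ops.aten.abs.default", aten_abs, {
  const auto& self = KernelInput(0).toTensor();
  if (KernelOutput(0).isNone()) {
    KernelOutput(0) = create_empty_from(self);  // empty CPU tensor with self's dtype/layout
  }
  auto& out = KernelOutput(0).toTensor();
  at::abs_out(out, self);  // write the result into the preallocated output
});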
							
								
								
									
torch/nativert/kernels/NativeKernels.cpp  (new file, 113 lines)

@@ -0,0 +1,113 @@
+#include <torch/nativert/kernels/KernelRegistry.h>
+
+#include <ATen/NativeFunctions.h>
+#include <ATen/native/IndexingUtils.h>
+#include <ATen/native/NonSymbolicBC.h>
+
+namespace torch::nativert {
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.slice.Tensor", aten_slice_Tensor, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto& dim = KernelInput(1).toInt();
+  const auto& start = KernelInput(2).toOptional<int64_t>();
+  const auto& end = KernelInput(3).toOptional<int64_t>();
+  const auto& step = KernelInput(4).toInt();
+  KernelOutput(0) = at::native::slice(self, dim, start, end, step);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.sym_size.int", aten_sym_size_int, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto dim = KernelInput(1).toInt();
+  auto& out = KernelOutput(0);
+  TORCH_CHECK(dim >= 0 && dim < self.dim(), "Invalid dimension");
+  out = self.sym_size(dim);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.reshape.default", aten_reshape, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto& shape = KernelInput(1).toIntVector();
+  KernelOutput(0) = at::native::reshape(self, shape);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.view.default", aten_view, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto& size = KernelInput(1).toIntVector();
+  KernelOutput(0) = at::native::view(self, size);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.permute.default", aten_permute, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto& dims = KernelInput(1).toDimVector();
+  KernelOutput(0) = at::native::permute(self, dims);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.select.int", aten_select, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto dim = KernelInput(1).toInt();
+  const auto index = KernelInput(2).toInt();
+  KernelOutput(0) = at::native::select(self, dim, index);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.split.Tensor", aten_split_Tensor, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto split_size = KernelInput(1).toInt();
+  const auto dim = KernelInput(2).toInt();
+  KernelOutput(0) = at::native::split(self, split_size, dim);
+});
+
+REGISTER_NATIVE_CPU_KERNEL(
+    "torch.ops.aten.split_with_sizes.default",
+    aten_split_with_sizes,
+    {
+      const auto& self = KernelInput(0).toTensor();
+      const auto& split_sizes = KernelInput(1).toIntList();
+      const auto dim = KernelInput(2).toInt();
+      KernelOutput(0) =
+          at::native::split_with_sizes(self, split_sizes.vec(), dim);
+    });
+
+REGISTER_NATIVE_CPU_KERNEL(
+    "torch.ops.aten.tensor_split.sections",
+    aten_tensor_split_sections,
+    {
+      const auto& self = KernelInput(0).toTensor();
+      const auto sections = KernelInput(1).toInt();
+      const auto dim = KernelInput(2).toInt();
+      KernelOutput(0) =
+          at::native::tensor_split_sections_symint(self, sections, dim);
+    });
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.item.default", aten_item, {
+  const auto& self = KernelInput(0).toTensor();
+  KernelOutput(0) = at::native::item(self);
+});
+
+REGISTER_NATIVE_CPU_KERNEL("torch.ops.aten.narrow.default", aten_narrow, {
+  const auto& self = KernelInput(0).toTensor();
+  const auto dim = KernelInput(1).toInt();
+  int64_t start = 0;
+  if (KernelInput(2).isScalar()) {
+    start = KernelInput(2).toInt();
+  } else {
+    auto& t = KernelInput(2).toTensor();
+    start = t.item<int64_t>();
+  }
+  const auto length = KernelInput(3).toInt();
+  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  auto cur_size = self.sizes()[dim];
+  if (start != cur_size && start < 0) {
+    start = at::maybe_wrap_dim(start, cur_size);
+  }
+  TORCH_CHECK(
+      length >= 0 && start <= cur_size - length,
+      "start (",
+      start,
+      ") + length (",
+      length,
+      ") exceeds dimension size (",
+      cur_size,
+      ").");
+  KernelOutput(0) = at::native::slice(self, dim, start, start + length, 1);
+});
+
+} // namespace torch::nativert
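On the consuming side, the executor can then ask the registry for a kernel by op name. That call site is not part of this diff; the sketch below assumes the standard c10 registry interface, in which TORCH_DECLARE_REGISTRY generates a StaticallyDispatchedCPUKernelRegistry() accessor whose Create(key, args...) returns a std::unique_ptr<OpKernel>, and assumes node->target() yields the registered op-name string (the helper name itself is made up).

// Sketch only: instantiate a statically dispatched CPU kernel for a graph node.
std::unique_ptr<OpKernel> makeStaticCpuKernel(const Node* node, c10::Device device) {
  return StaticallyDispatchedCPUKernelRegistry()->Create(
      std::string(node->target()), node, device);
}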
torch/nativert/kernels/PrimKernelRegistry.cpp

@@ -57,7 +57,7 @@ class OpKernel_prim_listpack : public OpKernel {
 C10_REGISTER_TYPED_CLASS(
     PrimKernelRegistry,
     "prim.ListPack",
-    OpKernel_prim_listpack);
+    OpKernel_prim_listpack)
 
 REGISTER_PRIM_KERNEL("prim.ListUnpack", prim_listunpack, {
   RECORD_USER_SCOPE("nativert::OpKernel_prim_listunpack");
@@ -114,7 +114,7 @@ class OpKernel_variadic_concat : public OpKernel {
 C10_REGISTER_TYPED_CLASS(
     PrimKernelRegistry,
     "prim.VarConcat",
-    OpKernel_variadic_concat);
+    OpKernel_variadic_concat)
 
 namespace {
 
@@ -158,6 +158,6 @@ class OpKernel_variadic_stack : public OpKernel {
 C10_REGISTER_TYPED_CLASS(
     PrimKernelRegistry,
     "prim.VarStack",
-    OpKernel_variadic_stack);
+    OpKernel_variadic_stack)
 
 } // namespace torch::nativert