Revert "[BE] Cleanup old ExecuTorch codegen and runtime code (#154165)"

This reverts commit 515c19a3856e953c0fe23a0ed4fa844f8eea34d8.

Reverted https://github.com/pytorch/pytorch/pull/154165 on behalf of https://github.com/seemethere due to This is failing when attempting to test against executorch main internally, author has acknowledged that this should be reverted ([comment](https://github.com/pytorch/pytorch/pull/154165#issuecomment-2931489616))
PyTorch MergeBot
2025-06-02 16:28:46 +00:00
parent 981bdb39ca
commit 67067512a1
43 changed files with 5091 additions and 1 deletion

@@ -1 +1 @@
22e7dbd922fbc3f2ae6e97be66e2329fab978619
b173722085b3f555d6ba4533d6bbaddfd7c71144

@@ -52,6 +52,12 @@ fi
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
# To build test_edge_op_registration
export BUILD_EXECUTORCH=ON
export USE_CUDA=0
fi
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
# intel cpu and later run tests on machines with amd cpu.

@@ -526,6 +526,7 @@ if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND)
endif()
option(TRACING_BASED
"Master flag to build Lite Interpreter with tracing build option" OFF)
option(BUILD_EXECUTORCH "Master flag to build Executorch" ON)
# This is a fix for a rare build issue on Ubuntu: symbol lookup error:
# miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol:
# mkl_blas_dsyrk

@@ -1319,6 +1319,12 @@ install(FILES
"${TORCH_SRC_DIR}/custom_class_detail.h"
DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch)
if(BUILD_TEST)
if(BUILD_EXECUTORCH)
add_subdirectory(
${TORCH_ROOT}/test/edge
${CMAKE_BINARY_DIR}/test_edge_op_registration
)
endif()
if(BUILD_LITE_INTERPRETER)
add_subdirectory(
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime

@@ -272,6 +272,10 @@ select = [
"F401",
"F403",
]
"torchgen/executorch/api/types/__init__.py" = [
"F401",
"F403",
]
"torch/utils/collect_env.py" = [
"UP", # collect_env.py needs to work with older versions of Python
]

@@ -699,6 +699,8 @@ class build_ext(setuptools.command.build_ext.build_ext):
)
if cmake_cache_vars["USE_LIGHTWEIGHT_DISPATCH"]:
report("-- Using lightweight dispatch")
if cmake_cache_vars["BUILD_EXECUTORCH"]:
report("-- Building Executorch")
if cmake_cache_vars["USE_ITT"]:
report("-- Using ITT")

test/edge/CMakeLists.txt Normal file
@@ -0,0 +1,74 @@
cmake_minimum_required(VERSION 3.15)
set(TORCH_ROOT ${CMAKE_CURRENT_LIST_DIR}/../..)
set(TEST_ROOT ${TORCH_ROOT}/test/edge)
set(OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/out)
file(GLOB_RECURSE all_python "${TORCH_ROOT}/torchgen/*.py")
include(${TORCH_ROOT}/cmake/public/utils.cmake)
append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
# Generate unboxing kernels
set(GEN_COMMAND
Python::Interpreter -m torchgen.gen_executorch
--source-path=${TEST_ROOT}
--install-dir=${OUTPUT_DIRECTORY}
--tags-path=${TORCH_ROOT}/aten/src/ATen/native/tags.yaml
--aten-yaml-path=${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml
--use-aten-lib
--op-selection-yaml-path=${TEST_ROOT}/selected_operators.yaml
--custom-ops-yaml-path=${TEST_ROOT}/custom_ops.yaml
)
set(GEN_COMMAND_sources
${OUTPUT_DIRECTORY}/RegisterCodegenUnboxedKernelsEverything.cpp
${OUTPUT_DIRECTORY}/RegisterCPUCustomOps.cpp
${OUTPUT_DIRECTORY}/Functions.h
${OUTPUT_DIRECTORY}/NativeFunctions.h
${OUTPUT_DIRECTORY}/CustomOpsNativeFunctions.h
)
message(STATUS "Generating sources for unboxing kernels ${GEN_COMMAND}")
add_custom_command(
COMMENT "Generating sources"
OUTPUT ${GEN_COMMAND_sources}
COMMAND ${GEN_COMMAND}
DEPENDS
${all_python}
${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml
${TORCH_ROOT}/aten/src/ATen/native/tags.yaml
${TEST_ROOT}/templates/Functions.h
${TEST_ROOT}/templates/NativeFunctions.h
${TEST_ROOT}/templates/RegisterCodegenUnboxedKernels.cpp
${TEST_ROOT}/templates/RegisterDispatchKeyCustomOps.cpp
WORKING_DIRECTORY ${TORCH_ROOT}
)
add_custom_target(unbox_target DEPENDS ${GEN_COMMAND_sources})
add_library(unbox_lib STATIC
${GEN_COMMAND_sources}
${TEST_ROOT}/operator_registry.cpp
${TEST_ROOT}/custom_ops.cpp
)
target_include_directories(unbox_lib PUBLIC ${TEST_ROOT} ${ATen_CPU_INCLUDE})
target_link_libraries(unbox_lib PUBLIC torch_cpu)
target_compile_definitions(unbox_lib PUBLIC USE_ATEN_LIB)
add_executable(test_edge_op_registration
${TEST_ROOT}/test_operator_registration.cpp
${TEST_ROOT}/test_main.cpp
)
target_compile_definitions(test_edge_op_registration PRIVATE USE_GTEST)
target_link_libraries(test_edge_op_registration PRIVATE gtest_main unbox_lib)
if((CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") OR (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
target_link_options(test_edge_op_registration PRIVATE
"-Wl,-force_load,$<TARGET_FILE:unbox_lib>"
)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
target_link_options(test_edge_op_registration PRIVATE
"-Wl,--whole-archive,$<TARGET_FILE:unbox_lib>,--no-whole-archive"
)
endif()
if(INSTALL_TEST)
set_target_properties(test_edge_op_registration PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib")
install(TARGETS test_edge_op_registration DESTINATION bin)
endif()

test/edge/Evalue.h Normal file
@@ -0,0 +1,479 @@
#pragma once
#include <ATen/ATen.h>
/**
* WARNING: EValue is a class used by ExecuTorch for its boxed operators. It
* contains logic similar to `IValue` in PyTorch, providing APIs to convert
* boxed values to unboxed values.
*
* It mirrors an fbcode-internal source file,
* [`EValue.h`](https://www.internalfb.com/code/fbsource/xplat/executorch/core/values/Evalue.h).
*
* We mirror this class to make sure the torchgen logic has CI coverage, given
* that torchgen is used for both ExecuTorch and PyTorch.
*
* If any of the logic here needs to change, please update the fbcode version
* of `Evalue.h` as well. The two versions will be merged as soon as ExecuTorch
* is in OSS (hopefully by Q2 2023).
*/
namespace torch {
namespace executor {
#define ET_CHECK_MSG TORCH_CHECK_MSG
#define EXECUTORCH_FORALL_TAGS(_) \
_(None) \
_(Tensor) \
_(String) \
_(Double) \
_(Int) \
_(Bool) \
_(ListBool) \
_(ListDouble) \
_(ListInt) \
_(ListTensor) \
_(ListScalar) \
_(ListOptionalTensor)
enum class Tag : uint32_t {
#define DEFINE_TAG(x) x,
EXECUTORCH_FORALL_TAGS(DEFINE_TAG)
#undef DEFINE_TAG
};
struct EValue;
template <typename T>
struct evalue_to_const_ref_overload_return {
using type = T;
};
template <>
struct evalue_to_const_ref_overload_return<at::Tensor> {
using type = const at::Tensor&;
};
template <typename T>
struct evalue_to_ref_overload_return {
using type = T;
};
template <>
struct evalue_to_ref_overload_return<at::Tensor> {
using type = at::Tensor&;
};
/*
* Helper class used to correlate EValues in the executor table, with the
* unwrapped list of the proper type. Because values in the runtime's values
* table can change during execution, we cannot statically allocate list of
* objects at deserialization. Imagine the serialized list says index 0 in the
* value table is element 2 in the list, but during execution the value in
* element 2 changes (in the case of tensor this means the TensorImpl* stored in
* the tensor changes). To solve this instead they must be created dynamically
* whenever they are used.
*/
template <typename T>
class EValObjectList {
public:
EValObjectList() = default;
/*
* `wrapped_vals` is a list of pointers into the runtime's values table whose
* destinations correlate with the elements of the list; `unwrapped_vals` is a
* container of the same size that serves as memory for constructing the
* unwrapped values.
*/
EValObjectList(EValue** wrapped_vals, T* unwrapped_vals, int size)
: wrapped_vals_(wrapped_vals, size), unwrapped_vals_(unwrapped_vals) {}
/*
* Constructs and returns the list of T specified by the EValue pointers
*/
at::ArrayRef<T> get() const;
private:
// Source of truth for the list
at::ArrayRef<EValue*> wrapped_vals_;
// Same size as wrapped_vals
mutable T* unwrapped_vals_;
};
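// Illustrative usage (editor's sketch, not part of the original file):
//   EValue* slots[2] = {&values[3], &values[7]}; // pointers into the values table
//   at::Tensor scratch[2];                       // backing memory for unwrapped values
//   EValObjectList<at::Tensor> list(slots, scratch, /*size=*/2);
//   at::ArrayRef<at::Tensor> view = list.get();  // re-reads the table on each call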
// Aggregate typing system similar to IValue, only slimmed down with less
// functionality, no dependency on <atomic>, and fewer supported types to
// better suit embedded systems (i.e. no intrusive_ptr).
struct EValue {
union Payload {
// When in ATen mode at::Tensor is not trivially copyable, this nested union
// lets us handle tensor as a special case while leaving the rest of the
// fields in a simple state instead of requiring a switch on tag everywhere.
union TriviallyCopyablePayload {
TriviallyCopyablePayload() : as_int(0) {}
// Scalar supported through these 3 types
int64_t as_int;
double as_double;
bool as_bool;
// TODO(jakeszwe): convert back to pointers to optimize size of this
// struct
at::ArrayRef<char> as_string;
at::ArrayRef<int64_t> as_int_list;
at::ArrayRef<double> as_double_list;
at::ArrayRef<bool> as_bool_list;
EValObjectList<at::Tensor> as_tensor_list;
EValObjectList<std::optional<at::Tensor>> as_list_optional_tensor;
} copyable_union;
// Since a Tensor just holds a TensorImpl*, there's no value to use Tensor*
// here.
at::Tensor as_tensor;
Payload() {}
~Payload() {}
};
// Data storage and type tag
Payload payload;
Tag tag;
// Basic ctors and assignments
EValue(const EValue& rhs) : EValue(rhs.payload, rhs.tag) {}
EValue(EValue&& rhs) noexcept : tag(rhs.tag) {
moveFrom(std::move(rhs));
}
EValue& operator=(EValue&& rhs) & noexcept {
if (&rhs == this) {
return *this;
}
destroy();
moveFrom(std::move(rhs));
return *this;
}
EValue& operator=(EValue const& rhs) & {
// Define copy assignment through copy ctor and move assignment
*this = EValue(rhs);
return *this;
}
~EValue() {
destroy();
}
/****** None Type ******/
EValue() : tag(Tag::None) {
payload.copyable_union.as_int = 0;
}
bool isNone() const {
return tag == Tag::None;
}
/****** Int Type ******/
/*implicit*/ EValue(int64_t i) : tag(Tag::Int) {
payload.copyable_union.as_int = i;
}
bool isInt() const {
return tag == Tag::Int;
}
int64_t toInt() const {
ET_CHECK_MSG(isInt(), "EValue is not an int.");
return payload.copyable_union.as_int;
}
/****** Double Type ******/
/*implicit*/ EValue(double d) : tag(Tag::Double) {
payload.copyable_union.as_double = d;
}
bool isDouble() const {
return tag == Tag::Double;
}
double toDouble() const {
ET_CHECK_MSG(isDouble(), "EValue is not a Double.");
return payload.copyable_union.as_double;
}
/****** Bool Type ******/
/*implicit*/ EValue(bool b) : tag(Tag::Bool) {
payload.copyable_union.as_bool = b;
}
bool isBool() const {
return tag == Tag::Bool;
}
bool toBool() const {
ET_CHECK_MSG(isBool(), "EValue is not a Bool.");
return payload.copyable_union.as_bool;
}
/****** Scalar Type ******/
/// Construct an EValue using the implicit value of a Scalar.
/*implicit*/ EValue(at::Scalar s) {
if (s.isIntegral(false)) {
tag = Tag::Int;
payload.copyable_union.as_int = s.to<int64_t>();
} else if (s.isFloatingPoint()) {
tag = Tag::Double;
payload.copyable_union.as_double = s.to<double>();
} else if (s.isBoolean()) {
tag = Tag::Bool;
payload.copyable_union.as_bool = s.to<bool>();
} else {
ET_CHECK_MSG(false, "Scalar passed to EValue is not initialized.");
}
}
bool isScalar() const {
return tag == Tag::Int || tag == Tag::Double || tag == Tag::Bool;
}
at::Scalar toScalar() const {
// Convert from implicit value to Scalar using implicit constructors.
if (isDouble()) {
return toDouble();
} else if (isInt()) {
return toInt();
} else if (isBool()) {
return toBool();
} else {
ET_CHECK_MSG(false, "EValue is not a Scalar.");
return c10::Scalar();
}
}
/****** Tensor Type ******/
/*implicit*/ EValue(at::Tensor t) : tag(Tag::Tensor) {
// When built in ATen mode, at::Tensor has a non-trivial constructor and
// destructor, so regular assignment to a union field is UB. Instead we must
// go through placement new (which causes a refcount bump).
new (&payload.as_tensor) at::Tensor(t);
}
bool isTensor() const {
return tag == Tag::Tensor;
}
at::Tensor toTensor() && {
ET_CHECK_MSG(isTensor(), "EValue is not a Tensor.");
return std::move(payload.as_tensor);
}
at::Tensor& toTensor() & {
ET_CHECK_MSG(isTensor(), "EValue is not a Tensor.");
return payload.as_tensor;
}
const at::Tensor& toTensor() const& {
ET_CHECK_MSG(isTensor(), "EValue is not a Tensor.");
return payload.as_tensor;
}
/****** String Type ******/
/*implicit*/ EValue(const char* s, size_t size) : tag(Tag::String) {
payload.copyable_union.as_string = at::ArrayRef<char>(s, size);
}
bool isString() const {
return tag == Tag::String;
}
std::string_view toString() const {
ET_CHECK_MSG(isString(), "EValue is not a String.");
return std::string_view(
payload.copyable_union.as_string.data(),
payload.copyable_union.as_string.size());
}
/****** Int List Type ******/
/*implicit*/ EValue(at::ArrayRef<int64_t> i) : tag(Tag::ListInt) {
payload.copyable_union.as_int_list = i;
}
bool isIntList() const {
return tag == Tag::ListInt;
}
at::ArrayRef<int64_t> toIntList() const {
ET_CHECK_MSG(isIntList(), "EValue is not an Int List.");
return payload.copyable_union.as_int_list;
}
/****** Bool List Type ******/
/*implicit*/ EValue(at::ArrayRef<bool> b) : tag(Tag::ListBool) {
payload.copyable_union.as_bool_list = b;
}
bool isBoolList() const {
return tag == Tag::ListBool;
}
at::ArrayRef<bool> toBoolList() const {
ET_CHECK_MSG(isBoolList(), "EValue is not a Bool List.");
return payload.copyable_union.as_bool_list;
}
/****** Double List Type ******/
/*implicit*/ EValue(at::ArrayRef<double> d) : tag(Tag::ListDouble) {
payload.copyable_union.as_double_list = d;
}
bool isDoubleList() const {
return tag == Tag::ListDouble;
}
at::ArrayRef<double> toDoubleList() const {
ET_CHECK_MSG(isDoubleList(), "EValue is not a Double List.");
return payload.copyable_union.as_double_list;
}
/****** Tensor List Type ******/
/*implicit*/ EValue(EValObjectList<at::Tensor> t) : tag(Tag::ListTensor) {
payload.copyable_union.as_tensor_list = t;
}
bool isTensorList() const {
return tag == Tag::ListTensor;
}
at::ArrayRef<at::Tensor> toTensorList() const {
ET_CHECK_MSG(isTensorList(), "EValue is not a Tensor List.");
return payload.copyable_union.as_tensor_list.get();
}
/****** List Optional Tensor Type ******/
/*implicit*/ EValue(EValObjectList<std::optional<at::Tensor>> t)
: tag(Tag::ListOptionalTensor) {
payload.copyable_union.as_list_optional_tensor = t;
}
bool isListOptionalTensor() const {
return tag == Tag::ListOptionalTensor;
}
at::ArrayRef<std::optional<at::Tensor>> toListOptionalTensor() {
return payload.copyable_union.as_list_optional_tensor.get();
}
/****** ScalarType Type ******/
at::ScalarType toScalarType() const {
ET_CHECK_MSG(isInt(), "EValue is not a ScalarType.");
return static_cast<at::ScalarType>(payload.copyable_union.as_int);
}
/****** MemoryFormat Type ******/
at::MemoryFormat toMemoryFormat() const {
ET_CHECK_MSG(isInt(), "EValue is not a MemoryFormat.");
return static_cast<at::MemoryFormat>(payload.copyable_union.as_int);
}
template <typename T>
T to() &&;
template <typename T>
typename evalue_to_ref_overload_return<T>::type to() &;
/**
* Converts the EValue to an optional object that can represent both T and
* an uninitialized state.
*/
template <typename T>
inline std::optional<T> toOptional() {
if (this->isNone()) {
return std::nullopt;
}
return this->to<T>();
}
private:
// Precondition: the payload value has had its destructor called.
void clearToNone() noexcept {
payload.copyable_union.as_int = 0;
tag = Tag::None;
}
// Shared move logic
void moveFrom(EValue&& rhs) noexcept {
if (rhs.isTensor()) {
new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor));
rhs.payload.as_tensor.~Tensor();
} else {
payload.copyable_union = rhs.payload.copyable_union;
}
tag = rhs.tag;
rhs.clearToNone();
}
// Destructs stored tensor if there is one
void destroy() {
// Necessary for an ATen tensor to decrement the refcount of the intrusive_ptr
// to its TensorImpl, which got a refcount increment when we placed it in the
// EValue; a no-op for an ExecuTorch tensor. An #ifdef here could buy a minor
// performance bump at a code-maintainability cost.
if (isTensor()) {
payload.as_tensor.~Tensor();
} else if (isTensorList()) {
for (auto& tensor : toTensorList()) {
tensor.~Tensor();
}
} else if (isListOptionalTensor()) {
for (auto& optional_tensor : toListOptionalTensor()) {
optional_tensor.~optional();
}
}
}
EValue(const Payload& p, Tag t) : tag(t) {
if (isTensor()) {
new (&payload.as_tensor) at::Tensor(p.as_tensor);
} else {
payload.copyable_union = p.copyable_union;
}
}
};
#define EVALUE_DEFINE_TO(T, method_name) \
template <> \
inline evalue_to_ref_overload_return<T>::type EValue::to<T>()& { \
return static_cast<T>(this->method_name()); \
}
template <>
inline at::Tensor& EValue::to<at::Tensor>() & {
return this->toTensor();
}
EVALUE_DEFINE_TO(at::Scalar, toScalar)
EVALUE_DEFINE_TO(int64_t, toInt)
EVALUE_DEFINE_TO(bool, toBool)
EVALUE_DEFINE_TO(double, toDouble)
EVALUE_DEFINE_TO(std::string_view, toString)
EVALUE_DEFINE_TO(at::ScalarType, toScalarType)
EVALUE_DEFINE_TO(at::MemoryFormat, toMemoryFormat)
EVALUE_DEFINE_TO(std::optional<at::Tensor>, toOptional<at::Tensor>)
EVALUE_DEFINE_TO(at::ArrayRef<int64_t>, toIntList)
EVALUE_DEFINE_TO(
std::optional<at::ArrayRef<int64_t>>,
toOptional<at::ArrayRef<int64_t>>)
EVALUE_DEFINE_TO(
std::optional<at::ArrayRef<double>>,
toOptional<at::ArrayRef<double>>)
EVALUE_DEFINE_TO(at::ArrayRef<std::optional<at::Tensor>>, toListOptionalTensor)
EVALUE_DEFINE_TO(at::ArrayRef<double>, toDoubleList)
#undef EVALUE_DEFINE_TO
template <typename T>
at::ArrayRef<T> EValObjectList<T>::get() const {
for (size_t i = 0; i < wrapped_vals_.size(); i++) {
unwrapped_vals_[i] = wrapped_vals_[i]->template to<T>();
}
return at::ArrayRef<T>{unwrapped_vals_, wrapped_vals_.size()};
}
} // namespace executor
} // namespace torch
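Editor's sketch (not part of the diff): a minimal illustration of the boxed/unboxed API above, assuming only the Evalue.h header shown here.

#include "Evalue.h"

void evalue_round_trip() {
  using torch::executor::EValue;
  EValue boxed(int64_t{42});            // tagged Tag::Int
  int64_t unboxed = boxed.toInt();      // checked accessor, asserts the tag
  EValue t(at::ones({2, 2}));           // placement-new into the union payload
  at::Tensor& ref = t.to<at::Tensor>(); // to<T>() returns a reference for Tensor
  (void)unboxed;
  (void)ref;
}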

test/edge/custom_ops.cpp Normal file
@@ -0,0 +1,10 @@
#include <ATen/Tensor.h>
namespace custom {
namespace native {
at::Tensor& add_3_out(const at::Tensor& a, const at::Tensor& b, const at::Tensor& c, at::Tensor& out) {
out = a.add(b).add(c);
return out;
}
}
}

@@ -0,0 +1,4 @@
- func: custom::add_3.out(Tensor a, Tensor b, Tensor c, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: custom::add_3_out

test/edge/event_tracer.h Normal file
@@ -0,0 +1,33 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <stdlib.h>
#include <cstdint>
namespace torch {
namespace executor {
typedef uint32_t AllocatorID;
typedef int32_t ChainID;
typedef uint32_t DebugHandle;
/**
* EventTracer is a class that users can inherit and implement to
* log/serialize/stream etc. the profiling and debugging events that are
* generated at runtime for a model. An example of this is the ETDump
* implementation in the SDK codebase that serializes these events to a
* flatbuffer.
*/
class EventTracer {};
struct EventTracerEntry {};
} // namespace executor
} // namespace torch

@@ -0,0 +1,91 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <event_tracer.h>
/**
* @file
*
* This file contains the hooks that are inserted across various parts of the
* core runtime code to call into the EventTracer class for logging of profiling
* and debugging events. Any calls made to the EventTracer from the runtime must
* be made via these hooks.
* Users shouldn't add these hooks directly in their code; they are meant only
* for use in ExecuTorch internal code.
*
* The benefit of defining these hooks is that we can easily control whether or
* not we want to compile in the EventTracer code based on the status of the
* ET_EVENT_TRACER_ENABLED flag.
*/
namespace torch {
namespace executor {
namespace internal {
/**
* This class enables scope based profiling where needed using RAII.
* Profiling will be started when the object is created and will end
* when the object goes out of scope.
*/
class EventTracerProfileScope final {
public:
EventTracerProfileScope(EventTracer* event_tracer, const char* name) {};
~EventTracerProfileScope() {};
private:
EventTracer* event_tracer_;
EventTracerEntry event_entry_;
};
/**
* This class enables scope based profiling where needed using RAII.
* Profiling will be started when the object is created and will end
* when the object goes out of scope.
*/
class EventTracerProfileOpScope final {
public:
EventTracerProfileOpScope(EventTracer* event_tracer, const char* name) {};
~EventTracerProfileOpScope() {};
private:
EventTracer* event_tracer_;
EventTracerEntry event_entry_;
};
/**
* This class helps us set and then clear out the chain id and debug handle
* values stored in the event tracer class using RAII. This is typically called
* in the executor loop before entering the codegen layer to configure the chain
* id and debug handle of the current instruction being executed.
* After we return from the kernel execution we can then reset the chain id and
* debug handle to defaults when this object goes out of scope.
*/
class EventTracerProfileInstructionScope final {
public:
EventTracerProfileInstructionScope(
EventTracer* event_tracer,
ChainID chain_idx,
DebugHandle debug_handle) {};
~EventTracerProfileInstructionScope() {};
private:
EventTracer* event_tracer_;
};
inline void event_tracer_log_evalue(EventTracer* event_tracer, EValue& evalue) {
(void)event_tracer;
(void)evalue;
}
} // namespace internal
} // namespace executor
} // namespace torch
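The generated unboxing wrappers are the intended call sites for these hooks; the torchgen test later in this diff expects exactly this bracketing. A hedged sketch (operator and wrapper names are illustrative, and the KernelRuntimeContext/EValue headers from this diff are assumed to be included):

void call_demo_kernel(torch::executor::KernelRuntimeContext& context,
                      torch::executor::EValue** stack) {
  torch::executor::internal::EventTracerProfileOpScope op_scope(
      context.internal_event_tracer(), "native_call_demo");
  // ... invoke the kernel, writing its result into *stack[0] ...
  torch::executor::internal::event_tracer_log_evalue(
      context.internal_event_tracer(), *stack[0]);
}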

@@ -0,0 +1,44 @@
#pragma once
#include "event_tracer.h"
namespace torch {
namespace executor {
/**
* Bucket type abstraction that contains many elements of runtime state that
* a kernel author may want available, but would otherwise be unable to access.
*
* Forwarded along to all operators when running in lean mode. NOTE: Will not be
* forwarded to operators if running in ATen mode as those operators do not
* expect to receive a KernelRuntimeContext and would not use it.
*
* This includes things like setting an error state, a scratch allocator for
* operators that need more than constant space, and a TensorResizer for
* dynamic-shape tensors, allowing programs to be more flexible with Tensor shape.
*/
class KernelRuntimeContext {
public:
/**
* Construct a new kernel runtime context along with an optional event tracer.
*/
KernelRuntimeContext(EventTracer* event_tracer = nullptr)
: event_tracer_(event_tracer) {}
/**
* INTERNAL ONLY
*
* Returns a pointer to an instance of EventTracer to do profiling/debugging
* logging inside the codegen layer. This is only for internal usage inside
* the codegen layer and users should not be accessing this.
*/
EventTracer* internal_event_tracer() {
return event_tracer_;
}
private:
EventTracer* event_tracer_;
};
} // namespace executor
} // namespace torch
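Because the tracer argument defaults to nullptr, a context can be default-constructed, as the registration tests below do, or handed a tracer explicitly; a quick editor's sketch:

torch::executor::KernelRuntimeContext plain_ctx;            // no tracer attached
torch::executor::EventTracer tracer;
torch::executor::KernelRuntimeContext traced_ctx(&tracer);  // tracer reachable from codegen hooks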

@@ -0,0 +1,45 @@
#include <c10/util/Exception.h>
#include <operator_registry.h>
namespace torch {
namespace executor {
KernelRegistry& getKernelRegistry() {
static KernelRegistry kernel_registry;
return kernel_registry;
}
bool register_kernels(const ArrayRef<Kernel>& kernels) {
return getKernelRegistry().register_kernels(kernels);
}
bool KernelRegistry::register_kernels(
const ArrayRef<Kernel>& kernels) {
for (const auto& kernel : kernels) {
this->kernels_map_[kernel.name_] = kernel.kernel_;
}
return true;
}
bool hasKernelFn(const char* name) {
return getKernelRegistry().hasKernelFn(name);
}
bool KernelRegistry::hasKernelFn(const char* name) {
auto kernel = this->kernels_map_.find(name);
return kernel != this->kernels_map_.end();
}
KernelFunction& getKernelFn(const char* name) {
return getKernelRegistry().getKernelFn(name);
}
KernelFunction& KernelRegistry::getKernelFn(const char* name) {
auto kernel = this->kernels_map_.find(name);
TORCH_CHECK_MSG(kernel != this->kernels_map_.end(), "Kernel not found!");
return kernel->second;
}
} // namespace executor
} // namespace torch

@@ -0,0 +1,72 @@
#pragma once
#include <cstring>
#include <functional>
#include <map>
#include "Evalue.h"
#include "kernel_runtime_context.h"
#include <c10/util/ArrayRef.h>
namespace torch {
namespace executor {
using KernelFunction = std::function<void(KernelRuntimeContext&, EValue**)>;
template<typename T>
using ArrayRef = at::ArrayRef<T>;
#define EXECUTORCH_SCOPE_PROF(x)
struct Kernel {
const char* name_;
KernelFunction kernel_;
Kernel() = default;
/**
* We copy the string pointer instead of duplicating the string itself, so we
* require the lifetime of the kernel name to be at least as long as the
* kernel registry.
*/
explicit Kernel(const char* name, KernelFunction func)
: name_(name), kernel_(func) {}
};
/**
* See KernelRegistry::hasKernelFn()
*/
bool hasKernelFn(const char* name);
/**
* See KernelRegistry::getKernelFn()
*/
KernelFunction& getKernelFn(const char* name);
[[nodiscard]] bool register_kernels(const ArrayRef<Kernel>&);
struct KernelRegistry {
public:
KernelRegistry() : kernelRegSize_(0) {}
bool register_kernels(const ArrayRef<Kernel>&);
/**
* Checks whether a kernel with a given name is registered.
*/
bool hasKernelFn(const char* name);
/**
* Returns the kernel function registered under the given name; fails a TORCH_CHECK if none is found.
*/
KernelFunction& getKernelFn(const char* name);
private:
std::map<const char*, KernelFunction> kernels_map_;
uint32_t kernelRegSize_;
};
} // namespace executor
} // namespace torch
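Tying the pieces together: a kernel is registered by handing a static array of Kernel entries to register_kernels() during static initialization, which is what the generated RegisterCodegenUnboxedKernelsEverything.cpp does. A minimal hand-written editor's sketch with a made-up operator name:

static torch::executor::Kernel demo_kernels[] = {
    torch::executor::Kernel(
        "demo::noop.out",
        [](torch::executor::KernelRuntimeContext&, torch::executor::EValue**) {}),
};
// Assignment kept only to force registration at static-initialization time.
static bool demo_registered = torch::executor::register_kernels(
    torch::executor::ArrayRef<torch::executor::Kernel>(demo_kernels, 1));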

@@ -0,0 +1,463 @@
build_features: []
custom_classes: []
include_all_non_op_selectives: false
include_all_operators: false
kernel_metadata: {}
et_kernel_metadata:
custom::add_3.out:
- v1/6;0,1,2,3|6;0,1,2,3|6;0,1,2,3
- v1/3;0,1,2,3|3;0,1,2,3|3;0,1,2,3
aten::add.out:
- v1/6;0,1,2,3|6;0,1,2,3|6;0,1,2,3
- v1/3;0,1,2,3|3;0,1,2,3|3;0,1,2,3
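# Editor's note: each kernel key above reads "v<version>/<dtype>;<dim order>" per
# tensor argument, '|'-separated (6 = Float and 3 = Int in c10::ScalarType, with
# contiguous dim order 0,1,2,3); see ETKernelKey.gen_from_yaml in the torchgen
# tests later in this diff.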
operators:
aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::_reshape_alias_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::_softmax.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::_to_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::_unique2.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::add.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::addmm.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::avg_pool2d.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::baddbmm.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::bitwise_and.Tensor_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::bmm.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::cat.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::clamp.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::clone.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::constant_pad_nd.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::conv1d.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::convolution.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::cumsum.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::detach_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::div.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::embedding.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::eq.Scalar_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::eq.Tensor_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::exp.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::expand_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::floor_divide.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::gelu.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::grid_sampler_2d.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::gt.Scalar_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::index.Tensor_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::index_put.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::index_select.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::leaky_relu.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::linalg_inv_ex.inverse:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::logit.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::masked_fill.Scalar_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::max.unary_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::max_pool2d_with_indices.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::mean.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::minimum.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::mm.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::mul.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::native_batch_norm.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::native_layer_norm.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::ne.Scalar_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::nonzero.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::permute_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::pixel_shuffle.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::relu.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::remainder.Scalar_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::repeat.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::round.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::rsub.Scalar_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::select_copy.int_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::sigmoid.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::slice_copy.Tensor_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::softplus.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::sort.values:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::split_copy.Tensor_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::split_with_sizes_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::stack.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::sub.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::sum.IntList_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::tanh.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::topk.values:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::transpose_copy.int_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::unbind_copy.int_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::unsafe_split.Tensor_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::unsqueeze_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::upsample_bilinear2d.vec_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::upsample_nearest2d.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::upsample_nearest2d.vec_out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::view_copy.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
aten::zeros_like.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true
custom::add_3.out:
debug_info:
- functions.yaml
include_all_overloads: false
is_root_operator: true
is_used_for_training: true

@@ -0,0 +1,25 @@
// clang-format off
#pragma once
#include <ATen/Context.h>
#include <ATen/DeviceGuard.h>
#include <ATen/TensorUtils.h>
#include <ATen/TracerMode.h>
#include <ATen/core/Generator.h>
#include <ATen/core/Reduction.h>
#include <ATen/core/Tensor.h>
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Optional.h>
// ${generated_comment}
${static_dispatch_extra_headers}
namespace torch {
namespace executor {
${Functions_declarations}
} // namespace executor
} // namespace torch

@@ -0,0 +1,31 @@
#pragma once
// ${generated_comment}
#ifdef TORCH_ASSERT_NO_OPERATORS
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if your change would be better placed in \
another file, or if a more specific header might achieve the same goal. \
See NOTE: [Tensor vs. TensorBase]
#endif
#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
#error This change adds a dependency on all pytorch operators, meaning the \
file will need to be re-compiled every time an operator is changed or added. \
Consider including a specific operator from <ATen/ops/{my_operator}_native.h> \
and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
#endif
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Optional.h>
#include <c10/core/QScheme.h>
#include <ATen/core/Reduction.h>
#include <ATen/core/Tensor.h>
#include <tuple>
#include <vector>
${nativeFunctions_declarations}

@@ -0,0 +1,28 @@
#include <operator_registry.h>
#include <event_tracer_hooks.h>
#include "${fn_header}" // Generated Function import headers
namespace torch {
namespace executor {
using namespace internal;
namespace {
using KernelArrayRef = ::at::ArrayRef<::torch::executor::Kernel>;
static Kernel kernels_to_register[] = {
${unboxed_kernels} // Generated operators
};
// Explicitly convert to ArrayRef, so that the API can take an empty C array of
// Kernels.
static KernelArrayRef kernel_array_ref(
kernels_to_register,
kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel));
// Return value not used. Keep the static variable assignment to register
// operators at static initialization time.
static auto success_with_kernel_reg = register_kernels(kernel_array_ref);
} // namespace
} // namespace executor
} // namespace torch
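For intuition, the ${unboxed_kernels} placeholder expands to comma-separated Kernel entries whose lambdas unbox arguments from the EValue stack and box the result back. A simplified, hypothetical expansion (the exact expected shape is asserted in test_executorch_gen.py later in this diff):

// Kernel(
//     "custom_1::op_1",
//     [](torch::executor::KernelRuntimeContext& context, EValue** stack) {
//       EXECUTORCH_SCOPE_PROF("native_call_op_1");
//       bool result_ = at::native::kernel_1(context);
//       *stack[0] = EValue(result_);
//     }),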

@@ -0,0 +1,27 @@
// clang-format off
// Generated code for registering custom operators into the dispatcher.
#include <torch/library.h>
#include <ATen/Tensor.h>
$ops_headers
namespace torch {
namespace executor {
namespace function {
${dispatch_anonymous_definitions}
// All out variants ops
${static_init_dispatch_registrations}
namespace ${dispatch_namespace}
{
${dispatch_namespaced_definitions}
} // namespace ${dispatch_namespace}
} // namespace function
} // namespace executor
} // namespace torch

@@ -0,0 +1,22 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// ${generated_comment}
// Exposing an API for registering all kernels at once.
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/operator_registry.h>
#include <executorch/runtime/platform/profiler.h>
namespace torch {
namespace executor {
Error register_all_kernels();
} // namespace executor
} // namespace torch

@@ -0,0 +1,10 @@
// ${generated_comment}
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <torch/library.h>
namespace at {
TORCH_LIBRARY_FRAGMENT(aten, m) {
${aten_schema_registrations};
}
$schema_registrations
} // namespace at

test/edge/test_main.cpp Normal file
@@ -0,0 +1,18 @@
#include <gtest/gtest.h>
// Appends `flag` to the current gtest filter as a negative pattern; e.g. the
// default filter "*" becomes "*-*_CUDA:*_MultiCUDA".
std::string add_negative_flag(const std::string& flag) {
std::string filter = ::testing::GTEST_FLAG(filter);
if (filter.find('-') == std::string::npos) {
filter.push_back('-');
} else {
filter.push_back(':');
}
filter += flag;
return filter;
}
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
::testing::GTEST_FLAG(filter) = add_negative_flag("*_CUDA:*_MultiCUDA");
return RUN_ALL_TESTS();
}

@@ -0,0 +1,53 @@
#include "kernel_runtime_context.h"
#include "operator_registry.h"
#include <gtest/gtest.h>
namespace torch {
namespace executor {
// add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
TEST(OperatorRegistrationTest, Add) {
EValue values[4];
values[0] = EValue(at::ones({2, 3}));
values[1] = EValue(at::ones({2, 3}));
values[2] = EValue(int64_t(1));
values[3] = EValue(at::zeros({2, 3}));
ASSERT_TRUE(hasKernelFn("aten::add.out"));
auto op = getKernelFn("aten::add.out");
EValue* kernel_values[4];
for (size_t i = 0; i < 4; i++) {
kernel_values[i] = &values[i];
}
KernelRuntimeContext context{};
op(context, kernel_values);
at::Tensor expected = at::ones({2, 3});
expected = at::fill(expected, 2);
ASSERT_TRUE(expected.equal(kernel_values[3]->toTensor()));
}
// custom::add_3.out(Tensor a, Tensor b, Tensor c, *, Tensor(a!) out) -> Tensor(a!)
TEST(OperatorRegistrationTest, CustomAdd3) {
EValue values[4];
values[0] = EValue(at::ones({2, 3}));
values[1] = EValue(at::ones({2, 3}));
values[2] = EValue(at::ones({2, 3}));
values[3] = EValue(at::zeros({2, 3}));
ASSERT_TRUE(hasKernelFn("custom::add_3.out"));
auto op = getKernelFn("custom::add_3.out");
EValue* kernel_values[4];
for (size_t i = 0; i < 4; i++) {
kernel_values[i] = &values[i];
}
KernelRuntimeContext context{};
op(context, kernel_values);
at::Tensor expected = at::ones({2, 3});
expected = at::fill(expected, 3);
ASSERT_TRUE(expected.equal(kernel_values[3]->toTensor()));
}
} // namespace executor
} // namespace torch

@@ -287,3 +287,18 @@ def define_tools_targets(
":autograd",
],
)
python_test(
name = "test_torchgen_executorch",
srcs = [
"test/test_executorch_gen.py",
"test/test_executorch_signatures.py",
"test/test_executorch_types.py",
"test/test_executorch_unboxing.py",
],
contacts = contacts,
visibility = ["PUBLIC"],
deps = [
torchgen_deps,
],
)

@@ -0,0 +1,147 @@
from __future__ import annotations
import tempfile
import unittest
from typing import Any
from unittest.mock import ANY, Mock, patch
import expecttest
import torchgen
from torchgen.executorch.api.custom_ops import ComputeNativeFunctionStub
from torchgen.executorch.model import ETKernelIndex
from torchgen.gen_executorch import gen_headers
from torchgen.model import Location, NativeFunction
from torchgen.selective_build.selector import SelectiveBuilder
from torchgen.utils import FileManager
SPACES = " "
def _get_native_function_from_yaml(yaml_obj: dict[str, object]) -> NativeFunction:
native_function, _ = NativeFunction.from_yaml(
yaml_obj,
loc=Location(__file__, 1),
valid_tags=set(),
)
return native_function
class TestComputeNativeFunctionStub(expecttest.TestCase):
"""
Could use torch.testing._internal.common_utils to reduce boilerplate, but the
GH CI job doesn't build torch before running the tools unit tests, hence
these parametrized tests are added manually.
"""
def _test_function_schema_generates_correct_kernel(
self, obj: dict[str, Any], expected: str
) -> None:
func = _get_native_function_from_yaml(obj)
gen = ComputeNativeFunctionStub()
res = gen(func)
self.assertIsNotNone(res)
self.assertExpectedInline(
str(res),
expected,
)
def test_function_schema_generates_correct_kernel_tensor_out(self) -> None:
obj = {"func": "custom::foo.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"}
expected = """
at::Tensor & wrapper_CPU_out_foo_out(const at::Tensor & self, at::Tensor & out) {
return out;
}
"""
self._test_function_schema_generates_correct_kernel(obj, expected)
def test_function_schema_generates_correct_kernel_no_out(self) -> None:
obj = {"func": "custom::foo.Tensor(Tensor self) -> Tensor"}
expected = """
at::Tensor wrapper_CPU_Tensor_foo(const at::Tensor & self) {
return self;
}
"""
self._test_function_schema_generates_correct_kernel(obj, expected)
def test_function_schema_generates_correct_kernel_no_return(self) -> None:
obj = {"func": "custom::foo.out(Tensor self, *, Tensor(a!)[] out) -> ()"}
expected = f"""
void wrapper_CPU_out_foo_out(const at::Tensor & self, at::TensorList out) {{
{SPACES}
}}
"""
self._test_function_schema_generates_correct_kernel(obj, expected)
def test_function_schema_generates_correct_kernel_3_returns(self) -> None:
obj = {
"func": "custom::foo(Tensor self, Tensor[] other) -> (Tensor, Tensor, Tensor)"
}
expected = """
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_CPU__foo(const at::Tensor & self, at::TensorList other) {
return ::std::tuple<at::Tensor, at::Tensor, at::Tensor>(
at::Tensor(), at::Tensor(), at::Tensor()
);
}
"""
self._test_function_schema_generates_correct_kernel(obj, expected)
def test_function_schema_generates_correct_kernel_1_return_no_out(self) -> None:
obj = {"func": "custom::foo(Tensor[] a) -> Tensor"}
expected = """
at::Tensor wrapper_CPU__foo(at::TensorList a) {
return at::Tensor();
}
"""
self._test_function_schema_generates_correct_kernel(obj, expected)
def test_schema_has_no_return_type_argument_throws(self) -> None:
func = _get_native_function_from_yaml(
{"func": "custom::foo.bool(Tensor self) -> bool"}
)
gen = ComputeNativeFunctionStub()
with self.assertRaisesRegex(Exception, "Can't handle this return type"):
gen(func)
class TestGenCustomOpsHeader(unittest.TestCase):
@patch.object(torchgen.utils.FileManager, "write_with_template")
@patch.object(torchgen.utils.FileManager, "write")
def test_fm_writes_custom_ops_header_when_boolean_is_true(
self, unused: Mock, mock_method: Mock
) -> None:
with tempfile.TemporaryDirectory() as tempdir:
fm = FileManager(tempdir, tempdir, False)
gen_headers(
native_functions=[],
gen_custom_ops_header=True,
custom_ops_native_functions=[],
selector=SelectiveBuilder.get_nop_selector(),
kernel_index=ETKernelIndex(index={}),
cpu_fm=fm,
use_aten_lib=False,
)
mock_method.assert_called_once_with(
"CustomOpsNativeFunctions.h", "NativeFunctions.h", ANY
)
@patch.object(torchgen.utils.FileManager, "write_with_template")
@patch.object(torchgen.utils.FileManager, "write")
def test_fm_does_not_write_custom_ops_header_when_boolean_is_false(
self, unused: Mock, mock_method: Mock
) -> None:
with tempfile.TemporaryDirectory() as tempdir:
fm = FileManager(tempdir, tempdir, False)
gen_headers(
native_functions=[],
gen_custom_ops_header=False,
custom_ops_native_functions=[],
selector=SelectiveBuilder.get_nop_selector(),
kernel_index=ETKernelIndex(index={}),
cpu_fm=fm,
use_aten_lib=False,
)
mock_method.assert_not_called()

@@ -0,0 +1,689 @@
from __future__ import annotations
import os
import tempfile
import unittest
import yaml
from torchgen.executorch.model import ETKernelIndex, ETKernelKey
from torchgen.gen import LineLoader
from torchgen.gen_executorch import (
ComputeCodegenUnboxedKernels,
gen_functions_declarations,
parse_yaml_files,
translate_native_yaml,
)
from torchgen.model import (
BackendIndex,
BackendMetadata,
DispatchKey,
Location,
NativeFunction,
OperatorName,
)
from torchgen.selective_build.selector import SelectiveBuilder
TEST_YAML = """
- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
ufunc_inner_loop:
Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
ScalarOnly: add (Bool)
dispatch:
SparseCPU: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
SparseCsrCPU: add_out_sparse_csr_cpu
SparseCsrCUDA: add_out_sparse_csr_cuda
MkldnnCPU: mkldnn_add_out
MPS: add_out_mps
- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: add.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: add_sparse
SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
tags: core
- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: mul_out
MPS: mul_out_mps
SparseCPU: mul_out_sparse_cpu
SparseCUDA: mul_out_sparse_cuda
SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr
MkldnnCPU: mkldnn_mul_out
- func: mul.Tensor(Tensor self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: mul.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: mul_sparse
SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr
MkldnnCPU: mkldnn_mul
ZeroTensor: mul_zerotensor
NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
tags: core
"""
TEST_KERNEL_YAML = """
- func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
ufunc_inner_loop:
Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
ScalarOnly: add (Bool)
type_alias:
T0: [Float, Double]
T1: [Double, Int]
dim_order_alias:
D0: [0, 1, 2, 3]
D1: [0, 3, 2, 1]
kernels:
- arg_meta: null
kernel_name: default_impl
- arg_meta:
self: [T0, D0]
other: [T1, D0]
out: [T0, D0]
kernel_name: test_impl
- arg_meta:
self: [T1, D0]
other: [T1, D1]
out: [T0, D1]
kernel_name: test_impl_2
- func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: add.out
variants: function, method
tags: core
- func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
structured: True
structured_inherits: TensorIteratorBase
type_alias:
T0: [Float]
T1: [Double]
dim_order_alias:
D0: [0, 1, 2, 3]
kernels:
- arg_meta: null
kernel_name: default_impl
- arg_meta:
self: [T0, D0]
other: [T1, D0]
out: [T0, D0]
kernel_name: test_impl
- func: mul.Tensor(Tensor self, Tensor other) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: mul.out
variants: function, method
tags: core
"""
class TestParseNativeYaml(unittest.TestCase):
def setUp(self) -> None:
self.temp_dir = tempfile.mkdtemp()
self.aten_yaml_path = os.path.join(self.temp_dir, "test_native_functions.yaml")
with open(self.aten_yaml_path, "w") as f:
f.write(TEST_YAML)
self.ops_yaml_path = os.path.join(self.temp_dir, "test.yaml")
self.tags_yaml_path = os.path.join(self.temp_dir, "tags.yaml")
with open(self.tags_yaml_path, "w") as f:
f.write(
"""
- tag: core
desc: test
"""
)
with open(self.ops_yaml_path, "w") as f:
f.write(
"""
- op: add.out
device_check: NoCheck # TensorIterator
dispatch:
CPU: torch::executor::add_out_kernel
- op: mul.out
device_check: NoCheck # TensorIterator
dispatch:
CPU: torch::executor::mul_out_kernel
"""
)
def test_translate_native_yaml_writes_correct_data(self) -> None:
out_yaml_path = os.path.join(self.temp_dir, "out.yaml")
with open(out_yaml_path, "w") as out_file:
translate_native_yaml(
tags_yaml_path=self.tags_yaml_path,
aten_yaml_path=self.aten_yaml_path,
native_yaml_path=self.ops_yaml_path,
use_aten_lib=False,
out_file=out_file,
)
with open(out_yaml_path) as out_file:
es = yaml.load(out_file, Loader=LineLoader)
self.assertTrue(all("func" in e for e in es))
self.assertTrue(all(e.get("variants") == "function" for e in es))
# Check that kernel fields aren't introduced in yaml
for e in es:
self.assertFalse({"kernels", "type_alias", "dim_order_alias"} < e.keys())
def test_parse_yaml_files(self) -> None:
custom_ops_yaml_path = None
selector = SelectiveBuilder.get_nop_selector()
use_aten_lib = False
parsed_yaml, custom_ops_parsed_yaml = parse_yaml_files(
aten_yaml_path=self.aten_yaml_path,
tags_yaml_path=self.tags_yaml_path,
native_yaml_path=self.ops_yaml_path,
custom_ops_yaml_path=custom_ops_yaml_path,
selector=selector,
use_aten_lib=use_aten_lib,
)
# Just the default kernel entry
expected_kernel_entry = {"add.out": 1, "mul.out": 1}
self.assertTrue(len(parsed_yaml.native_functions) == len(expected_kernel_entry))
op_entries = parsed_yaml.kernel_index.index
for op_name, kernel_mapping in op_entries.items():
self.assertTrue(
len(kernel_mapping) == expected_kernel_entry.pop(str(op_name))
)
self.assertTrue(len(expected_kernel_entry) == 0)
def tearDown(self) -> None:
import shutil
try:
shutil.rmtree(self.temp_dir)
except OSError:
pass
class TestParseKernelYamlFiles(unittest.TestCase):
def setUp(self) -> None:
self.temp_dir = tempfile.mkdtemp()
self.aten_kernel_yaml_path = os.path.join(
self.temp_dir, "test_kernel_native_functions.yaml"
)
with open(self.aten_kernel_yaml_path, "w") as f:
f.write(TEST_KERNEL_YAML)
self.ops_yaml_path = os.path.join(self.temp_dir, "test.yaml")
self.tags_yaml_path = os.path.join(self.temp_dir, "tags.yaml")
with open(self.tags_yaml_path, "w") as f:
f.write(
"""
- tag: core
desc: test
"""
)
with open(self.ops_yaml_path, "w") as f:
f.write(
"""
- op: add.out
device_check: NoCheck # TensorIterator
dispatch:
CPU: torch::executor::add_out_kernel
- op: mul.out
device_check: NoCheck # TensorIterator
dispatch:
CPU: torch::executor::mul_out_kernel
"""
)
def test_translate_kernel_native_yaml_writes_correct_data(self) -> None:
out_yaml_path = os.path.join(self.temp_dir, "out2.yaml")
with open(out_yaml_path, "w") as out_file:
translate_native_yaml(
tags_yaml_path=self.tags_yaml_path,
aten_yaml_path=self.aten_kernel_yaml_path,
native_yaml_path=self.ops_yaml_path,
use_aten_lib=False,
out_file=out_file,
)
with open(out_yaml_path) as out_file:
es = yaml.load(out_file, Loader=LineLoader)
self.assertTrue(all("func" in e for e in es))
self.assertTrue(all(e.get("variants") == "function" for e in es))
# Check persistence of kernel fields in yaml
for e in es:
self.assertTrue({"kernels", "type_alias", "dim_order_alias"} < e.keys())
def test_parse_yaml_files(self) -> None:
custom_ops_yaml_path = None
selector = SelectiveBuilder.get_nop_selector()
use_aten_lib = False
parsed_yaml, custom_ops_parsed_yaml = parse_yaml_files(
aten_yaml_path=self.aten_kernel_yaml_path,
tags_yaml_path=self.tags_yaml_path,
native_yaml_path=self.ops_yaml_path,
custom_ops_yaml_path=custom_ops_yaml_path,
selector=selector,
use_aten_lib=use_aten_lib,
)
expected_kernel_entry = {"add.out": 9, "mul.out": 2}
self.assertTrue(len(parsed_yaml.native_functions) == len(expected_kernel_entry))
op_entries = parsed_yaml.kernel_index.index
for op_name, kernel_mapping in op_entries.items():
self.assertTrue(
len(kernel_mapping) == expected_kernel_entry.pop(str(op_name))
)
self.assertTrue(len(expected_kernel_entry) == 0)
def tearDown(self) -> None:
import shutil
try:
shutil.rmtree(self.temp_dir)
except OSError:
pass
class TestGenFunctionsDeclarations(unittest.TestCase):
def setUp(self) -> None:
(
self.custom_1_native_function,
custom_1_backend_index,
) = NativeFunction.from_yaml(
{"func": "custom_1::op_1() -> bool", "dispatch": {"CPU": "kernel_1"}},
loc=Location(__file__, 1),
valid_tags=set(),
)
(
self.custom_2_native_function,
custom_2_backend_index,
) = NativeFunction.from_yaml(
{
"func": "custom_2::op_2() -> bool",
"dispatch": {"CPU": "kernel_2"},
},
loc=Location(__file__, 1),
valid_tags=set(),
)
(
self.custom_3_native_function,
custom_3_backend_index,
) = NativeFunction.from_yaml(
{
"func": "custom_3::op_3(Tensor(a!) self, Tensor x) -> Tensor(a!)",
"dispatch": {"CPU": "kernel_3"},
"variants": "method",
},
loc=Location(__file__, 1),
valid_tags=set(),
)
backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]] = {
DispatchKey.CPU: {},
DispatchKey.QuantizedCPU: {},
}
BackendIndex.grow_index(backend_indices, custom_1_backend_index)
BackendIndex.grow_index(backend_indices, custom_2_backend_index)
self.static_dispatch_idx = [
BackendIndex(
dispatch_key=k,
use_out_as_primary=True,
external=False,
device_guard=False,
index=backend_indices[k],
)
for k in backend_indices
]
self.kernel_index = ETKernelIndex.from_backend_indices(backend_indices)
def test_operators_with_different_namespaces_are_grouped_correctly(self) -> None:
declarations = gen_functions_declarations(
native_functions=[
self.custom_1_native_function,
self.custom_2_native_function,
],
kernel_index=self.kernel_index,
selector=SelectiveBuilder.get_nop_selector(),
use_aten_lib=False,
)
self.assertTrue(
"""
namespace custom_1 {
// custom_1::op_1() -> bool
TORCH_API inline bool op_1(torch::executor::KernelRuntimeContext & context) {
return ::at::native::kernel_1(context);
}
} // namespace custom_1
"""
in declarations
)
self.assertTrue(
"""
namespace custom_2 {
// custom_2::op_2() -> bool
TORCH_API inline bool op_2(torch::executor::KernelRuntimeContext & context) {
return ::at::native::kernel_2(context);
}
} // namespace custom_2
"""
in declarations
)
def test_aten_lib_has_context_arg(self) -> None:
declarations = gen_functions_declarations(
native_functions=[
self.custom_1_native_function,
],
kernel_index=self.kernel_index,
selector=SelectiveBuilder.get_nop_selector(),
use_aten_lib=True,
)
self.assertTrue(
"""
namespace custom_1 {
// custom_1::op_1() -> bool
TORCH_API inline bool op_1(torch::executor::KernelRuntimeContext & context) {
return at::op_1();
}
} // namespace custom_1
"""
in declarations
)
def test_aten_lib_method_variant(self) -> None:
declarations = gen_functions_declarations(
native_functions=[
self.custom_3_native_function,
],
kernel_index=self.kernel_index,
selector=SelectiveBuilder.get_nop_selector(),
use_aten_lib=True,
)
self.assertTrue(
"""
namespace custom_3 {
// custom_3::op_3(Tensor(a!) self, Tensor x) -> Tensor(a!)
TORCH_API inline at::Tensor & op_3(torch::executor::KernelRuntimeContext & context, at::Tensor & self, const at::Tensor & x) {
return self.op_3(x);
}
} // namespace custom_3
"""
in declarations
)
class TestComputeCodegenUnboxedKernels(unittest.TestCase):
def setUp(self) -> None:
(
self.native_function_no_kern,
_,
) = NativeFunction.from_yaml(
{
"func": "custom_1::op_1() -> bool",
"dispatch": {"CPU": "unused_kernel_1"},
},
loc=Location(__file__, 1),
valid_tags=set(),
)
self.default_kernel_key = ETKernelKey(default=True)
self.default_backend_metadata = BackendMetadata(
"default_kernel", False, "at::native"
)
self.default_kernel_entry = (
[self.default_kernel_key],
self.default_backend_metadata,
)
def test_codegen_unboxed_specialized(self) -> None:
specialized_kernel_key = ETKernelKey.gen_from_yaml(
{"self": ("T0", "D0"), "other": ("T0", "D0"), "out": ("T0", "D0")},
{"T0": ["Double"]},
{"D0": [0, 1, 2, 3]},
)
selector = SelectiveBuilder.from_yaml_dict(
{
"include_all_operators": True,
"et_kernel_metadata": {
"custom_1::op_1": ["v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3"]
},
}
)
use_aten_lib = False
entry = (
self.native_function_no_kern,
(specialized_kernel_key, self.default_backend_metadata),
)
result = ComputeCodegenUnboxedKernels(
selector, use_aten_lib, add_exception_boundary=False
)(entry)
# Concat used to prevent whitespace stripping
expected_str = (
"""
Kernel(
"custom_1::op_1",
"v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3",
[](torch::executor::KernelRuntimeContext & context, EValue** stack) {
"""
+ """
internal::EventTracerProfileOpScope event_tracer_op_scope(context.internal_event_tracer(), "native_call_op_1");
EXECUTORCH_SCOPE_PROF("native_call_op_1");
bool result_ = at::native::default_kernel(context, );
internal::event_tracer_log_evalue(context.internal_event_tracer(), *stack[0]);
*stack[0] = EValue(result_);
}
),
"""
)
self.assertEqual(expected_str, result)
def test_codegen_unboxed_specialized_not_matching(self) -> None:
specialized_kernel_key = ETKernelKey.gen_from_yaml(
{"self": ("T0", "D0"), "other": ("T0", "D0"), "out": ("T0", "D0")},
{"T0": ["Double"]},
{"D0": [0, 1, 2, 3]},
)
selector = SelectiveBuilder.from_yaml_dict(
{
"include_all_operators": True,
"et_kernel_metadata": {
"custom_1::op_1": ["v1/8;0,1,2,3|7;0,1,2,3|7;0,1,2,3"]
},
}
)
use_aten_lib = False
entry = (
self.native_function_no_kern,
(specialized_kernel_key, self.default_backend_metadata),
)
self.assertRaises(
Exception,
ComputeCodegenUnboxedKernels(
selector, use_aten_lib, add_exception_boundary=False
),
entry,
)
def test_codegen_unboxed_specialized_missing_root_op(self) -> None:
specialized_kernel_key = ETKernelKey.gen_from_yaml(
{"self": ("T0", "D0"), "other": ("T0", "D0"), "out": ("T0", "D0")},
{"T0": ["Double"]},
{"D0": [0, 1, 2, 3]},
)
selector = SelectiveBuilder.from_yaml_dict(
{
"et_kernel_metadata": {
"custom_1::op_1": ["v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3"]
}
}
)
use_aten_lib = False
entry = (
self.native_function_no_kern,
(specialized_kernel_key, self.default_backend_metadata),
)
for add_exception_boundary in (True, False):
result = ComputeCodegenUnboxedKernels(
selector, use_aten_lib, add_exception_boundary
)(entry)
# Concat used to prevent whitespace stripping
expected_str = """"""
self.assertEqual(expected_str, result)
def test_codegen_unboxed_default(self) -> None:
"""
This test checks that if there is no specialized kernel, the default kernel is used.
"""
selector = SelectiveBuilder.from_yaml_dict(
{
"include_all_operators": True,
"et_kernel_metadata": {
"custom_1::op_1": ["v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3"]
},
}
)
use_aten_lib = False
entry = (self.native_function_no_kern, self.default_kernel_entry)
result = ComputeCodegenUnboxedKernels(
selector, use_aten_lib, add_exception_boundary=False
)(entry)
# Concat used to prevent whitespace stripping
expected_str = (
"""
Kernel(
"custom_1::op_1",
[](torch::executor::KernelRuntimeContext & context, EValue** stack) {
"""
+ """
internal::EventTracerProfileOpScope event_tracer_op_scope(context.internal_event_tracer(), "native_call_op_1");
EXECUTORCH_SCOPE_PROF("native_call_op_1");
bool result_ = at::native::default_kernel(context, );
internal::event_tracer_log_evalue(context.internal_event_tracer(), *stack[0]);
*stack[0] = EValue(result_);
}
),
"""
)
self.assertEqual(expected_str, result)
result = ComputeCodegenUnboxedKernels(
selector, use_aten_lib, add_exception_boundary=True
)(entry)
# Concat used to prevent whitespace stripping
expected_str = (
"""
Kernel(
"custom_1::op_1",
[](torch::executor::KernelRuntimeContext & context, EValue** stack) {
"""
+ """
try {
internal::EventTracerProfileOpScope event_tracer_op_scope(context.internal_event_tracer(), "native_call_op_1");
EXECUTORCH_SCOPE_PROF("native_call_op_1");
bool result_ = at::native::default_kernel(context, );
internal::event_tracer_log_evalue(context.internal_event_tracer(), *stack[0]);
*stack[0] = EValue(result_);
} catch (const std::exception& ex) {
ET_LOG(Error, "Kernel threw an exception: %s", ex.what());
context.fail(torch::executor::Error::Internal);
}
}
),
"""
)
self.maxDiff = None
self.assertEqual(expected_str, result)
def test_codegen_unboxed_default_kernel_key_selected(self) -> None:
"""
This test checks that the default kernel is used when there is no specialized kernel and the selector only has the default kernel key.
"""
selector = SelectiveBuilder.from_yaml_dict(
{
"include_all_operators": True,
"et_kernel_metadata": {"custom_1::op_1": ["default"]},
}
)
use_aten_lib = False
entry = (self.native_function_no_kern, self.default_kernel_entry)
result = ComputeCodegenUnboxedKernels(
selector, use_aten_lib, add_exception_boundary=False
)(entry)
# Concat used to prevent whitespace stripping
expected_str = (
"""
Kernel(
"custom_1::op_1",
[](torch::executor::KernelRuntimeContext & context, EValue** stack) {
"""
+ """
internal::EventTracerProfileOpScope event_tracer_op_scope(context.internal_event_tracer(), "native_call_op_1");
EXECUTORCH_SCOPE_PROF("native_call_op_1");
bool result_ = at::native::default_kernel(context, );
internal::event_tracer_log_evalue(context.internal_event_tracer(), *stack[0]);
*stack[0] = EValue(result_);
}
),
"""
)
self.assertEqual(expected_str, result)

View File

@ -0,0 +1,59 @@
import unittest
from torchgen.executorch.api.types import ExecutorchCppSignature
from torchgen.local import parametrize
from torchgen.model import Location, NativeFunction
DEFAULT_NATIVE_FUNCTION, _ = NativeFunction.from_yaml(
{"func": "foo.out(Tensor input, *, Tensor(a!) out) -> Tensor(a!)"},
loc=Location(__file__, 1),
valid_tags=set(),
)
class ExecutorchCppSignatureTest(unittest.TestCase):
def setUp(self) -> None:
self.sig = ExecutorchCppSignature.from_native_function(DEFAULT_NATIVE_FUNCTION)
def test_runtime_signature_contains_runtime_context(self) -> None:
# test if `KernelRuntimeContext` argument exists in `RuntimeSignature`
with parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
):
args = self.sig.arguments(include_context=True)
self.assertEqual(len(args), 3)
self.assertTrue(any(a.name == "context" for a in args))
def test_runtime_signature_does_not_contain_runtime_context(self) -> None:
# test if `KernelRuntimeContext` argument is missing in `RuntimeSignature`
with parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
):
args = self.sig.arguments(include_context=False)
self.assertEqual(len(args), 2)
self.assertFalse(any(a.name == "context" for a in args))
def test_runtime_signature_declaration_correct(self) -> None:
with parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
):
decl = self.sig.decl(include_context=True)
self.assertEqual(
decl,
(
"torch::executor::Tensor & foo_outf("
"torch::executor::KernelRuntimeContext & context, "
"const torch::executor::Tensor & input, "
"torch::executor::Tensor & out)"
),
)
no_context_decl = self.sig.decl(include_context=False)
self.assertEqual(
no_context_decl,
(
"torch::executor::Tensor & foo_outf("
"const torch::executor::Tensor & input, "
"torch::executor::Tensor & out)"
),
)

View File

@ -0,0 +1,114 @@
import unittest
from torchgen import local
from torchgen.api.types import (
BaseCType,
boolT,
ConstRefCType,
CType,
longT,
MutRefCType,
NamedCType,
OptionalCType,
TupleCType,
VectorCType,
voidT,
)
from torchgen.executorch.api.et_cpp import argument_type, return_type, returns_type
from torchgen.executorch.api.types import ArrayRefCType, scalarT, tensorListT, tensorT
from torchgen.model import Argument, FunctionSchema, Return
class ExecutorchCppTest(unittest.TestCase):
"""
Test torchgen.executorch.api.cpp
"""
def _test_argumenttype_type(self, arg_str: str, expected: NamedCType) -> None:
arg = Argument.parse(arg_str)
self.assertEqual(str(argument_type(arg, binds=arg.name)), str(expected))
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def test_argumenttype_type(self) -> None:
data = [
("Tensor self", NamedCType("self", ConstRefCType(BaseCType(tensorT)))),
("Tensor(a!) out", NamedCType("out", MutRefCType(BaseCType(tensorT)))),
(
"Tensor? opt",
NamedCType("opt", ConstRefCType(OptionalCType(BaseCType(tensorT)))),
),
("Scalar scalar", NamedCType("scalar", ConstRefCType(BaseCType(scalarT)))),
(
"Scalar? scalar",
NamedCType("scalar", ConstRefCType(OptionalCType(BaseCType(scalarT)))),
),
("int[] size", NamedCType("size", ArrayRefCType(BaseCType(longT)))),
("int? dim", NamedCType("dim", OptionalCType(BaseCType(longT)))),
("Tensor[] weight", NamedCType("weight", BaseCType(tensorListT))),
(
"Scalar[] spacing",
NamedCType("spacing", ArrayRefCType(ConstRefCType(BaseCType(scalarT)))),
),
(
"Tensor?[] weight",
NamedCType("weight", ArrayRefCType(OptionalCType(BaseCType(tensorT)))),
),
(
"SymInt[]? output_size",
NamedCType(
"output_size", OptionalCType(ArrayRefCType(BaseCType(longT)))
),
),
(
"int[]? dims",
NamedCType("dims", OptionalCType(ArrayRefCType(BaseCType(longT)))),
),
(
"bool[3] output_mask",
NamedCType("output_mask", ArrayRefCType(BaseCType(boolT))),
),
]
for d in data:
self._test_argumenttype_type(*d)
def _test_returntype_type(self, ret_str: str, expected: CType) -> None:
ret = Return.parse(ret_str)
self.assertEqual(str(return_type(ret)), str(expected))
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def test_returntype_type(self) -> None:
data = [
("Tensor", BaseCType(tensorT)),
("Tensor(a!)", MutRefCType(BaseCType(tensorT))),
("Tensor[]", VectorCType(BaseCType(tensorT))),
]
for d in data:
self._test_returntype_type(*d)
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def test_returns_type(self) -> None:
func = FunctionSchema.parse(
"min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"
)
expected = TupleCType([BaseCType(tensorT), BaseCType(tensorT)])
self.assertEqual(str(returns_type(func.returns)), str(expected))
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def test_void_return_type(self) -> None:
func = FunctionSchema.parse(
"_foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()"
)
expected = BaseCType(voidT)
self.assertEqual(str(returns_type(func.returns)), str(expected))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,176 @@
import unittest
from types import ModuleType
from torchgen import local
from torchgen.api import cpp as aten_cpp, types as aten_types
from torchgen.api.types import (
ArgName,
BaseCType,
ConstRefCType,
MutRefCType,
NamedCType,
)
from torchgen.executorch.api import et_cpp as et_cpp, types as et_types
from torchgen.executorch.api.unboxing import Unboxing
from torchgen.model import BaseTy, BaseType, ListType, OptionalType, Type
def aten_argumenttype_type_wrapper(
t: Type, *, mutable: bool, binds: ArgName, remove_non_owning_ref_types: bool = False
) -> NamedCType:
return aten_cpp.argumenttype_type(
t,
mutable=mutable,
binds=binds,
remove_non_owning_ref_types=remove_non_owning_ref_types,
)
ATEN_UNBOXING = Unboxing(argument_type_gen=aten_argumenttype_type_wrapper)
ET_UNBOXING = Unboxing(argument_type_gen=et_cpp.argumenttype_type)
class TestUnboxing(unittest.TestCase):
"""
We could use torch.testing._internal.common_utils to reduce boilerplate, but
the GH CI job doesn't build torch before running the tools unit tests, hence
these manually parametrized tests.
"""
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def test_symint_argument_translate_ctype_aten(self) -> None:
# test that a `SymInt[]` JIT argument is translated into the correct C++ argument.
# It should be `IntArrayRef`, because Executorch doesn't use the SymInt signature.
# pyre-fixme[16]: `enum.Enum` has no attribute `SymInt`
# pyre-fixme[19]: Call `BaseType.__init__` expects 0 positional arguments, 1 was provided.
symint_list_type = ListType(elem=BaseType(BaseTy.SymInt), size=None)
out_name, ctype, _, _ = ATEN_UNBOXING.argumenttype_evalue_convert(
t=symint_list_type, arg_name="size", mutable=False
)
self.assertEqual(out_name, "size_list_out")
self.assertIsInstance(ctype, BaseCType)
# pyre-fixme[16]:
self.assertEqual(ctype, aten_types.BaseCType(aten_types.intArrayRefT))
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def test_symint_argument_translate_ctype_executorch(self) -> None:
# test that a `SymInt[]` JIT argument is translated into the correct C++ argument.
# It should be an `ArrayRef` of int64_t, because Executorch doesn't use the SymInt signature.
# pyre-fixme[16]: `enum.Enum` has no attribute `SymInt`
# pyre-fixme[19]: Call `BaseType.__init__` expects 0 positional arguments, 1 was provided.
symint_list_type = ListType(elem=BaseType(BaseTy.SymInt), size=None)
out_name, ctype, _, _ = ET_UNBOXING.argumenttype_evalue_convert(
t=symint_list_type, arg_name="size", mutable=False
)
self.assertEqual(out_name, "size_list_out")
self.assertIsInstance(ctype, et_types.ArrayRefCType)
# pyre-fixme[16]:
self.assertEqual(
ctype, et_types.ArrayRefCType(elem=BaseCType(aten_types.longT))
)
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def _test_const_tensor_argument_translate_ctype(
self, unboxing: Unboxing, types: ModuleType
) -> None:
# pyre-fixme[16]: `enum.Enum` has no attribute `Tensor`
# pyre-fixme[19]: Call `BaseType.__init__` expects 0 positional arguments, 1 was provided.
tensor_type = BaseType(BaseTy.Tensor)
out_name, ctype, _, _ = unboxing.argumenttype_evalue_convert(
t=tensor_type, arg_name="self", mutable=False
)
self.assertEqual(out_name, "self_base")
# pyre-fixme[16]:
self.assertEqual(ctype, ConstRefCType(BaseCType(types.tensorT)))
def test_const_tensor_argument_translate_ctype_aten(self) -> None:
self._test_const_tensor_argument_translate_ctype(ATEN_UNBOXING, aten_types)
def test_const_tensor_argument_translate_ctype_executorch(self) -> None:
self._test_const_tensor_argument_translate_ctype(ET_UNBOXING, et_types)
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def _test_mutable_tensor_argument_translate_ctype(
self, unboxing: Unboxing, types: ModuleType
) -> None:
# pyre-fixme[16]: `enum.Enum` has no attribute `Tensor`
# pyre-fixme[19]: Call `BaseType.__init__` expects 0 positional arguments, 1 was provided.
tensor_type = BaseType(BaseTy.Tensor)
out_name, ctype, _, _ = unboxing.argumenttype_evalue_convert(
t=tensor_type, arg_name="out", mutable=True
)
self.assertEqual(out_name, "out_base")
# pyre-fixme[16]:
self.assertEqual(ctype, MutRefCType(BaseCType(types.tensorT)))
def test_mutable_tensor_argument_translate_ctype_aten(self) -> None:
self._test_mutable_tensor_argument_translate_ctype(ATEN_UNBOXING, aten_types)
def test_mutable_tensor_argument_translate_ctype_executorch(self) -> None:
self._test_mutable_tensor_argument_translate_ctype(ET_UNBOXING, et_types)
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def _test_tensor_list_argument_translate_ctype(
self, unboxing: Unboxing, types: ModuleType
) -> None:
# pyre-fixme[16]: `enum.Enum` has no attribute `Tensor`
# pyre-fixme[19]: Call `BaseType.__init__` expects 0 positional arguments, 1 was provided.
tensor_list_type = ListType(elem=BaseType(BaseTy.Tensor), size=None)
out_name, ctype, _, _ = unboxing.argumenttype_evalue_convert(
t=tensor_list_type, arg_name="out", mutable=True
)
self.assertEqual(out_name, "out_list_out")
# pyre-fixme[16]:
self.assertEqual(ctype, BaseCType(types.tensorListT))
def test_tensor_list_argument_translate_ctype_aten(self) -> None:
self._test_tensor_list_argument_translate_ctype(ATEN_UNBOXING, aten_types)
def test_tensor_list_argument_translate_ctype_executorch(self) -> None:
self._test_tensor_list_argument_translate_ctype(ET_UNBOXING, et_types)
@local.parametrize(
use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
)
def _test_optional_int_argument_translate_ctype(
self, unboxing: Unboxing, types: ModuleType
) -> None:
# pyre-fixme[16]: `enum.Enum` has no attribute `Tensor`
# pyre-fixme[19]: Call `BaseType.__init__` expects 0 positional arguments, 1 was provided.
optional_int_type = OptionalType(elem=BaseType(BaseTy.int))
out_name, ctype, _, _ = unboxing.argumenttype_evalue_convert(
t=optional_int_type, arg_name="something", mutable=True
)
self.assertEqual(out_name, "something_opt_out")
# pyre-fixme[16]:
self.assertEqual(ctype, types.OptionalCType(BaseCType(types.longT)))
def test_optional_int_argument_translate_ctype_aten(self) -> None:
self._test_optional_int_argument_translate_ctype(ATEN_UNBOXING, aten_types)
def test_optional_int_argument_translate_ctype_executorch(self) -> None:
self._test_optional_int_argument_translate_ctype(ET_UNBOXING, et_types)

View File

@ -298,3 +298,45 @@ operators:
valid_tags=set(),
)
self.assertTrue(selector.is_native_function_selected(native_function))
class TestExecuTorchSelectiveBuild(unittest.TestCase):
def test_et_kernel_selected(self) -> None:
yaml_config = """
et_kernel_metadata:
aten::add.out:
- "v1/6;0,1|6;0,1|6;0,1|6;0,1"
aten::sub.out:
- "v1/6;0,1|6;0,1|6;0,1|6;0,1"
"""
selector = SelectiveBuilder.from_yaml_str(yaml_config)
self.assertListEqual(
["v1/6;0,1|6;0,1|6;0,1|6;0,1"],
selector.et_get_selected_kernels(
"aten::add.out",
[
"v1/6;0,1|6;0,1|6;0,1|6;0,1",
"v1/3;0,1|3;0,1|3;0,1|3;0,1",
"v1/6;1,0|6;0,1|6;0,1|6;0,1",
],
),
)
self.assertListEqual(
["v1/6;0,1|6;0,1|6;0,1|6;0,1"],
selector.et_get_selected_kernels(
"aten::sub.out", ["v1/6;0,1|6;0,1|6;0,1|6;0,1"]
),
)
self.assertListEqual(
[],
selector.et_get_selected_kernels(
"aten::mul.out", ["v1/6;0,1|6;0,1|6;0,1|6;0,1"]
),
)
# We don't use version for now.
self.assertListEqual(
["v2/6;0,1|6;0,1|6;0,1|6;0,1"],
selector.et_get_selected_kernels(
"aten::add.out", ["v2/6;0,1|6;0,1|6;0,1|6;0,1"]
),
)
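# Note on the kernel key strings above: per ETKernelKey.to_native_string in
# torchgen/executorch/model.py (added later in this diff), a key has the form
# "v<version>/<dtype>;<dim_order>|..." with one "<dtype>;<dim_order>" entry per
# tensor argument. In "v1/6;0,1|6;0,1|6;0,1|6;0,1", 6 is ScalarType.Float and
# "0,1" is the dimension order, i.e. four Float tensors in contiguous dim order.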

View File

@ -18,3 +18,13 @@ def define_targets(rules):
rules.requirement("typing-extensions"),
],
)
rules.py_binary(
name = "gen_executorch",
srcs = [":torchgen"],
visibility = ["//visibility:public"],
deps = [
rules.requirement("PyYAML"),
rules.requirement("typing-extensions"),
],
)

View File

View File

View File

@ -0,0 +1,151 @@
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING
from torchgen import dest
# disable import sorting to avoid circular dependency.
from torchgen.api.types import DispatcherSignature # usort: skip
from torchgen.context import method_with_native_function
from torchgen.model import BaseTy, BaseType, DispatchKey, NativeFunction, Variant
from torchgen.utils import concatMap, Target
if TYPE_CHECKING:
from collections.abc import Sequence
from torchgen.executorch.model import ETKernelIndex
from torchgen.selective_build.selector import SelectiveBuilder
# Generates RegisterKernelStub.cpp, which provides placeholder kernels for custom operators. This will be used on
# the model authoring side.
@dataclass(frozen=True)
class ComputeNativeFunctionStub:
@method_with_native_function
def __call__(self, f: NativeFunction) -> str | None:
if Variant.function not in f.variants:
return None
sig = DispatcherSignature.from_schema(
f.func, prefix=f"wrapper_CPU_{f.func.name.overload_name}_", symint=False
)
assert sig is not None
if len(f.func.returns) == 0:
ret_name = ""
elif len(f.func.returns) == 1:
if f.func.arguments.out:
ret_name = f.func.arguments.out[0].name
else:
ret_name = next(
(
a.name
for a in f.func.arguments.flat_non_out
if a.type == f.func.returns[0].type
),
"",
)
if not ret_name:
# if return type is tensor
if f.func.returns[0].type == BaseType(BaseTy.Tensor):
# Returns an empty tensor
ret_name = "at::Tensor()"
else:
raise Exception( # noqa: TRY002
f"Can't handle this return type {f.func}"
)
elif len(f.func.arguments.out) == len(f.func.returns):
# Returns a tuple of out arguments
tensor_type = "at::Tensor &"
comma = ", "
ret_name = f"""::std::tuple<{comma.join([tensor_type] * len(f.func.returns))}>(
{comma.join([r.name for r in f.func.arguments.out])}
)"""
else:
assert all(a.type == BaseType(BaseTy.Tensor) for a in f.func.returns), (
f"Only support tensor returns but got {f.func.returns}"
)
# Returns a tuple of empty tensors
tensor_type = "at::Tensor"
comma = ", "
ret_name = f"""::std::tuple<{comma.join([tensor_type] * len(f.func.returns))}>(
{comma.join(["at::Tensor()" for _ in f.func.returns])}
)"""
ret_str = f"return {ret_name};" if len(f.func.returns) > 0 else ""
return f"""
{sig.defn()} {{
{ret_str}
}}
"""
def gen_custom_ops_registration(
*,
native_functions: Sequence[NativeFunction],
selector: SelectiveBuilder,
kernel_index: ETKernelIndex,
rocm: bool,
) -> tuple[str, str]:
"""
Generate custom ops registration code for dest.RegisterDispatchKey.
:param native_functions: a sequence of `NativeFunction`
:param selector: for selective build.
:param kernel_index: kernels for all the ops.
:param rocm: bool for dest.RegisterDispatchKey.
:return: generated C++ code to register custom operators into PyTorch
"""
# convert kernel index to BackendIndex. This is because we can't handle ETKernelIndex yet.
# TODO larryliu: evaluate if this code is still needed. If yes let it handle ETKernelIndex.
dispatch_key = DispatchKey.CPU
backend_index = kernel_index._to_backend_index()
static_init_dispatch_registrations = ""
ns_grouped_native_functions: dict[str, list[NativeFunction]] = defaultdict(list)
for native_function in native_functions:
ns_grouped_native_functions[native_function.namespace].append(native_function)
for namespace, functions in ns_grouped_native_functions.items():
if len(functions) == 0:
continue
dispatch_registrations_body = "\n".join(
list(
concatMap(
dest.RegisterDispatchKey(
backend_index,
Target.REGISTRATION,
selector,
rocm=rocm,
symint=False,
class_method_name=None,
skip_dispatcher_op_registration=False,
),
functions,
)
)
)
static_init_dispatch_registrations += f"""
TORCH_LIBRARY_IMPL({namespace}, {dispatch_key}, m) {{
{dispatch_registrations_body}
}}"""
anonymous_definition = "\n".join(
list(
concatMap(
dest.RegisterDispatchKey(
backend_index,
Target.ANONYMOUS_DEFINITION,
selector,
rocm=rocm,
symint=False,
class_method_name=None,
skip_dispatcher_op_registration=False,
),
native_functions,
)
)
)
return anonymous_definition, static_init_dispatch_registrations
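# Shape of the generated registration code (a sketch; the m.impl lines are produced
# by dest.RegisterDispatchKey at Target.REGISTRATION, and the exact wrapper names
# vary): for a namespace "custom_1" this yields something like
#
# TORCH_LIBRARY_IMPL(custom_1, CPU, m) {
# m.impl("op_1", TORCH_FN(wrapper_CPU__op_1));
# }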

View File

@ -0,0 +1,367 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from typing_extensions import assert_never
from torchgen import local
from torchgen.api.types import (
ArgName,
BaseCType,
Binding,
ConstRefCType,
CType,
MutRefCType,
NamedCType,
SpecialArgName,
TupleCType,
VectorCType,
voidT,
)
from torchgen.executorch.api.types import (
ArrayRefCType,
BaseTypeToCppMapping,
OptionalCType,
scalarT,
tensorListT,
tensorT,
)
from torchgen.model import (
Argument,
Arguments,
BaseTy,
BaseType,
ListType,
NativeFunction,
OptionalType,
Return,
SelfArgument,
TensorOptionsArguments,
Type,
)
if TYPE_CHECKING:
from collections.abc import Sequence
"""
This file describes the translation of JIT schema to the public C++ API, which is what people use when they call
functions like at::add. It also serves as a native function API, which is the signature of kernels,
since in Executorch CppSignature is the same as NativeSignature.
Differences between this file and torchgen.api.cpp:
- Executorch doesn't support TensorOptions; however, we keep that logic here to stay compatible with
torchgen.api.cpp, so that we can support features like ATen mode (running ATen kernels in Executorch).
- Executorch doesn't support Dimname.
- Executorch runtime doesn't support SymInt; it is treated as int.
"""
# Translation of "value types" in JIT schema to C++ API type. Value
# types look the same no matter if they are argument types or return
# types. Returns None if the type in question is not a value type.
def valuetype_type(
t: Type,
*,
binds: ArgName,
) -> NamedCType | None:
if isinstance(t, BaseType):
if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar:
return None
# For SymInt we simply treat it as int.
elif str(t) == "SymInt":
return NamedCType(binds, BaseCType(BaseTypeToCppMapping[BaseTy.int]))
# All other BaseType currently map directly to BaseCppTypes.
return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name]))
elif isinstance(t, OptionalType):
elem = valuetype_type(t.elem, binds=binds)
if elem is None:
return None
return NamedCType(binds, OptionalCType(elem.type))
elif isinstance(t, ListType):
if str(t.elem) == "bool":
assert t.size is not None
return NamedCType(
binds, ArrayRefCType(BaseCType(BaseTypeToCppMapping[BaseTy.bool]))
)
else:
return None
else:
raise AssertionError(f"unrecognized type {repr(t)}")
# Translation of types occurring in JIT arguments to a C++ argument type.
# If remove_non_owning_ref_types is set, we'll guarantee that the output CType is not a non-owning reference type.
# For example, we'll return std::vector<int> instead of IntArrayRef.
# See Note [translation from C++ reference to value types]
def argumenttype_type(
t: Type,
*,
mutable: bool,
binds: ArgName,
remove_non_owning_ref_types: bool = False,
) -> NamedCType:
# If it's a value type, do the value type translation
r = valuetype_type(
t,
binds=binds,
)
if r is not None:
return r
if isinstance(t, BaseType):
if t.name == BaseTy.Tensor:
if mutable and not local.use_const_ref_for_mutable_tensors():
return NamedCType(binds, MutRefCType(BaseCType(tensorT)))
else:
return NamedCType(binds, ConstRefCType(BaseCType(tensorT)))
elif t.name == BaseTy.Scalar:
return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
else:
raise AssertionError(f"base type should have been value type {t}")
elif isinstance(t, OptionalType):
if str(t.elem) == "Tensor":
if mutable and not local.use_const_ref_for_mutable_tensors():
return NamedCType(
binds, MutRefCType(BaseCType(tensorT))
) # TODO: fix this discrepancy
else:
return NamedCType(
binds, ConstRefCType(OptionalCType(BaseCType(tensorT)))
)
elif str(t.elem) == "Scalar":
return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT))))
elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
return NamedCType(binds, OptionalCType(elem.type))
elif isinstance(t, ListType):
# TODO: keeping these special cases for Tensor[] and Tensor?[] so that we can hookup with ATen kernels.
if str(t.elem) == "Tensor":
return NamedCType(binds, BaseCType(tensorListT))
elif str(t.elem) == "Dimname":
raise NotImplementedError("Executorch doesn't support Dimname")
elif str(t.elem) == "Tensor?":
return NamedCType(binds, ArrayRefCType(OptionalCType(BaseCType(tensorT))))
elem = argumenttype_type(t.elem, mutable=mutable, binds=binds)
return NamedCType(binds, ArrayRefCType(elem.type))
else:
raise AssertionError(f"unrecognized type {repr(t)}")
# Translate a JIT argument into its C++ type
def argument_type(a: Argument, *, binds: ArgName) -> NamedCType:
return argumenttype_type(a.type, mutable=a.is_write, binds=binds)
# Translation of a (non-multi) return type from JIT to C++
# N.B: returntype_type returns a CType, not a NamedCType.
# This is mostly because of the mismatch between return types and return names.
# e.g. a function with a return type of 'void' has 0 return names,
# and a function with a return type of 'std::tuple' has >1 return name.
def returntype_type(t: Type, *, mutable: bool) -> CType:
# placeholder is ignored
r = valuetype_type(t, binds="__placeholder__")
if r is not None:
return r.type
if isinstance(t, BaseType):
if t.name == BaseTy.Tensor:
if mutable:
if local.use_const_ref_for_mutable_tensors():
return ConstRefCType(BaseCType(tensorT))
else:
return MutRefCType(BaseCType(tensorT))
else:
# Note [Tensor Copy Returns]
# Currently, we use "Argument.is_write" to determine
# whether or not Tensor return types should be copies or references.
# If that ever changes, take a look at other locations of this note!
return BaseCType(tensorT)
elif t.name == BaseTy.Scalar:
return BaseCType(scalarT)
elif isinstance(t, ListType):
assert not mutable, (
"Native functions should never return a mutable tensor list. They should return void."
)
elem = returntype_type(t.elem, mutable=False)
assert t.size is None, f"fixed size list returns not supported: {t}"
return VectorCType(elem)
raise AssertionError(f"unrecognized return type {t}")
# Translation of a single return to its C++ type
def return_type(r: Return) -> CType:
return returntype_type(r.type, mutable=r.is_write)
# Translation of a full (possibly multi) return from JIT to its C++ type
def returns_type(rs: Sequence[Return]) -> CType:
if len(rs) == 0:
return BaseCType(voidT)
elif len(rs) == 1:
return return_type(rs[0])
else:
return TupleCType([return_type(r) for r in rs])
def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequence[str]:
returns: list[str] = []
for i, r in enumerate(f.func.returns):
# If we have an inplace function, the return argument is
# implicitly named self.
# TODO: Consider incorporating this into the data model
if f.func.name.name.inplace:
assert i == 0, "illegal inplace function with multiple returns"
name = "self"
# If we are out function, the name is the name of the
# corresponding output function (r.name will get recorded
# in field_name later.)
elif f.func.is_out_fn():
name = f.func.arguments.out[i].name
# If the return argument is explicitly named...
elif r.name:
name_conflict = any(
r.name == a.name for a in f.func.schema_order_arguments()
)
if name_conflict and not f.func.is_out_fn():
name = f"{r.name}_return"
else:
name = r.name
# If there is no explicit name and no fallback name was passed in, we just name the output result,
# unless it's a multi-return, in which case it's result0,
# result1, etc (zero-indexed)
else:
name = fallback_name if len(f.func.returns) == 1 else f"{fallback_name}{i}"
returns.append(name)
return returns
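# Illustration of the rules above: for an out variant such as
# "foo.out(Tensor input, *, Tensor(a!) out) -> Tensor(a!)", return_names yields
# ["out"] (the name of the out argument); for
# "min.dim(...) -> (Tensor values, Tensor indices)" it yields the explicit
# return names ["values", "indices"].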
JIT_TO_CPP_DEFAULT = {
"False": "false",
"True": "true",
"None": "torch::execustd::nullopt", # UGH this one is type directed
"[]": "{}",
"contiguous_format": "torch::executorch::MemoryFormat::Contiguous",
"long": "torch::executorch::kLong",
}
# Convert a JIT default into C++ expression representing the default
def default_expr(d: str, t: Type) -> str:
if d == "None" and str(t) == "Tensor?":
return "{}"
if isinstance(t, BaseType) and t.name is BaseTy.str:
# Schema allows single quotes but C++ needs double
if len(d) >= 2 and d[0] == "'" and d[-1] == "'":
s = ""
i = 1
while i + 1 < len(d):
if d[i] != "\\":
if d[i] == '"':
s += '\\"'
else:
s += d[i]
i += 1
else:
if d[i + 1] == "'":
s += "'"
else:
s += d[i : i + 2]
i += 2
return f'"{s}"'
if isinstance(t, OptionalType):
if d == "None":
return "torch::executor::nullopt"
return default_expr(d, t.elem)
if isinstance(t, ListType):
if d.startswith("[") and d.endswith("]"):
return "{" + d[1:-1] + "}"
elif t.size is None:
# NOTE: Sized lists can have scalar defaults
raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
return JIT_TO_CPP_DEFAULT.get(d, d)
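# A few concrete translations, following the rules above:
# default_expr("True", bool) -> "true"
# default_expr("None", Tensor?) -> "{}"
# default_expr("None", int?) -> "torch::executor::nullopt"
# default_expr("[0, 1]", int[]) -> "{0, 1}"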
# Convert an argument into its C++ API form
def argument(
a: Argument | TensorOptionsArguments | SelfArgument,
*,
cpp_no_default_args: set[str],
method: bool,
faithful: bool,
has_tensor_options: bool,
) -> list[Binding]:
def sub_argument(
a: Argument | TensorOptionsArguments | SelfArgument,
) -> list[Binding]:
return argument(
a,
cpp_no_default_args=cpp_no_default_args,
method=method,
faithful=faithful,
has_tensor_options=has_tensor_options,
)
if isinstance(a, Argument):
binds: ArgName
if a.name == "memory_format" and has_tensor_options:
binds = SpecialArgName.possibly_redundant_memory_format
else:
binds = a.name
default: str | None = None
if a.name not in cpp_no_default_args and a.default is not None:
default = default_expr(a.default, a.type)
return [
Binding(
nctype=argument_type(a, binds=binds),
name=a.name,
default=default,
argument=a,
)
]
elif isinstance(a, TensorOptionsArguments):
raise NotImplementedError("Need to implement type resolution for TensorOptions")
elif isinstance(a, SelfArgument):
if method:
# Caller is responsible for installing implicit this in context!
return []
else:
return sub_argument(a.argument)
else:
assert_never(a)
def arguments(
arguments: Arguments,
*,
faithful: bool,
method: bool,
cpp_no_default_args: set[str],
) -> list[Binding]:
args: list[Argument | TensorOptionsArguments | SelfArgument] = []
if faithful:
args.extend(arguments.non_out)
args.extend(arguments.out)
else:
args.extend(arguments.out)
args.extend(arguments.non_out)
return [
r.no_default() if faithful else r
for a in args
for r in argument(
a,
faithful=faithful,
method=method,
has_tensor_options=arguments.tensor_options is not None,
cpp_no_default_args=cpp_no_default_args,
)
]

View File

@ -0,0 +1,4 @@
from torchgen.executorch.api.types.types import *
from torchgen.executorch.api.types.signatures import * # usort: skip

View File

@ -0,0 +1,76 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
import torchgen.api.cpp as aten_cpp
from torchgen.executorch.api.types.types import contextArg
if TYPE_CHECKING:
from torchgen.api.types import Binding, CType
from torchgen.model import FunctionSchema, NativeFunction
@dataclass(frozen=True)
class ExecutorchCppSignature:
"""
This signature is merely a CppSignature with Executorch types (optionally
contains KernelRuntimeContext as well). The inline definition of
CppSignature is generated in Functions.h and it's used by unboxing
functions.
"""
# The schema this signature is derived from
func: FunctionSchema
# The set of C++ arguments which should not have defaults applied to them
cpp_no_default_args: set[str]
# Allows you to prepend an arbitrary prefix to the signature name.
# This is useful for parts of the codegen that generate wrappers around kernels,
# and need to avoid naming collisions.
prefix: str = ""
def arguments(self, *, include_context: bool = True) -> list[Binding]:
return ([contextArg] if include_context else []) + et_cpp.arguments(
self.func.arguments,
faithful=True, # always faithful, out argument at the end
method=False, # method not supported
cpp_no_default_args=self.cpp_no_default_args,
)
def name(self) -> str:
return self.prefix + aten_cpp.name(
self.func,
faithful_name_for_out_overloads=True,
)
def decl(self, name: str | None = None, *, include_context: bool = True) -> str:
args_str = ", ".join(
a.decl() for a in self.arguments(include_context=include_context)
)
if name is None:
name = self.name()
return f"{self.returns_type().cpp_type()} {name}({args_str})"
def defn(self, name: str | None = None) -> str:
args = [a.defn() for a in self.arguments()]
args_str = ", ".join(args)
if name is None:
name = self.name()
return f"{self.returns_type().cpp_type()} {name}({args_str})"
def returns_type(self) -> CType:
return et_cpp.returns_type(self.func.returns)
@staticmethod
def from_native_function(
f: NativeFunction, *, prefix: str = ""
) -> ExecutorchCppSignature:
return ExecutorchCppSignature(
func=f.func, prefix=prefix, cpp_no_default_args=f.cpp_no_default_args
)
# This import is deliberately placed at the bottom of the file to avoid a circular
# dependency: et_cpp imports torchgen.executorch.api.types, whose __init__ imports
# this module.
from torchgen.executorch.api import et_cpp
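# Usage sketch, matching the signature unit test earlier in this diff: for
# "foo.out(Tensor input, *, Tensor(a!) out) -> Tensor(a!)",
# ExecutorchCppSignature.from_native_function(f).decl() renders
# "torch::executor::Tensor & foo_outf(torch::executor::KernelRuntimeContext & context, const torch::executor::Tensor & input, torch::executor::Tensor & out)".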

View File

@ -0,0 +1,77 @@
from __future__ import annotations
from dataclasses import dataclass
from torchgen.api.types import (
BaseCppType,
BaseCType,
Binding,
boolT,
CType,
doubleT,
Expr,
longT,
MutRefCType,
NamedCType,
)
from torchgen.model import BaseTy
halfT = BaseCppType("torch::executor", "Half")
bfloat16T = BaseCppType("torch::executor", "BFloat16")
stringT = BaseCppType("torch::executor", "string_view")
scalarTypeT = BaseCppType("torch::executor", "ScalarType")
tensorT = BaseCppType("torch::executor", "Tensor")
tensorListT = BaseCppType("torch::executor", "TensorList")
scalarT = BaseCppType("torch::executor", "Scalar")
memoryFormatT = BaseCppType("torch::executor", "MemoryFormat")
intArrayRefT = BaseCppType("torch::executor", "IntArrayRef")
optionalT = BaseCppType("torch::executor", "optional")
contextT = BaseCppType("torch::executor", "KernelRuntimeContext")
contextExpr = Expr(
expr="context",
type=NamedCType(name="context", type=MutRefCType(BaseCType(contextT))),
)
contextArg = Binding(
name="context",
nctype=contextExpr.type,
argument=None, # type: ignore[arg-type]
default=None,
)
BaseTypeToCppMapping: dict[BaseTy, BaseCppType] = {
BaseTy.int: longT,
BaseTy.float: doubleT,
BaseTy.bool: boolT,
BaseTy.str: stringT,
BaseTy.ScalarType: scalarTypeT,
BaseTy.Tensor: tensorT,
BaseTy.Scalar: scalarT,
BaseTy.MemoryFormat: memoryFormatT,
}
@dataclass(frozen=True)
class OptionalCType(CType):
elem: CType
def cpp_type(self, *, strip_ref: bool = False) -> str:
# Do not pass `strip_ref` recursively.
return f"torch::executor::optional<{self.elem.cpp_type()}>"
def remove_const_ref(self) -> CType:
return OptionalCType(self.elem.remove_const_ref())
@dataclass(frozen=True)
class ArrayRefCType(CType):
elem: CType
def cpp_type(self, *, strip_ref: bool = False) -> str:
# Do not pass `strip_ref` recursively.
return f"torch::executor::ArrayRef<{self.elem.cpp_type()}>"
def remove_const_ref(self) -> CType:
return ArrayRefCType(self.elem.remove_const_ref())
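# Examples of the rendered C++ types (assuming longT renders as int64_t, as in
# torchgen.api.types): OptionalCType(BaseCType(tensorT)).cpp_type() gives
# "torch::executor::optional<torch::executor::Tensor>", and
# ArrayRefCType(BaseCType(longT)).cpp_type() gives "torch::executor::ArrayRef<int64_t>".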

View File

@ -0,0 +1,218 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable, TYPE_CHECKING
from torchgen.model import (
Argument,
BaseTy,
BaseType,
ListType,
NativeFunction,
OptionalType,
Type,
)
if TYPE_CHECKING:
from collections.abc import Sequence
from torchgen.api.types import Binding, CType, NamedCType
connector = "\n\t"
# Return unboxing function name for a NativeFunction
def name(f: NativeFunction) -> str:
return f.func.name.unambiguous_name()
@dataclass(frozen=True)
class Unboxing:
"""
Takes a sequence of Bindings, unboxes EValues into them, and returns generated code that performs the unboxing.
A sample generated code:
// aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
void mul_out(EValue** stack) {
EValue& self = *stack[0];
EValue& other = *stack[1];
EValue& out = *stack[2];
const torch::executor::Tensor & self_base = self.to<torch::executor::Tensor>();
const torch::executor::Tensor & other_base = other.to<torch::executor::Tensor>();
torch::executor::Tensor & out_base = out.to<torch::executor::Tensor>();
EXECUTORCH_SCOPE_PROF("native_call_mul.out");
torch::executor::mul_outf(self_base, other_base, out_base);
}
"""
# this is a callable that converts a JIT argument, into its C++ type.
# Translates (type, mutability, binds) to NamedCType. E.g., torchgen.api.cpp.argumenttype_type.
argument_type_gen: Callable[
...,
NamedCType,
]
# Convert all the arguments in a NativeFunction to C++ code
def convert_arguments(
self, args: Sequence[Binding]
) -> tuple[list[Binding], list[str]]:
code_list = [f"EValue& {args[i].name} = *stack[{i}];" for i in range(len(args))]
binding_list = []
for arg in args:
# expecting only Argument
if not isinstance(arg.argument, Argument):
raise Exception( # noqa: TRY002
f"Unexpected argument type, expecting `Argument` but got {arg}"
)
argument: Argument = arg.argument
unboxed_name, _, code, decl = self.argumenttype_evalue_convert(
argument.type, argument.name, mutable=argument.is_write
)
code_list.extend(decl)
code_list.extend(code)
binding_list.append(arg.with_name(unboxed_name))
return binding_list, code_list
def argumenttype_evalue_convert(
self, t: Type, arg_name: str, *, mutable: bool = False
) -> tuple[str, CType, list[str], list[str]]:
"""
Takes in the type, name and mutability corresponding to an argument, and generates a tuple of:
(1) the name of the newly created unboxed variable
(2) its CType
(3) the C++ code necessary to unbox the argument
(4) any declarations that must be hoisted out of the enclosing scope
:param t: a `Type` of an argument
:param arg_name: argument name
:param mutable: boolean for whether this argument type is mutable
:return: the unboxed result as (out_name, ctype, code, decl)
"""
ctype = self.argument_type_gen(t, mutable=mutable, binds=arg_name).type
if isinstance(t, BaseType):
out_name = f"{arg_name}_base"
code, decl = self._gen_code_base_type(
arg_name=arg_name, out_name=out_name, ctype=ctype
)
elif isinstance(t, OptionalType):
out_name = f"{arg_name}_opt_out"
code, decl = self._gen_code_optional_type(
arg_name=arg_name, out_name=out_name, t=t, ctype=ctype
)
elif isinstance(t, ListType):
out_name = f"{arg_name}_list_out"
code, decl = self._gen_code_list_type(
arg_name=arg_name, out_name=out_name, t=t, ctype=ctype
)
else:
raise Exception( # noqa: TRY002
f"Cannot handle type {t}. arg_name: {arg_name}"
)
return out_name, ctype, code, decl
def _gen_code_base_type(
self, arg_name: str, out_name: str, ctype: CType
) -> tuple[list[str], list[str]]:
return [
f"{ctype.cpp_type()} {out_name} = {arg_name}.to<{ctype.cpp_type(strip_ref=True)}>();"
], []
def _gen_code_optional_type(
self, arg_name: str, out_name: str, t: OptionalType, ctype: CType
) -> tuple[list[str], list[str]]:
in_name = f"{arg_name}_opt_in"
res_name, base_type, res_code, decl = self.argumenttype_evalue_convert(
t.elem, in_name
)
return (
f"""
auto {out_name} = {arg_name}.toOptional<{base_type.cpp_type(strip_ref=True)}>();
""".split("\n"),
decl,
)
def _gen_code_list_type(
self, arg_name: str, out_name: str, t: ListType, ctype: CType
) -> tuple[list[str], list[str]]:
in_name = f"{arg_name}_list_in"
elem_name = f"{arg_name}_elem"
code = []
res_name, res_ctype, res_code, decl = self.argumenttype_evalue_convert(
t.elem, elem_name
)
if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.Tensor:
code.extend(
f"""
auto {out_name} = {arg_name}.toTensorList();
""".split("\n")
)
elif isinstance(t.elem, BaseType) and (
t.elem.name == BaseTy.int or t.elem.name == BaseTy.SymInt
):
code.extend(
f"""
auto {out_name} = {arg_name}.toIntList();
""".split("\n")
)
elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.float:
code.extend(
f"""
auto {out_name} = {arg_name}.toDoubleList();
""".split("\n")
)
elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.bool:
# handle list type with size, e.g., bool[4]
code.extend(
f"""
#ifdef USE_ATEN_LIB
std::array<bool, {t.size}> {out_name};
auto {in_name} = {arg_name}.toBoolList();
size_t _i = 0;
for (auto {elem_name}: {in_name}) {{
{out_name}[_i++] = {elem_name};
}}
#else
auto {out_name} = {arg_name}.toBoolList();
#endif
""".split("\n")
)
# pytorch codegen:
# we have to use c10::List for optional element. e.g., Tensor?[] -> c10::List<::std::optional<at::Tensor>>
elif (
isinstance(t.elem, OptionalType)
and isinstance(t.elem.elem, BaseType)
and t.elem.elem.name == BaseTy.Tensor
):
code.extend(
f"""
#ifdef USE_ATEN_LIB
auto {in_name} = {arg_name}.toListOptionalTensor();
c10::List<::std::optional<at::Tensor>> {out_name};
for (auto {elem_name}: {in_name}) {{
{out_name}.push_back({elem_name});
}}
#else
auto {out_name} = {arg_name}.toListOptionalTensor();
#endif
""".split("\n")
)
else:
# use ArrayRef as default.
vec_name = arg_name + "_vec"
# need to bring vector instantiation out of scope so that ArrayRef has valid data
decl.append(
f"std::vector<{res_ctype.cpp_type(strip_ref=True)}> {vec_name};"
)
code.extend(
f"""
for (EValue {elem_name}: {in_name}) {{
{connector.join(res_code)}
{vec_name}.push_back({res_name});
}}
{ctype.cpp_type(strip_ref=True)} {out_name}({vec_name});
""".split("\n")
)
return code, decl

View File

@ -0,0 +1,220 @@
# Represents all kernels used by an Executorch model.
# It maintains a dict[OperatorName, dict[ETKernelKey, BackendMetadata]] structure.
from __future__ import annotations
import itertools
from collections import defaultdict, namedtuple
from dataclasses import dataclass
from enum import IntEnum
from typing_extensions import assert_never
from torchgen.model import (
BackendIndex,
BackendMetadata,
DispatchKey,
NativeFunction,
NativeFunctionsGroup,
OperatorName,
)
KERNEL_KEY_VERSION = 1
# TODO: this is a duplicated subset of codegen.tool.gen_oplist; remove the declaration in codegen
class ScalarType(IntEnum):
Byte = 0
Char = 1
Short = 2
Int = 3
Long = 4
Float = 6
Double = 7
Bool = 11
ETParsedYaml = namedtuple("ETParsedYaml", ["native_functions", "kernel_index"])
@dataclass(frozen=True)
class ETKernelKeyOpArgMeta:
arg_name: str
dtype: str
# The order of the dimensions if entry is a Tensor
dim_order: tuple[int, ...]
def to_native_string(self) -> str:
dtype_str = ScalarType[self.dtype].value
dim_str = str(self.dim_order)[1:-1].replace(" ", "")
return f"{dtype_str};{dim_str}"
@dataclass(frozen=True)
class ETKernelKey:
# Field undefined is default = True
arg_meta: tuple[ETKernelKeyOpArgMeta, ...] = ()
# Indicator for this kernel being used as a catch all
default: bool = False
version: int = KERNEL_KEY_VERSION
@staticmethod
def gen_from_yaml(
args: dict[str, tuple[str, str]],
type_alias_map: dict[str, list[str]], # TODO: Support unwrapped str val
dim_order_alias_map: dict[str, list[int]],
) -> list[ETKernelKey]:
"""Generate ETKernelKeys from arg kernel specs
Multiple ETKernelKeys may be returned because each potential dtype permutation
from type_alias_map is materialized as its own kernel key.
Args:
args: Mapping from argument name to kernel specs
Kernel specs are a tuple of (dtype, dim_order).
Currently tuple entries must be aliased via the alias map arguments
type_alias_map: Mapping from type alias to potential type enums
i.e { T0 : [Double, Int] } means T0 can be either Double or Int
Used for lookup by args
dim_order_alias_map: Mapping from alias to a list of dimension orders
Used for lookup by args
"""
# Cast dim order values to int
dim_order_alias_map = {
k: [int(alias) for alias in v] for k, v in dim_order_alias_map.items()
}
kernel_keys = []
# Get all used Dtype Alias
dtype_alias_used = set()
for type_alias, dim_order in args.values():
# Enforce usage of alias initially
# TODO: Support inlined arguments
assert type_alias in type_alias_map, "Undefined type alias: " + str(
type_alias
)
assert dim_order in dim_order_alias_map, (
f"Undefined dim_order alias: {dim_order}"
)
dtype_alias_used.add(type_alias)
# Generate all permutations of dtype alias values
alias_dtypes = [
[(alias, dtype) for dtype in type_alias_map[alias]]
for alias in dtype_alias_used
]
alias_permutations = [
dict(permutation) for permutation in list(itertools.product(*alias_dtypes))
]
# Using each alias value permutation, generate kernel keys
op_arg_cache = {}
for permutation in alias_permutations:
arg_list = []
for arg_name, arg_spec in args.items():
dtype = permutation[arg_spec[0]]
dim_order = dim_order_alias_map[arg_spec[1]] # type: ignore[assignment]
if (
cache_key := (arg_name, dtype, tuple(dim_order))
) not in op_arg_cache:
op_arg_cache[cache_key] = ETKernelKeyOpArgMeta(*cache_key) # type: ignore[arg-type]
arg_list.append(op_arg_cache[cache_key])
kernel_keys.append(ETKernelKey(tuple(arg_list)))
return kernel_keys
def to_native_string(self) -> str:
if self.default:
return "default"
return (
"v"
+ str(KERNEL_KEY_VERSION)
+ "/"
+ "|".join([arg.to_native_string() for arg in self.arg_meta])
)
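# Example: a key generated from arg specs {"self": ("T0", "D0"), ...} with
# T0 = [Double] and D0 = [0, 1, 2, 3] renders as
# "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3" (7 is ScalarType.Double), matching the
# et_kernel_metadata strings used in the selective-build tests in this diff.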
@dataclass(frozen=True)
class ETKernelIndex:
index: dict[OperatorName, dict[ETKernelKey, BackendMetadata]]
def has_kernels(self, g: NativeFunction | NativeFunctionsGroup) -> bool:
m = self.get_kernels(g)
return m is not None
def get_kernels(
self, g: NativeFunction | NativeFunctionsGroup
) -> dict[ETKernelKey, BackendMetadata]:
if isinstance(g, NativeFunction):
f = g
elif isinstance(g, NativeFunctionsGroup):
f = g.functional
else:
assert_never(g)
if f.func.name not in self.index:
return {}
return self.index[f.func.name]
@staticmethod
def grow_from_backend_indices(
kernel_index: dict[OperatorName, dict[ETKernelKey, BackendMetadata]],
backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]],
) -> None:
for dk in backend_indices:
index = backend_indices[dk]
for op, backend_metadata in index.items():
if op in kernel_index:
kernel_index[op][ETKernelKey(default=True)] = backend_metadata
else:
kernel_index[op] = {ETKernelKey(default=True): backend_metadata}
@staticmethod
def from_backend_indices(
backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]],
) -> ETKernelIndex:
kernel_index: dict[OperatorName, dict[ETKernelKey, BackendMetadata]] = (
defaultdict(dict)
)
ETKernelIndex.grow_from_backend_indices(kernel_index, backend_indices)
return ETKernelIndex(kernel_index)
def grow(
self, backend_indices: dict[DispatchKey, dict[OperatorName, BackendMetadata]]
) -> ETKernelIndex:
ETKernelIndex.grow_from_backend_indices(self.index, backend_indices)
return self
def _to_backend_index(self) -> BackendIndex:
"""
WARNING: this will be deprecated once all the codegen places know how to handle ETKernelIndex.
"""
index: dict[OperatorName, BackendMetadata] = {}
for op in self.index:
kernel_dict = self.index[op]
assert len(kernel_dict.values()) == 1, (
f"Can't convert ETKernelIndex to BackendIndex because {op} has more than one kernels. Got {kernel_dict}"
)
index[op] = kernel_dict.get(
ETKernelKey(default=True),
BackendMetadata(kernel="", structured=False, cpp_namespace=""),
)
return BackendIndex(
dispatch_key=DispatchKey.CPU,
use_out_as_primary=False,
device_guard=False,
external=False,
index=index,
)
# Note: a duplicate ETKernelKey from index_b will clobber the metadata from index_a
@staticmethod
def merge_indices(index_a: ETKernelIndex, index_b: ETKernelIndex) -> ETKernelIndex:
combined = defaultdict(dict, index_a.index.copy())
for op, entry in index_b.index.items():
for key, metadata in entry.items():
combined[op][key] = metadata
return ETKernelIndex(combined)
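# Merge sketch: if index_a maps op -> {key: meta_a} and index_b maps
# op -> {key: meta_b}, then merge_indices(index_a, index_b).index[op][key]
# is meta_b, because index_b's entries are written last.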

View File

@ -0,0 +1,153 @@
from __future__ import annotations
from collections import defaultdict, namedtuple
from typing import Any
import yaml
from torchgen.executorch.model import ETKernelIndex, ETKernelKey
from torchgen.gen import LineLoader, parse_native_yaml
from torchgen.model import (
BackendMetadata,
DispatchKey,
FunctionSchema,
NativeFunction,
OperatorName,
)
from torchgen.utils import NamespaceHelper
# Parse native_functions.yaml into a sequence of NativeFunctions and ET Backend Indices.
ETParsedYaml = namedtuple("ETParsedYaml", ["native_functions", "et_kernel_indices"])
# Fields in native_functions.yaml used to determine which kernels should be used
ET_FIELDS = ["kernels", "type_alias", "dim_order_alias"]
def parse_from_yaml(ei: dict[str, object]) -> dict[ETKernelKey, BackendMetadata]:
"""Given a loaded yaml representing kernel assignment information, extract the
mapping from `kernel keys` to `BackendMetadata` (the latter representing the kernel instance)
Args:
ei: Dict keys {kernels, type_alias, dim_order_alias}
See ETKernelKey for description of arguments
"""
e = ei.copy()
if (kernels := e.pop("kernels", None)) is None:
return {}
type_alias: dict[str, list[str]] = e.pop("type_alias", {}) # type: ignore[assignment]
dim_order_alias: dict[str, list[str]] = e.pop("dim_order_alias", {}) # type: ignore[assignment]
dim_order_alias.pop("__line__", None)
kernel_mapping: dict[ETKernelKey, BackendMetadata] = {}
for entry in kernels: # type: ignore[attr-defined]
arg_meta = entry.get("arg_meta")
if arg_meta is not None:
arg_meta.pop("__line__")
kernel_name = entry.get("kernel_name")
namespace_helper = NamespaceHelper.from_namespaced_entity(
kernel_name, max_level=3
)
kernel_namespace = namespace_helper.get_cpp_namespace(default="at")
backend_metadata = BackendMetadata(
kernel=namespace_helper.entity_name,
structured=False,
cpp_namespace=(kernel_namespace + "::native"),
)
kernel_keys = (
[ETKernelKey((), default=True)]
if arg_meta is None
else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) # type: ignore[arg-type]
)
for kernel_key in kernel_keys:
assert kernel_key not in kernel_mapping, (
"Duplicate kernel key: " + str(kernel_key) + " " + str(e)
)
kernel_mapping[kernel_key] = backend_metadata
return kernel_mapping
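# Illustrative yaml consumed by parse_from_yaml (kernel name hypothetical):
#
# kernels:
# - kernel_name: custom::add_out
#
# With no arg_meta, this maps the default kernel key to a BackendMetadata with
# kernel "add_out" and cpp_namespace "custom::native".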
def parse_et_yaml_struct(es: object) -> ETKernelIndex:
"""Given a loaded yaml representing a list of operators, for each op extract the mapping
of `kernel keys` to `BackendMetadata` (the latter representing the kernel instance
that should be used by the kernel key).
"""
indices: dict[OperatorName, dict[ETKernelKey, BackendMetadata]] = {}
for ei in es: # type: ignore[attr-defined]
e = ei.copy()
funcs = e.pop("func")
assert isinstance(funcs, str), f"not a str: {funcs}"
namespace_helper = NamespaceHelper.from_namespaced_entity(
namespaced_entity=funcs, max_level=1
)
opname = FunctionSchema.parse(namespace_helper.entity_name).name
assert opname not in indices, f"Duplicate func found in yaml: {opname}"
if len(index := parse_from_yaml(e)) != 0:
indices[opname] = index
return ETKernelIndex(indices)
def extract_kernel_fields(es: object) -> dict[OperatorName, dict[str, Any]]:
"""Given a loaded yaml representing a list of operators, extract the
kernel key related fields indexed by the operator name.
"""
fields: dict[OperatorName, dict[str, Any]] = defaultdict(dict)
for ei in es: # type: ignore[attr-defined]
funcs = ei.get("func")
assert isinstance(funcs, str), f"not a str: {funcs}"
namespace_helper = NamespaceHelper.from_namespaced_entity(
namespaced_entity=funcs, max_level=1
)
opname = FunctionSchema.parse(namespace_helper.entity_name).name
for field in ET_FIELDS:
if (value := ei.get(field)) is not None:
fields[opname][field] = value
return fields
def parse_et_yaml(
path: str,
tags_yaml_path: str,
ignore_keys: set[DispatchKey] | None = None,
skip_native_fns_gen: bool = False,
) -> tuple[list[NativeFunction], dict[OperatorName, dict[str, Any]]]:
"""Parse native_functions.yaml into NativeFunctions and an Operator Indexed Dict
of fields to persist from native_functions.yaml to functions.yaml
"""
with open(path) as f:
es = yaml.load(f, Loader=LineLoader)
et_kernel = extract_kernel_fields(es)
# Remove ET-specific fields from entries for backward compatibility
strip_et_fields(es)
native_yaml = parse_native_yaml(
path,
tags_yaml_path,
ignore_keys,
skip_native_fns_gen=skip_native_fns_gen,
loaded_yaml=es,
)
return native_yaml.native_functions, et_kernel
def strip_et_fields(es: object) -> None:
"""Given a loaded yaml representing a list of operators,
remove ET-specific fields from every entry for backward compatibility
"""
for entry in es: # type: ignore[attr-defined]
for field in ET_FIELDS:
entry.pop(field, None)

torchgen/gen_executorch.py Normal file (1024 lines)

File diff suppressed because it is too large