[nativert] Move Executor to PyTorch core (#157514)

Test Plan: CI Rollback Plan: Differential Revision: D77693984 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157514 Approved by: https://github.com/zhxchen17
2025-10-20 21:14:14 +08:00 · 2025-07-03 23:31:51 +00:00
parent ad86c05b78
commit f7130c097e
3 changed files with 594 additions and 0 deletions
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -601,6 +601,7 @@ libtorch_nativert_sources = [
    "torch/nativert/executor/Placement.cpp",
    "torch/nativert/executor/ExecutionPlanner.cpp",
    "torch/nativert/executor/ExecutionFrame.cpp",
+    "torch/nativert/executor/Executor.cpp",
    "torch/nativert/executor/GraphExecutorBase.cpp",
    "torch/nativert/executor/ConstantFolder.cpp",
    "torch/nativert/executor/OpKernel.cpp",
--- a/torch/nativert/executor/Executor.cpp
+++ b/torch/nativert/executor/Executor.cpp
@ -0,0 +1,387 @@
+#include <memory>
+#include <mutex>
+
+#include <c10/util/Enumerate.h>
+#include <c10/util/Synchronized.h>
+#include <torch/nativert/executor/ExecutionFrame.h>
+#include <torch/nativert/executor/Executor.h>
+#include <torch/nativert/executor/ParallelGraphExecutor.h>
+#include <torch/nativert/executor/SerialGraphExecutor.h>
+#include <torch/nativert/executor/Weights.h>
+#include <torch/nativert/kernels/C10Kernel.h>
+#include <torch/nativert/kernels/KernelFactory.h>
+
+// Upper bound on how many times getExecutorFrameFromPool() polls
+// clearedExecutionFrames_ before it falls back to the main frame pool.
+constexpr uint32_t kClearExecutionFrameRetries = 10;
+
+namespace torch::nativert {
+
+// Builds an executor for `graph`. The member initializers size both frame
+// queues from maxNumConcurrentThreads and create the constant folder only when
+// runConstFolding is set. Kernels and the graph executor are built by
+// initialize(), which runs only when `weights` is non-null.
+Executor::Executor(
+    torch::nativert::ExecutorConfig executorConfig,
+    std::shared_ptr<Graph> graph,
+    std::shared_ptr<Weights> weights,
+    const Placement& placement,
+    std::shared_ptr<caffe2::serialize::PyTorchStreamReader> pytorchStreamReader,
+    const MakeProxyExecutorFn& makeProxyExecutorFunc)
+    : executorConfig_(std::move(executorConfig)),
+      graph_(std::move(graph)),
+      placement_(placement),
+      constantFolder_(
+          executorConfig_.runConstFolding
+              ? std::optional<ConstantFolder>(*graph_)
+              : std::nullopt),
+      makeProxyExecutorFunc_(makeProxyExecutorFunc),
+      executionFrames_(executorConfig_.maxNumConcurrentThreads),
+      clearedExecutionFrames_(executorConfig_.maxNumConcurrentThreads),
+      numExecutionFrames_(0),
+      lastClearedTimestamp_(getCurrentTimestampSeconds()) {
+  // Without weights there is nothing to initialize yet.
+  if (!weights) {
+    return;
+  }
+  initialize(std::move(weights), std::move(pytorchStreamReader));
+}
+
+// Builds everything that depends on the weights: the node kernels, the graph
+// executor (parallel when maxParallelOps > 1, serial otherwise), the delegate
+// executors, the processed weights themselves and, when enabled, the layout
+// planner. Logs the elapsed time at INFO level.
+//
+// `weights` is processed and then published via atomicSwapWeights();
+// `pytorchStreamReader` is forwarded to the kernel factory.
+void Executor::initialize(
+    std::shared_ptr<Weights> weights,
+    std::shared_ptr<caffe2::serialize::PyTorchStreamReader>
+        pytorchStreamReader) {
+  // steady_clock is monotonic, so the measured interval cannot be skewed by
+  // wall-clock adjustments (high_resolution_clock gives no such guarantee).
+  const auto start = std::chrono::steady_clock::now();
+
+  auto executionKernels = KernelFactory().initializeNodeKernels(
+      *graph_,
+      weights,
+      executorConfig_,
+      placement_,
+      std::move(pytorchStreamReader),
+      makeProxyExecutorFunc_);
+
+  if (constantFolder_.has_value()) {
+    constantFolder_->unlinkConstants(executionKernels.nodeKernels);
+  }
+
+  // Collect the schemas before nodeKernels is moved into the graph executor
+  // below.
+  const auto kernelSchemas = getKernelSchemas(executionKernels.nodeKernels);
+
+  if (executorConfig_.maxParallelOps > 1) {
+    graphExecutor_ = std::make_unique<ParallelGraphExecutor>(
+        *graph_, std::move(executionKernels.nodeKernels), executorConfig_);
+  } else {
+    graphExecutor_ = std::make_unique<torch::nativert::SerialGraphExecutor>(
+        *graph_, std::move(executionKernels.nodeKernels), executorConfig_);
+  }
+
+  delegateExecutors_ = std::move(executionKernels.delegateExecutors);
+  constFoldingExecutions_ = std::move(executionKernels.constFoldingExecutions);
+
+  // initialize weights_
+  processWeights(weights);
+  atomicSwapWeights(weights);
+
+  if (executorConfig_.layoutPlannerSettings.enabled()) {
+    layoutPlanner_ = std::make_unique<LayoutPlanner>(
+        *graph_,
+        kernelSchemas,
+        ExecutionFrame::getPersistentValueMask(*graph_, weights.get()),
+        executorConfig_.layoutPlannerSettings);
+  }
+
+  const auto end = std::chrono::steady_clock::now();
+  LOG(INFO) << "Initialization completed in "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(
+                   end - start)
+                   .count()
+            << " ms";
+}
+
+// Maps each kernel's node target to its schema. Only kernels that are
+// C10Kernels contribute; every other kernel type is skipped.
+/* static */ c10::
+    FastMap<std::string /* target */, torch::nativert::FunctionSchema>
+    Executor::getKernelSchemas(
+        const std::vector<std::unique_ptr<OpKernel>>& kernels) {
+  c10::FastMap<std::string, torch::nativert::FunctionSchema> schemas;
+  for (const auto& opKernel : kernels) {
+    const auto* c10Kernel = dynamic_cast<C10Kernel*>(opKernel.get());
+    if (c10Kernel == nullptr) {
+      continue;
+    }
+    schemas.insert(
+        {std::string(opKernel->node()->target()), c10Kernel->schema()});
+  }
+  return schemas;
+}
+
+// Publishes `weights` as the current weights while holding the weights_ lock,
+// then asks every delegate executor to commit its weights.
+void Executor::atomicSwapWeights(std::shared_ptr<Weights> weights) {
+  weights_.withLock(
+      [&weights](auto& current) { current = std::move(weights); });
+
+  for (const auto& delegate : delegateExecutors_) {
+    delegate->commitWeights();
+  }
+}
+
+// Runs every recorded const-folding execution against `weights`. Each run is
+// fed the weights named by the graph signature's inputsToWeights() and writes
+// its outputs back through Weights::updateFoldedConst(). Does nothing when no
+// const-folding executions were registered.
+void Executor::maybeRunConstantFolding(std::shared_ptr<Weights> weights) {
+  for (auto& foldingExecution : constFoldingExecutions_) {
+    auto& executor = foldingExecution.executor;
+    ExecutionFrame foldingFrame(executor->graph());
+
+    // Gather the folding inputs in signature order.
+    const auto& weightInputs = graph_->signature().inputsToWeights();
+    std::vector<c10::IValue> foldingInputs;
+    foldingInputs.reserve(weightInputs.size());
+    for (const auto& [_, weightName] : weightInputs) {
+      foldingInputs.push_back(weights->at(weightName));
+    }
+
+    auto foldedOutputs = executor->execute(foldingFrame, foldingInputs);
+    for (const auto& [outputIdx, outputValue] :
+         c10::enumerate(executor->graph().outputs())) {
+      weights->updateFoldedConst(
+          outputValue->name(), foldedOutputs.at(outputIdx));
+    }
+  }
+}
+
+// Prepares `weights` for use: runs the const-folding executions, evaluates the
+// constant folder when one is configured, then hands the weights to every
+// delegate executor.
+void Executor::processWeights(std::shared_ptr<Weights> weights) {
+  maybeRunConstantFolding(weights);
+
+  if (constantFolder_) {
+    constantFolder_->evaluate(*weights);
+  }
+
+  for (const auto& delegate : delegateExecutors_) {
+    delegate->processWeights(weights);
+  }
+}
+
+namespace { // helper local to this translation unit
+void validateInput( // CHECK-fails unless the tensor matches the recorded meta
+    const std::string& inputName, // only used to label the failure message
+    const at::Tensor& inputTensor, // tensor supplied by the caller
+    const torch::nativert::TensorMeta& tensorValueMeta) { // expected dtype/device
+  CHECK(inputTensor.dtype() == tensorValueMeta.dtype()) // dtype must match
+      << "Input tensor dtype mismatch for " << inputName << ", expecting "
+      << c10::toString(tensorValueMeta.dtype()) << " but got "
+      << inputTensor.dtype().name();
+
+  CHECK(inputTensor.device() == tensorValueMeta.device()) // device must match
+      << "Input tensor device mismatch for " << inputName << ", expecting "
+      << tensorValueMeta.device().str() << " but got "
+      << inputTensor.device().str();
+}
+
+} // namespace
+
+// Checks that `inputs` has one entry per user input of the graph and that
+// every tensor entry matches the dtype and device recorded in the graph's
+// tensor metadata. Non-tensor entries are not inspected.
+void Executor::validateInputs(const std::vector<c10::IValue>& inputs) const {
+  const auto& inputValues = graph_->userInputs();
+  const auto& tensorValuesMeta = graph_->tensorValuesMeta();
+  TORCH_CHECK(inputs.size() == inputValues.size(), "Input size mismatch");
+  for (auto&& [position, candidate] : c10::enumerate(inputs)) {
+    if (!candidate.isTensor()) {
+      continue;
+    }
+    const std::string inputName(inputValues[position]->name());
+    const auto it = tensorValuesMeta.find(inputName);
+    CHECK(it != tensorValuesMeta.end())
+        << "Couldn't find " << inputName << " in tensorValuesMeta";
+    validateInput(inputName, candidate.toTensor(), it->second);
+  }
+}
+
+// Hands out an execution frame bound to the current weights.
+//
+// Lookup order:
+//   1. while a cleanup is in progress, poll clearedExecutionFrames_ up to
+//      kClearExecutionFrameRetries times;
+//   2. otherwise (or if that fails) pop from executionFrames_;
+//   3. if the pool is empty and fewer than maxNumConcurrentThreads frames
+//      exist, create a new frame;
+//   4. if the pool is empty and the limit is reached, block on sem_ until a
+//      frame is returned, then retry.
+//
+// The returned handle gives the frame back to this executor when destroyed.
+Executor::ExecutorFramePtr Executor::getExecutorFrameFromPool() {
+  std::shared_ptr<Weights> weights;
+  weights_.withLock([&](auto& w) { weights = w; });
+
+  // Wraps a pooled frame and re-binds it to the current weights if it was
+  // last used with a different weight version.
+  auto adoptPooledFrame = [&](std::unique_ptr<ExecutionFrame> pooled) {
+    ExecutorFramePtr ptr{std::move(pooled), *this};
+    if (ptr->weightVersion() != weights->version()) {
+      ptr->setWeights(*weights);
+    }
+    return ptr;
+  };
+
+  // First try to get a frame from clearedExecutionFrames_ if clearing is in
+  // progress
+  if (C10_UNLIKELY(clearingInProgress_)) {
+    ExecutionFrameEntry frameEntry;
+    // Bounded number of attempts so this never spins forever.
+    for (uint32_t retry = 0; retry < kClearExecutionFrameRetries; ++retry) {
+      if (clearedExecutionFrames_.readIfNotEmpty(frameEntry)) {
+        if (retry > 0) {
+          VLOG(1) << "Took " << retry
+                  << " retries to pop from clearedExecutionFrames_";
+        }
+        return adoptPooledFrame(std::move(frameEntry.frame));
+      }
+    }
+    // If we couldn't get a frame from cleared pool after retries, move onto
+    // main pool
+  }
+
+  // Try to get a frame from the main pool or create a new one
+  std::unique_ptr<ExecutionFrame> frame;
+  while (!executionFrames_.readIfNotEmpty(frame)) {
+    int64_t numFrames = numExecutionFrames_.load();
+    if (numFrames < executorConfig_.maxNumConcurrentThreads) {
+      // The CAS reserves one slot; on failure another thread raced us, so
+      // loop and look at the pool again.
+      if (numExecutionFrames_.compare_exchange_strong(
+              numFrames, numFrames + 1)) {
+        try {
+          return ExecutorFramePtr{
+              std::make_unique<ExecutionFrame>(
+                  *graph_, *weights, executorConfig_, layoutPlanner_.get()),
+              *this};
+        } catch (...) {
+          // Frame construction failed: give the reserved slot back, otherwise
+          // the pool permanently loses one unit of capacity. Also wake one
+          // thread that may have blocked while the slot looked taken.
+          numExecutionFrames_ -= 1;
+          sem_.release();
+          throw;
+        }
+      }
+    } else {
+      sem_.acquire();
+    }
+  }
+  return adoptPooledFrame(std::move(frame));
+}
+
+// Trims the idle frame pool down to at most minNumExecutionFrames frames.
+//
+// Idle frames are drained from executionFrames_. The first
+// minNumExecutionFrames of them are parked in clearedExecutionFrames_, where
+// concurrent callers can still borrow them. The rest are destroyed. The
+// parked frames are then moved back into executionFrames_.
+//
+// Only one thread cleans at a time; a thread that cannot take cleanupLock_
+// returns immediately.
+void Executor::clearStaleExecutionFrames() {
+  // RAII guard: the lock is released on every exit path, including an
+  // exception thrown while frames are being destroyed or moved.
+  std::unique_lock<std::mutex> cleanupGuard(cleanupLock_, std::try_to_lock);
+  if (!cleanupGuard.owns_lock()) {
+    // Another thread is already doing cleanup
+    return;
+  }
+  // Update timestamp first to minimize contention
+  lastClearedTimestamp_ = getCurrentTimestampSeconds();
+
+  int numPopped = 0;
+  // Counted explicitly: deriving it as popped-minus-pushed mixes signed and
+  // unsigned values and can wrap when other threads return frames into
+  // clearedExecutionFrames_ while the cleanup is running.
+  int numDiscarded = 0;
+  std::unique_ptr<ExecutionFrame> frame;
+
+  // Move frames from executionFrames_ to clearedExecutionFrames_
+  while (executionFrames_.readIfNotEmpty(frame)) {
+    ++numPopped;
+    // Keep the first popped entries up to minimum size
+    if (numPopped > executorConfig_.minNumExecutionFrames) {
+      // Discard stale frames
+      frame.reset();
+      numExecutionFrames_ -= 1;
+      ++numDiscarded;
+      continue;
+    }
+
+    ExecutionFrameEntry entry;
+    entry.used = false;
+    entry.frame = std::move(frame);
+    clearedExecutionFrames_.writeIfNotFull(std::move(entry));
+    // Enable clients to pop from clearedExecutionFrames_ while clearing is in
+    // progress
+    clearingInProgress_ = true;
+  }
+
+  ExecutionFrameEntry frameEntry;
+  // Move frames back from clearedExecutionFrames_ to executionFrames_
+  while (clearedExecutionFrames_.readIfNotEmpty(frameEntry)) {
+    executionFrames_.writeIfNotFull(std::move(frameEntry.frame));
+    // From the first frame moved back, returning threads use the main pool.
+    clearingInProgress_ = false;
+  }
+
+  clearingInProgress_ = false;
+  VLOG(1) << "Cleared " << numDiscarded << " out of " << numPopped
+          << " ExecutionFrame instances in the pool";
+}
+
+// Gives `frame` back to the pool and signals sem_ so that one thread blocked
+// in getExecutorFrameFromPool() can retry.
+//
+// Before the frame is stored, a stale-frame cleanup is triggered when cleanup
+// is enabled and executionFramePoolCleanupIntervalSec has elapsed since the
+// last one. sem_ is released even when storing the frame throws.
+void Executor::returnExecutorFrameToPool(
+    std::unique_ptr<ExecutionFrame> frame) {
+  // Check if it's time to clean up stale frames
+  if (executorConfig_.doExecutionFrameCleanup &&
+      lastClearedTimestamp_ +
+              executorConfig_.executionFramePoolCleanupIntervalSec <
+          getCurrentTimestampSeconds()) {
+    clearStaleExecutionFrames();
+  }
+
+  try {
+    frame->destroyBorrowedIValues();
+
+    // A cleanup in progress is the rare case; the branch hint now matches the
+    // one in getExecutorFrameFromPool().
+    if (C10_UNLIKELY(clearingInProgress_)) {
+      // Create an entry with used=true
+      ExecutionFrameEntry frameEntry;
+      frameEntry.used = true;
+      frameEntry.frame = std::move(frame);
+
+      CHECK(clearedExecutionFrames_.writeIfNotFull(std::move(frameEntry)))
+          << "Cleared ExecutionFrame pool full";
+    } else {
+      CHECK(executionFrames_.writeIfNotFull(std::move(frame)))
+          << "ExecutionFrame pool full";
+    }
+  } catch (...) {
+    sem_.release();
+    throw;
+  }
+  sem_.release();
+}
+
+// Runs the graph on already-flattened `inputs` and returns the outputs
+// produced by the graph executor. Inputs are validated first when
+// executorConfig_.validateInputs is set. The borrowed frame goes back to the
+// pool when this function returns.
+std::vector<c10::IValue> Executor::execute(std::vector<c10::IValue> inputs) {
+  const bool shouldValidate = executorConfig_.validateInputs;
+  if (shouldValidate) {
+    validateInputs(inputs);
+  }
+
+  auto pooledFrame = getExecutorFrameFromPool();
+  return graphExecutor_->execute(*pooledFrame, std::move(inputs));
+}
+
+// Runs the graph on a call described by `args`, `kwargs` and `inputTreeSpec`.
+//
+// Each leaf visited by ivalueApplyFromArgs() is stored in the frame as a
+// borrowed IValue. The leaves are not copied, so `args` and `kwargs` must
+// stay alive for the duration of the call. Tensor leaves are validated
+// against the graph's tensor metadata when executorConfig_.validateInputs is
+// set.
+//
+// Fails a TORCH_CHECK when the number of user inputs differs from
+// inputTreeSpec.numIValues(). Exceptions derived from std::exception that are
+// thrown during execution are logged and rethrown.
+std::vector<c10::IValue> Executor::execute(
+    const std::vector<c10::IValue>& args,
+    const std::unordered_map<std::string, c10::IValue>& kwargs,
+    const ITreeSpec& inputTreeSpec) {
+  auto executionFrame = getExecutorFrameFromPool();
+
+  std::optional<std::vector<c10::IValue>> outputs;
+  // Bound by reference: copying the user-input list on every call is wasted
+  // work on the hot path (validateInputs() binds it the same way).
+  const auto& userInputs = graph_->userInputs();
+  const auto& tensorValuesMeta = graph_->tensorValuesMeta();
+  TORCH_CHECK_EQ(userInputs.size(), inputTreeSpec.numIValues());
+
+  auto executionFrameFillUserInputs = [&](const c10::IValue& leaf,
+                                          const Value* value) {
+    // validate input tensor's dtype and device matches tensorMeta
+    if (executorConfig_.validateInputs && leaf.isTensor()) {
+      const auto& inputName = std::string(value->name());
+      auto it = tensorValuesMeta.find(inputName);
+      CHECK(it != tensorValuesMeta.end())
+          << "Couldn't find " << inputName << " in tensorValuesMeta";
+      validateInput(inputName, leaf.toTensor(), it->second);
+    }
+    executionFrame->setBorrowedIValue(
+        value->id(), c10::MaybeOwnedTraits<c10::IValue>::createBorrow(leaf));
+  };
+  ivalueApplyFromArgs(
+      executionFrameFillUserInputs, args, kwargs, inputTreeSpec);
+  try {
+    outputs = graphExecutor_->executeWithPrefilledFrame(*executionFrame);
+  } catch (const std::exception& e) {
+    LOG(ERROR) << "Exception during executeWithPrefilledFrame: " << e.what();
+    throw;
+  }
+
+  return std::move(*outputs);
+}
+
+// Benchmarks the graph's nodes on a single pooled frame by delegating to the
+// graph executor. Requires at least one input set and at least one warmup and
+// one main run. Every input set is validated up front when
+// executorConfig_.validateInputs is set.
+ProfileMetrics Executor::benchmarkIndividualNodes(
+    std::vector<std::vector<c10::IValue>> inputsList,
+    const uint32_t warmupRuns,
+    const uint32_t mainRuns) {
+  CHECK(inputsList.size() > 0) << "Need at least one input to benchmark";
+  CHECK(warmupRuns >= 1 && mainRuns >= 1) << "Need at least one run";
+
+  // The validation switch cannot change inside the loop, so test it once.
+  if (executorConfig_.validateInputs) {
+    for (const auto& inputs : inputsList) {
+      validateInputs(inputs);
+    }
+  }
+
+  auto pooledFrame = getExecutorFrameFromPool();
+  return graphExecutor_->benchmarkIndividualNodes(
+      *pooledFrame, inputsList, warmupRuns, mainRuns);
+}
+
+// Whole seconds elapsed on std::chrono::steady_clock since that clock's epoch.
+// Only differences between two such readings are meaningful.
+int64_t Executor::getCurrentTimestampSeconds() const {
+  const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+  return std::chrono::duration_cast<std::chrono::seconds>(sinceEpoch).count();
+}
+
+// Returns non-owning pointers to the delegate executors, in the order they
+// are stored. The pointers stay owned by this Executor.
+std::vector<DelegateExecutor*> Executor::getDelegates() {
+  std::vector<DelegateExecutor*> delegates;
+  // The final size is known, so allocate once instead of growing repeatedly.
+  delegates.reserve(delegateExecutors_.size());
+  for (const auto& delegateExecutor : delegateExecutors_) {
+    delegates.push_back(delegateExecutor.get());
+  }
+  return delegates;
+}
+
+} // namespace torch::nativert
--- a/torch/nativert/executor/Executor.h
+++ b/torch/nativert/executor/Executor.h
@ -0,0 +1,206 @@
+#pragma once
+
+#include <atomic>
+#include <memory>
+
+#include <c10/util/FbcodeMaps.h>
+#include <c10/util/Logging.h>
+#include <c10/util/Semaphore.h>
+#include <c10/util/Synchronized.h>
+
+#include <torch/nativert/detail/ITree.h>
+#include <torch/nativert/detail/MPMCQueue.h>
+#include <torch/nativert/executor/ConstantFolder.h>
+#include <torch/nativert/executor/DelegateExecutor.h>
+#include <torch/nativert/executor/ExecutionPlanner.h>
+#include <torch/nativert/executor/ExecutorConfig.h>
+#include <torch/nativert/executor/GraphExecutorBase.h>
+#include <torch/nativert/executor/Placement.h>
+#include <torch/nativert/executor/memory/FunctionSchema.h>
+#include <torch/nativert/executor/memory/LayoutPlanner.h>
+#include <torch/nativert/graph/Graph.h>
+#include <torch/nativert/graph/GraphSignature.h>
+#include <torch/nativert/kernels/KernelFactory.h>
+
+namespace torch::nativert {
+
+using namespace torch::nativert::detail; // NOTE(review): header-level using-directive; every includer sees detail:: names
+
+struct DistributedRunConfig; // forward declaration; not referenced elsewhere in this header
+
+/**
+ * Runs a Graph against a set of Weights.
+ *
+ * The executor owns the graph, a lock-protected pointer to the current
+ * weights, a GraphExecutorBase that performs the actual execution, and a
+ * bounded pool of ExecutionFrame objects. A frame is borrowed from the pool
+ * for each call and handed back automatically by ExecutorFramePtr when the
+ * call finishes.
+ *
+ * NOTE(review): the previous description ("runs each node in order, no
+ * optimizations applied") no longer matches the members declared here
+ * (constant folder, layout planner, delegate executors).
+ */
+class Executor {
+  class ExecutorFrameDeleter { // unique_ptr deleter that returns the frame to the pool
+   public:
+    explicit ExecutorFrameDeleter(Executor& e) : e_(&e) {}
+    ExecutorFrameDeleter(ExecutorFrameDeleter&&) = default;
+    ExecutorFrameDeleter& operator=(ExecutorFrameDeleter&&) = default;
+    ExecutorFrameDeleter(const ExecutorFrameDeleter&) = default;
+    ExecutorFrameDeleter& operator=(const ExecutorFrameDeleter&) = default;
+    ~ExecutorFrameDeleter() = default;
+
+    void operator()(ExecutionFrame* p) { // re-wraps p and gives it back to e_
+      e_->returnExecutorFrameToPool(std::unique_ptr<ExecutionFrame>(p));
+    }
+
+   private:
+    Executor* e_; // executor whose pool receives the frame; not owned
+  };
+  class ExecutorFramePtr { // move-only handle to a frame borrowed from the pool
+   public:
+    ExecutorFramePtr(std::unique_ptr<ExecutionFrame> ptr, Executor& e)
+        : ptr_(std::unique_ptr<ExecutionFrame, ExecutorFrameDeleter>(
+              ptr.release(),
+              ExecutorFrameDeleter{e})) {}
+    ExecutorFramePtr() = delete;
+    ExecutorFramePtr(ExecutorFramePtr&&) = default;
+    ExecutorFramePtr& operator=(ExecutorFramePtr&&) = default;
+    ExecutorFramePtr(const ExecutorFramePtr&) = delete;
+    ExecutorFramePtr& operator=(const ExecutorFramePtr&) = delete;
+    ~ExecutorFramePtr() = default;
+
+    ExecutionFrame& operator*() {
+      return *ptr_;
+    }
+
+    ExecutionFrame* operator->() {
+      return ptr_.get();
+    }
+
+   private:
+    std::unique_ptr<ExecutionFrame, ExecutorFrameDeleter> ptr_;
+  };
+
+ public:
+  // Constructor used for the inference path.
+  Executor(
+      torch::nativert::ExecutorConfig executorConfig,
+      std::shared_ptr<Graph> graph,
+      std::shared_ptr<Weights> weights,
+      const Placement& placement = Placement(),
+      std::shared_ptr<caffe2::serialize::PyTorchStreamReader>
+          pytorchStreamReader = nullptr,
+      const MakeProxyExecutorFn& makeProxyExecutorFunc = nullptr);
+
+  std::shared_ptr<Weights> getWeights() { // snapshot of the current weights, read under the lock
+    std::shared_ptr<Weights> ret;
+    weights_.withLock([&](auto& w) { ret = w; });
+    return ret;
+  }
+
+  void processWeights(std::shared_ptr<Weights> weights); // const folding + delegate processing
+  void atomicSwapWeights(std::shared_ptr<Weights> weights); // publish weights, commit in delegates
+
+  // This overload takes already-flattened inputs and returns only the
+  // flattened user outputs; it is intended for the inference path.
+  // TODO: investigate whether this overload should be removed; it still
+  //       seems useful for testing.
+  std::vector<c10::IValue> execute(std::vector<c10::IValue> inputs);
+
+  std::vector<c10::IValue> execute( // args/kwargs are walked according to inputTreeSpec
+      const std::vector<c10::IValue>& args,
+      const std::unordered_map<std::string, c10::IValue>& kwargs,
+      const ITreeSpec& inputTreeSpec);
+
+  ProfileMetrics benchmarkIndividualNodes( // requires non-empty inputsList and >= 1 run of each kind
+      std::vector<std::vector<c10::IValue>> inputsList,
+      const uint32_t warmupRuns,
+      const uint32_t mainRuns);
+
+  const torch::nativert::GraphSignature& graphSignature() const {
+    return graph_->signature();
+  }
+
+  static std::string className() {
+    return "Executor.v0";
+  }
+
+  const torch::nativert::ExecutorConfig& executorConfig() const {
+    return executorConfig_;
+  }
+
+  std::vector<DelegateExecutor*> getDelegates(); // non-owning pointers
+
+  // Number of execution frames created so far and not yet discarded by cleanup
+  int getNumExecutionFrames() const { // NOTE(review): narrows the int64_t counter to int
+    return numExecutionFrames_.load();
+  }
+
+  static c10::FastMap<std::string /* target */, torch::nativert::FunctionSchema>
+  getKernelSchemas(const std::vector<std::unique_ptr<OpKernel>>& kernels);
+
+ protected:
+  torch::nativert::ExecutorConfig executorConfig_;
+
+  std::shared_ptr<Graph> graph_;
+
+  // manages the parameters, buffers and tensor constants
+  c10::Synchronized<std::shared_ptr<Weights>> weights_;
+
+  void initialize( // builds kernels, graph executor and layout planner
+      std::shared_ptr<Weights> weights,
+      std::shared_ptr<caffe2::serialize::PyTorchStreamReader>
+          pytorchStreamReader);
+
+  ExecutorFramePtr getExecutorFrameFromPool(); // may block when the pool is exhausted
+  void returnExecutorFrameToPool(std::unique_ptr<ExecutionFrame> frame);
+
+  // Destroys idle frames beyond minNumExecutionFrames
+  void clearStaleExecutionFrames();
+
+ private:
+  // Queue entry for clearedExecutionFrames_: a frame plus a `used` flag
+  struct ExecutionFrameEntry {
+    bool used{false}; // true when stored by returnExecutorFrameToPool, false when parked by cleanup
+    std::unique_ptr<ExecutionFrame> frame;
+
+    // Movable: the flag is copied and frame ownership is transferred
+    ExecutionFrameEntry() = default;
+    ExecutionFrameEntry(ExecutionFrameEntry&& other) noexcept
+        : used(other.used), frame(std::move(other.frame)) {}
+    ExecutionFrameEntry& operator=(ExecutionFrameEntry&& other) noexcept {
+      used = other.used;
+      frame = std::move(other.frame);
+      return *this;
+    }
+    // Not copyable: the entry owns its frame
+    ExecutionFrameEntry(const ExecutionFrameEntry&) = delete;
+    ExecutionFrameEntry& operator=(const ExecutionFrameEntry&) = delete;
+  };
+
+  void maybeRunConstantFolding(std::shared_ptr<Weights> weights);
+  void validateInputs(const std::vector<c10::IValue>& inputs) const;
+
+  // Current std::chrono::steady_clock reading in whole seconds
+  int64_t getCurrentTimestampSeconds() const;
+
+  std::unique_ptr<GraphExecutorBase> graphExecutor_; // created by initialize()
+
+  const Placement placement_;
+
+  // NOTE: delegateExecutors_ is used by nodeKernels_ inside graphExecutor_.
+  std::vector<std::unique_ptr<DelegateExecutor>> delegateExecutors_;
+
+  std::vector<ConstFoldingExecution> constFoldingExecutions_;
+
+  std::optional<ConstantFolder> constantFolder_; // engaged only when runConstFolding is set
+
+  MakeProxyExecutorFn makeProxyExecutorFunc_;
+
+  c10::Semaphore sem_; // released on every frame return; acquired when the pool is exhausted
+  torch::nativert::detail::MPMCQueue<std::unique_ptr<ExecutionFrame>>
+      executionFrames_; // main pool of idle frames
+  torch::nativert::detail::MPMCQueue<ExecutionFrameEntry>
+      clearedExecutionFrames_; // temporary pool used while a cleanup is running
+  std::atomic_int64_t numExecutionFrames_; // frames created minus frames discarded
+
+  std::unique_ptr<LayoutPlanner> layoutPlanner_; // null unless layout planning is enabled
+  std::atomic_int64_t lastClearedTimestamp_; // seconds, from getCurrentTimestampSeconds()
+  std::mutex cleanupLock_; // serializes clearStaleExecutionFrames()
+  std::atomic_bool clearingInProgress_{false}; // set while frames are parked in clearedExecutionFrames_
+};
+
+} // namespace torch::nativert