Files
pytorch/torch/nativert/executor/ParallelGraphExecutor.h
dolpm 725c327284 [nativert] add memory overlap debug assertion (#157290)
Summary: better safe than sorry. will throw if memory overlap detected when using planned tensors and debug mode is enabled -- this will make our planning unit tests more robust.

Test Plan:
ci

Rollback Plan:

Differential Revision: D77327841

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157290
Approved by: https://github.com/SherlockNoMad, https://github.com/zhxchen17
2025-07-14 19:12:41 +00:00

95 lines
2.6 KiB
C++

#pragma once
#include <c10/util/Semaphore.h>
#include <torch/nativert/executor/GraphExecutorBase.h>
#include <torch/nativert/executor/SessionState.h>
#include <thread>
namespace moodycamel {
struct ProducerToken;
struct ConsumerToken;
struct ConcurrentQueueDefaultTraits;
template <typename T, typename Traits>
class ConcurrentQueue;
} // namespace moodycamel
namespace torch::nativert {
class ThreadPoolExecutor;
typedef std::function<void()> Work;
struct WorkUnit {
const Node* node;
OpKernel* kernel;
std::vector<WorkUnit*> users;
void run(ThreadPoolExecutor* executor, SessionState* sessionState);
};
class ThreadPoolExecutor {
public:
explicit ThreadPoolExecutor();
~ThreadPoolExecutor();
ThreadPoolExecutor(const ThreadPoolExecutor&) = delete;
ThreadPoolExecutor& operator=(ThreadPoolExecutor const&) = delete;
ThreadPoolExecutor(ThreadPoolExecutor&&) = delete;
ThreadPoolExecutor& operator=(ThreadPoolExecutor&&) = delete;
void run(SessionState& session, const std::vector<WorkUnit*>& roots);
void start(int32_t numThreads);
void stop();
// execute unit on the current thread
// NOTE: children can still be offloaded to other threads
C10_ALWAYS_INLINE void execute_inline(SessionState* session, WorkUnit* unit);
void add(SessionState* session, WorkUnit* unit);
void add(
SessionState* session,
std::vector<WorkUnit*>::const_iterator begin,
const std::vector<WorkUnit*>::const_iterator& end);
C10_ALWAYS_INLINE moodycamel::ProducerToken& ptok();
C10_ALWAYS_INLINE moodycamel::ConsumerToken& ctok();
private:
void loop();
std::atomic_bool stopped_{false};
std::unique_ptr<c10::Semaphore> sem_{std::make_unique<c10::Semaphore>()};
std::unique_ptr<moodycamel::ConcurrentQueue<
Work,
moodycamel::ConcurrentQueueDefaultTraits>>
work_;
std::vector<std::thread> threads_;
};
class ParallelGraphExecutor : public GraphExecutorBase {
public:
ParallelGraphExecutor(
const Graph& graph,
std::vector<std::unique_ptr<OpKernel>> nodeKernels,
const ExecutorConfig& executorConfig);
std::vector<c10::IValue> execute(
ExecutionFrame& frame,
std::vector<c10::IValue> inputs) override;
std::vector<c10::IValue> executeWithPrefilledFrame(
ExecutionFrame& frame) override;
private:
ThreadPoolExecutor executor_;
std::vector<WorkUnit*> inputWorkUnits_;
c10::FastMap<const Node*, WorkUnit*> nodeToWorkUnit_;
std::vector<WorkUnit> workUnits_;
const Graph& graph_;
c10::FastMap<const Node*, copyable_atomic<std::uint_fast32_t>> producers_;
};
} // namespace torch::nativert