#pragma once #include #include // @manual #include // @manual #include namespace moodycamel { struct ProducerToken; struct ConsumerToken; struct ConcurrentQueueDefaultTraits; template class ConcurrentQueue; } // namespace moodycamel namespace torch::nativert { class ThreadPoolExecutor; typedef std::function Work; struct WorkUnit { const Node* node; OpKernel* kernel; std::vector users; void run(ThreadPoolExecutor* executor, SessionState* sessionState); }; class ThreadPoolExecutor { public: explicit ThreadPoolExecutor(); ~ThreadPoolExecutor(); ThreadPoolExecutor(const ThreadPoolExecutor&) = delete; ThreadPoolExecutor& operator=(ThreadPoolExecutor const&) = delete; ThreadPoolExecutor(ThreadPoolExecutor&&) = delete; ThreadPoolExecutor& operator=(ThreadPoolExecutor&&) = delete; void run(SessionState& session, const std::vector& roots); void start(int32_t numThreads); void stop(); // execute unit on the current thread // NOTE: children can still be offloaded to other threads C10_ALWAYS_INLINE void execute_inline(SessionState* session, WorkUnit* unit); void add(SessionState* session, WorkUnit* unit); void add( SessionState* session, std::vector::const_iterator&& begin, const std::vector::const_iterator&& end); C10_ALWAYS_INLINE moodycamel::ProducerToken& ptok(); C10_ALWAYS_INLINE moodycamel::ConsumerToken& ctok(); private: void loop(); std::atomic_bool stopped_{false}; std::unique_ptr sem_{std::make_unique()}; std::unique_ptr> work_; std::vector threads_; }; class ParallelGraphExecutor : public GraphExecutorBase { public: ParallelGraphExecutor( const Graph& graph, std::vector> nodeKernels, const torch::nativert::ExecutorConfig& executorConfig); std::vector execute( ExecutionFrame& frame, std::vector inputs) override; std::vector executeWithPrefilledFrame( ExecutionFrame& frame) override; private: ThreadPoolExecutor executor_; std::vector inputWorkUnits_; c10::FastMap nodeToWorkUnit_; std::vector workUnits_; const Graph& graph_; c10::FastMap> producers_; }; } // namespace torch::nativert