Files
pytorch/caffe2/core/plan_executor.cc
Sebastian Messmer 643ca5def2 Replace c10::guts::stuff with std::stuff (#30915)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/30915

Since we now have C++14, we don't need these c10::guts helpers anymore
ghstack-source-id: 95777609

Test Plan: waitforsandcastle

Differential Revision: D18869639

fbshipit-source-id: 97716f932297c64c6e814410ac47b444c33d4e2e
2019-12-16 13:57:19 -08:00

523 lines
18 KiB
C++

#include "caffe2/core/plan_executor.h"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>
#include "caffe2/core/timer.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2_pb.h"
// Flag consulted by the concurrent-substep worker in ExecuteStepRecursive
// after it has caught an exception: when false (the default) the exception is
// rethrown out of the worker thread; when true it is only recorded and the
// worker returns, leaving the failure to be reported after all threads join.
C10_DEFINE_bool(
caffe2_handle_executor_threads_exceptions,
false,
"If used we will handle exceptions in executor threads. "
"This avoids SIGABRT but may cause process to deadlock");
namespace caffe2 {
namespace {
// Bookkeeping entry for one net definition listed in the plan.
struct NetDefInfo {
// Points at the NetDef; this struct never deletes it (RunPlanOnWorkspace
// stores the address of an element of plan.network()).
const NetDef* netDef;
// in order to keep the "override existing nets" on the top-level workflow,
// we need to mark the nets that already exist so that we can override them
// exactly once.
bool needsOverride;
};
// Lookup table from net name to its NetDefInfo.
using NetDefMap = std::unordered_map<std::string, NetDefInfo>;
/**
 * Runs callbacks periodically on background threads.
 *
 * Every start() call spawns one thread that invokes its callback roughly once
 * per interval. Destroying the Reporter asks all threads to stop and joins
 * them; each thread invokes its callback one final time before exiting, so a
 * callback always runs at least once.
 */
struct Reporter {
  struct ReporterInstance {
    std::mutex report_mutex;
    std::condition_variable report_cv;
    std::thread report_thread;

    /**
     * @param intervalMillis maximum time to sleep between two callback runs.
     * @param done           shared stop flag owned by the enclosing Reporter;
     *                       must outlive this instance's thread.
     * @param f              callback to run; it is executed while
     *                       report_mutex is held.
     */
    ReporterInstance(
        int intervalMillis,
        const std::atomic<bool>* done,
        std::function<void()> f) {
      auto interval = std::chrono::milliseconds(intervalMillis);
      auto reportWorker = [this, interval, done, f]() {
        std::unique_lock<std::mutex> lk(report_mutex);
        do {
          // Sleeps for `interval`, or less if the stop flag gets raised.
          report_cv.wait_for(lk, interval, [done]() { return done->load(); });
          f();
        } while (!done->load());
      };
      report_thread = std::thread(reportWorker);
    }
  };

  /// Starts a new background thread calling `f` every `intervalMillis` ms.
  void start(int64_t intervalMillis, std::function<void()> f) {
    // make_unique + push_back: the instance cannot leak if the vector has to
    // grow and that allocation throws.
    instances_.push_back(std::make_unique<ReporterInstance>(
        static_cast<int>(intervalMillis), &done, std::move(f)));
  }

  /// Signals every reporter thread to stop and waits for all of them.
  ~Reporter() {
    // `done` is atomic because it is written here and read concurrently by
    // every reporter thread.
    done = true;
    for (auto& instance : instances_) {
      if (!instance->report_thread.joinable()) {
        continue;
      }
      {
        // Pass through the worker's mutex before notifying. The worker only
        // releases it while blocked in wait_for, so once we get it the worker
        // is either already waiting (and will receive the notification) or
        // has not evaluated the predicate yet (and will then see done ==
        // true). Without this the notification could land between the
        // predicate check and the wait, delaying shutdown by a full interval.
        std::lock_guard<std::mutex> sync(instance->report_mutex);
      }
      instance->report_cv.notify_all();
      instance->report_thread.join();
    }
  }

 private:
  std::vector<std::unique_ptr<ReporterInstance>> instances_;
  std::atomic<bool> done{false};
};
// Returns a function that returns `true` if we should continue
// iterating, given the current iteration count.
//
// Two mutually exclusive modes, selected by the step definition:
//  * no should_stop_blob: run for num_iter iterations (1 when num_iter is
//    unset); only_once must not be set in this mode.
//  * should_stop_blob set: num_iter must not be set; the predicate allows
//    just iteration 0 when only_once is true and every iteration otherwise.
//    The blob itself is not consulted here.
// Violating either constraint fails a CAFFE_ENFORCE. The workspace argument
// is unused.
std::function<bool(int64_t)> getContinuationTest(
    Workspace* /*ws*/,
    const ExecutionStep& step) {
  if (!step.has_should_stop_blob()) { // control by iteration
    CAFFE_ENFORCE(!step.has_only_once(), "not supported");
    const int64_t iterations = step.has_num_iter() ? step.num_iter() : 1;
    VLOG(1) << "Will execute step " << step.name() << " for " << iterations
            << " iterations.";
    return [iterations](int64_t i) { return i < iterations; };
  }

  // control by signal blob
  CAFFE_ENFORCE(
      !step.has_num_iter(),
      "Must not specify num_iter if should_stop_blob is set");
  const bool onlyOnce = step.has_only_once() && step.only_once();
  // The separators are spelled out so the message reads
  // "Will execute step <name>[ once] until stopped by blob <blob>".
  VLOG(1) << "Will execute step " << step.name() << (onlyOnce ? " once" : "")
          << " until stopped by blob " << step.should_stop_blob();
  if (onlyOnce) {
    return [](int64_t i) { return i == 0; };
  }
  return [](int64_t /*i*/) { return true; };
}
// Reads the stop signal stored in `b`.
// A null pointer or a blob whose type is still uninitialized counts as
// "do not stop" and yields false. Any other blob is read as a TensorCPU that
// must hold exactly one bool (enforced); that bool is returned.
inline bool getShouldStop(const Blob* b) {
  if (b == nullptr) {
    return false; // blob does not exist
  }
  if (b->meta().id() == TypeIdentifier::uninitialized()) {
    return false; // blob exists but nothing was ever stored in it
  }
  const auto& t = b->Get<TensorCPU>();
  CAFFE_ENFORCE(t.IsType<bool>() && t.numel() == 1, "expects a scalar boolean");
  const bool* flag = t.data<bool>();
  return *flag;
}
/**
 * Injects a blob named 'GLOBAL_WORKSPACE_ID' for each workspace, only if
 * another blob named 'NODE_ID' is present. 'NODE_ID' blob can be used in a
 * distributed run and in this case 'GLOBAL_WORKSPACE_ID' can be used across
 * machines for other purposes (e.g. to support model parallelism). Essentially,
 * 'GLOBAL_WORKSPACE_ID' is an identifier for a workspace that is unique across
 * all 'NODE_ID's.
 *
 * The id is computed as (node_id << 16) + sequence number, where the sequence
 * number counts the ids handed out so far by this injector.
 */
struct WorkspaceIdInjector {
  static const string NODE_ID;
  static const string GLOBAL_WORKSPACE_ID;

  /**
   * Creates the GLOBAL_WORKSPACE_ID blob locally in `workspace` when a
   * NODE_ID blob is visible from it; does nothing otherwise.
   *
   * Fails a CAFFE_ENFORCE once 2^16 ids have been handed out, because the
   * sequence number would spill into the node-id bits.
   */
  void InjectWorkspaceId(Workspace* workspace) {
    if (!workspace->HasBlob(NODE_ID)) {
      return;
    }
    Blob* node_id_blob = workspace->GetBlob(NODE_ID);
    const TensorCPU& node_id_tensor = node_id_blob->template Get<TensorCPU>();
    int node_id = node_id_tensor.template data<int32_t>()[0];
    // Claim the sequence number with one atomic fetch-and-increment and
    // validate the claimed value. Checking seq_ first and incrementing it
    // afterwards lets two concurrent callers both pass the check at 65535 and
    // one of them walk away with 65536.
    const int seq = seq_++;
    CAFFE_ENFORCE(
        seq < (1 << 16),
        "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob");
    int32_t global_ws_id = seq + (static_cast<int32_t>(node_id) << 16);
    Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID);
    TensorCPU* global_ws_id_tensor =
        BlobGetMutableTensor(global_ws_id_blob, CPU);
    global_ws_id_tensor->Resize(); // no dimensions: a single scalar element
    global_ws_id_tensor->template mutable_data<int32_t>()[0] = global_ws_id;
    VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id;
  }

 private:
  // Number of ids claimed so far; atomic so callers may inject concurrently.
  std::atomic<int> seq_{0};
};
const string WorkspaceIdInjector::NODE_ID = "NODE_ID";
const string WorkspaceIdInjector::GLOBAL_WORKSPACE_ID = "GLOBAL_WORKSPACE_ID";
struct CompiledExecutionStep;
/**
 * Controls compilation and runtime cloning of execution steps.
 *
 * If step.create_workspace=False, this wrapper will compile the execution step
 * and its children once, and calls to ExecutionStepWrapper::compiled() will
 * always return the same compiled step.
 * If step.create_workspace=True, no compilation is done at creation time.
 * Instead, a new CompiledExecutionStep is created for every compiled() call.
 *
 * CompiledExecutionStep owns its Workspace, and the lifetime of the
 * compiled step along with its workspace will be tied to the lifetime of
 * the `CompiledGuard` object returned by compiled().
 *
 * ExecuteStepRecursive will call compiled() once before the given
 * execution step is run and keep it alive for the length of its execution.
 * This means that, for steps with create_workspace=true, a child workspace
 * will be created every time the step is executed, and destroyed right
 * afterwards.
 */
struct ExecutionStepWrapper {
  /**
   * All pointer arguments are stored as-is and are not owned by the wrapper,
   * so they have to stay valid for as long as it is used.
   */
  ExecutionStepWrapper(
      const ExecutionStep* step,
      Workspace* externalWorkspace,
      ShouldContinue externalShouldContinue,
      NetDefMap* netDefs,
      WorkspaceIdInjector* ws_id_injector)
      : step_(step),
        externalWorkspace_(externalWorkspace),
        externalShouldContinue_(std::move(externalShouldContinue)),
        netDefs_(netDefs),
        ws_id_injector_(ws_id_injector) {
    // If this execution step does not create a child workspace,
    // then just eagerly-compile it. This will trigger CreateNet on the
    // nets used by this execution step.
    if (!step_->create_workspace()) {
      compiledStep_ = doCompile();
    }
  }

  /**
   * Handle to a compiled step. It either owns the step (freshly compiled for
   * a create_workspace step) or merely refers to the one cached in the
   * wrapper. Only ExecutionStepWrapper can create and fill it.
   */
  class CompiledGuard {
    // Take ownership of a freshly compiled step.
    void reset(std::unique_ptr<CompiledExecutionStep>&& compiled) {
      compiled_ = std::move(compiled);
      compiledRef_ = compiled_.get();
    }
    // Refer to a step owned by somebody else.
    void reset(CompiledExecutionStep* compiledRef) {
      compiled_.reset();
      compiledRef_ = compiledRef;
    }

   public:
    CompiledExecutionStep* operator->() {
      return compiledRef_;
    }

   private:
    CompiledGuard() {}
    std::unique_ptr<CompiledExecutionStep> compiled_;
    // Initialized so that a guard never carries an indeterminate pointer,
    // even before one of the reset() overloads has run.
    CompiledExecutionStep* compiledRef_{nullptr};
    friend struct ExecutionStepWrapper;
  };

  /// The step definition this wrapper was created for.
  const ExecutionStep& step() const {
    return *step_;
  }

  /// Returns the cached compiled step, or compiles a new one owned by the
  /// returned guard when nothing was compiled eagerly.
  CompiledGuard compiled() {
    CompiledGuard guard;
    if (compiledStep_) {
      guard.reset(compiledStep_.get());
    } else {
      guard.reset(doCompile());
    }
    return guard;
  }

 private:
  std::unique_ptr<CompiledExecutionStep> doCompile();

  const ExecutionStep* step_;
  Workspace* externalWorkspace_;
  ShouldContinue externalShouldContinue_;
  NetDefMap* netDefs_;
  // Set only for steps that do not create their own workspace.
  std::unique_ptr<CompiledExecutionStep> compiledStep_;
  WorkspaceIdInjector* ws_id_injector_;
};
/**
 * Runtime form of an ExecutionStep: the workspace it runs in, the nets or
 * substep wrappers it executes, its optional report net and stop blob, and
 * the predicates deciding whether another iteration should run.
 *
 * The object stores lambdas that capture `this`, so it must stay at a fixed
 * address for its whole lifetime (it is only ever held through a unique_ptr
 * in this file).
 */
struct CompiledExecutionStep {
  // NOTE(review): this signature takes `int`, while the lambdas stored in it
  // below take int64_t, so iteration counts pass through an int on the way.
  using ShouldContinue = std::function<bool(int)>;

  /**
   * @param mainStep               step definition; must outlive this object.
   * @param externalWorkspace      workspace to run in, or the parent of the
   *                               new workspace when create_workspace is set.
   * @param externalShouldContinue predicate of the caller; this step only
   *                               continues while it returns true.
   * @param netDefs                net definitions by name; needsOverride
   *                               flags are cleared as nets get created.
   * @param ws_id_injector         used to tag a newly created workspace.
   *
   * Fails a CAFFE_ENFORCE if the step has both substeps and networks, refers
   * to an undefined net, names a should_stop_blob that does not exist, or
   * sets report_net without report_interval.
   */
  CompiledExecutionStep(
      const ExecutionStep* mainStep,
      Workspace* externalWorkspace,
      ShouldContinue externalShouldContinue,
      NetDefMap* netDefs,
      WorkspaceIdInjector* ws_id_injector)
      : step(mainStep) {
    if (mainStep->create_workspace()) {
      localWorkspace_.reset(new Workspace(externalWorkspace));
      workspace = localWorkspace_.get();
      ws_id_injector->InjectWorkspaceId(workspace);
    } else {
      workspace = externalWorkspace;
    }

    CAFFE_ENFORCE(
        (step->substep_size() == 0 || step->network_size() == 0),
        "An ExecutionStep should either have substep or networks "
        "but not both.");

    // Looks up `network_name` in netDefs, (re)creates the net in `workspace`
    // when required, and returns it.
    auto createAndGetNet = [&](const std::string& network_name) {
      auto it = netDefs->find(network_name);
      CAFFE_ENFORCE(
          it != netDefs->end(),
          "ExecutionStep " + mainStep->name() + " uses undefined net " +
              network_name);
      // needsOverride does not need synchronization because it is only
      // relevant for non-dynamic executions steps. This is due to the fact
      // that concurrent nets run on child workspaces, that do not needOverride.
      if (it->second.needsOverride || !workspace->GetNet(network_name)) {
        workspace->CreateNet(*it->second.netDef, true);
        it->second.needsOverride = false;
      }
      auto* net = workspace->GetNet(network_name);
      CAFFE_ENFORCE(net != nullptr, "Network ", network_name, " not found.");
      return net;
    };

    if (step->substep_size()) {
      ShouldContinue substepShouldContinue;
      if (!step->concurrent_substeps() || step->substep().size() <= 1) {
        substepShouldContinue = externalShouldContinue;
      } else {
        // Concurrent substeps additionally stop as soon as one of their
        // siblings has reported a failure.
        substepShouldContinue = [this, externalShouldContinue](int64_t it) {
          return !gotFailure && externalShouldContinue(it);
        };
      }

      for (const auto& ss : step->substep()) {
        auto compiledSubstep = std::make_shared<ExecutionStepWrapper>(
            &ss, workspace, substepShouldContinue, netDefs, ws_id_injector);
        // Substeps with run_every_ms are driven by a timer, not by the
        // iteration loop of this step.
        if (ss.has_run_every_ms()) {
          reportSubsteps.push_back(compiledSubstep);
        } else {
          recurringSubsteps.push_back(compiledSubstep);
        }
      }
    } else {
      for (const string& network_name : step->network()) {
        networks.push_back(createAndGetNet(network_name));
      }
    }

    if (step->has_should_stop_blob()) {
      shouldStop = workspace->GetBlob(step->should_stop_blob());
      CAFFE_ENFORCE(
          shouldStop, "blob ", step->should_stop_blob(), " does not exist");
    }

    if (step->has_report_net()) {
      CAFFE_ENFORCE(
          step->has_report_interval(),
          "A report_interval must be provided if report_net is set.");
      reportNet = createAndGetNet(step->report_net());
    } else {
      reportNet = nullptr;
    }

    netShouldContinue = getContinuationTest(workspace, *step);
    shouldContinue = [this, externalShouldContinue](int64_t iter) {
      return externalShouldContinue(iter) && this->netShouldContinue(iter);
    };
  }

  const ExecutionStep* step;
  // Either the external workspace or localWorkspace_.get().
  Workspace* workspace{nullptr};
  // Substeps that define run_every_ms.
  vector<std::shared_ptr<ExecutionStepWrapper>> reportSubsteps;
  // All remaining substeps.
  vector<std::shared_ptr<ExecutionStepWrapper>> recurringSubsteps;
  // Nets run directly by this step; filled only when it has no substeps.
  vector<NetBase*> networks;
  // Null unless the step defines report_net.
  NetBase* reportNet{nullptr};
  // Null unless the step defines should_stop_blob.
  Blob* shouldStop{nullptr};
  // Continuation test derived from the step definition alone.
  ShouldContinue netShouldContinue;
  // netShouldContinue combined with the caller's predicate.
  ShouldContinue shouldContinue;
  // Raised by concurrent workers when a substep fails or throws.
  std::atomic<bool> gotFailure{false};

 private:
  // Set only when the step asked for its own workspace.
  std::unique_ptr<Workspace> localWorkspace_;
};
std::unique_ptr<CompiledExecutionStep> ExecutionStepWrapper::doCompile() {
return std::unique_ptr<CompiledExecutionStep>(new CompiledExecutionStep(
step_,
externalWorkspace_,
externalShouldContinue_,
netDefs_,
ws_id_injector_));
}
// Makes the *enclosing function* `return true` when getShouldStop(shouldStop)
// reports that the stop blob is set; otherwise does nothing.
// The body is wrapped in do { } while (0) so that `CHECK_SHOULD_STOP(...);`
// expands to exactly one statement and stays safe inside an unbraced
// if/else. `step` is parenthesized so any expression can be passed.
#define CHECK_SHOULD_STOP(step, shouldStop) \
  do { \
    if (getShouldStop(shouldStop)) { \
      VLOG(1) << "Execution step " << (step).name() << " stopped by " \
              << (step).should_stop_blob(); \
      return true; \
    } \
  } while (0)
// Runs one execution step, including all of its substeps, to completion.
//
// A step either has substeps, which are run sequentially or on worker
// threads, or it has nets, which are run one after another. Both forms repeat
// for as long as the compiled step's shouldContinue(iter) returns true.
//
// Returns false as soon as a net or a substep fails. Returns true when the
// iterations run out or when the step's stop blob is found set.
// Throws (CAFFE_THROW) if a concurrent worker caught an exception and
// FLAGS_caffe2_handle_executor_threads_exceptions is set; with the flag unset
// the worker rethrows inside its own thread instead.
bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) {
const auto& step = stepWrapper.step();
// The guard keeps the compiled step alive until this function returns.
auto compiledStep = stepWrapper.compiled();
VLOG(1) << "Running execution step " << step.name();
// Declared after compiledStep, so on every return path the reporter (whose
// destructor joins its threads) is destroyed before the compiled step.
std::unique_ptr<Reporter> reporter;
if (step.has_report_net() || compiledStep->reportSubsteps.size() > 0) {
reporter = std::make_unique<Reporter>();
auto* reportNet = compiledStep->reportNet;
if (reportNet) {
VLOG(1) << "Starting reporter net";
// Reporter::start takes milliseconds; presumably report_interval is in
// seconds, hence the factor of 1000 -- TODO confirm against the proto.
reporter->start(step.report_interval() * 1000, [reportNet]() {
if (!reportNet->Run()) {
LOG(WARNING) << "Error running report_net.";
}
});
}
// The shared_ptr is captured by value so the substep wrapper stays alive
// for as long as the reporter thread may call into it.
for (auto& substepWrapper : compiledStep->reportSubsteps) {
reporter->start(
substepWrapper->step().run_every_ms(), [substepWrapper]() {
if (!ExecuteStepRecursive(*substepWrapper)) {
LOG(WARNING) << "Error running report step.";
}
});
}
}
const Blob* shouldStop = compiledStep->shouldStop;
if (step.substep_size()) {
// Substeps run one by one unless the step requests concurrency, either via
// concurrent_substeps with more than one substep or via
// num_concurrent_instances greater than one.
bool sequential =
(!step.concurrent_substeps() || step.substep().size() <= 1) &&
(!step.has_num_concurrent_instances() ||
step.num_concurrent_instances() <= 1);
for (int64_t iter = 0; compiledStep->shouldContinue(iter); ++iter) {
if (sequential) {
VLOG(1) << "Executing step " << step.name() << " iteration " << iter;
for (auto& substepWrapper : compiledStep->recurringSubsteps) {
if (!ExecuteStepRecursive(*substepWrapper)) {
return false;
}
// The stop blob is checked after every substep, not just per iteration.
CHECK_SHOULD_STOP(step, shouldStop);
}
} else {
VLOG(1) << "Executing step " << step.name() << " iteration " << iter
<< " with " << step.substep().size() << " concurrent substeps";
// Shared by all workers of this iteration; it hands out substep indices.
std::atomic<int> next_substep{0};
// Guards first_exception.
std::mutex exception_mutex;
string first_exception;
// Captures by reference; safe because every thread is joined below before
// these locals go out of scope.
auto worker = [&]() {
auto num_substeps = compiledStep->recurringSubsteps.size();
// The modulo wraps the index, so with several concurrent instances each
// substep is picked by more than one worker.
int substep_id = next_substep++ % num_substeps;
if (compiledStep->gotFailure) {
return;
}
try {
if (!ExecuteStepRecursive(
*compiledStep->recurringSubsteps.at(substep_id))) {
compiledStep->gotFailure = true;
}
} catch (const std::exception& ex) {
std::lock_guard<std::mutex> guard(exception_mutex);
// Only the first exception is recorded and logged.
if (!first_exception.size()) {
first_exception = c10::GetExceptionString(ex);
LOG(ERROR) << "Parallel worker exception:\n" << first_exception;
}
compiledStep->gotFailure = true;
if (!FLAGS_caffe2_handle_executor_threads_exceptions) {
// In complex plans other threads might get stuck if another
// one fails. So we let exception to go out of thread which
// causes SIGABRT. In local setup one might use this flag
// in order to use Python debugger after a failure
throw;
}
}
};
std::vector<std::thread> threads;
// One thread per recurring substep, multiplied by the number of concurrent
// instances when that is specified.
auto numThreads = compiledStep->recurringSubsteps.size();
if (step.has_num_concurrent_instances()) {
numThreads *= step.num_concurrent_instances();
}
for (size_t i = 0; i < numThreads; ++i) {
threads.emplace_back(worker);
}
for (auto& thread : threads) {
thread.join();
}
// All workers are done; report the failure on the calling thread.
if (compiledStep->gotFailure) {
LOG(ERROR) << "One of the workers failed.";
if (first_exception.size()) {
CAFFE_THROW(
"One of the workers died with an unhandled exception ",
first_exception);
}
return false;
}
// concurrent substeps should be careful about setting should_stop_blob
CHECK_SHOULD_STOP(step, shouldStop);
}
}
return true;
} else {
// If this ExecutionStep just contains nets, we can directly run it.
for (int64_t iter = 0; compiledStep->shouldContinue(iter); ++iter) {
VLOG(1) << "Executing networks " << step.name() << " iteration " << iter;
for (NetBase* network : compiledStep->networks) {
if (!network->Run()) {
return false;
}
// The stop blob is checked after every net, not just per iteration.
CHECK_SHOULD_STOP(step, shouldStop);
}
}
}
return true;
}
#undef CHECK_SHOULD_STOP
}
// Executes every top-level execution step of `plan`, in order, on `ws`.
//
// All net definitions of the plan are indexed by name first; duplicate names
// fail a CAFFE_ENFORCE. A net that already exists in `ws` is flagged so that
// it is overridden the first time a step creates it.
//
// Returns true when the plan has no steps or when every step succeeded, and
// false at the first step for which ExecuteStepRecursive returns false.
bool RunPlanOnWorkspace(
    Workspace* ws,
    const PlanDef& plan,
    ShouldContinue shouldContinue) {
  LOG(INFO) << "Started executing plan " << plan.name();
  if (plan.execution_step_size() == 0) {
    LOG(WARNING) << "Nothing to run - did you define a correct plan?";
    // An empty plan is still a legal plan, so this counts as success.
    return true;
  }

  LOG(INFO) << "Initializing networks for plan " << plan.name();
  NetDefMap net_defs;
  for (const NetDef& net_def : plan.network()) {
    LOG(INFO) << "Processing net '" << net_def.name() << "', type: '"
              << net_def.type() << "', #ops: " << net_def.op_size()
              << ", num_workers: " << net_def.num_workers();
    CAFFE_ENFORCE(
        net_defs.count(net_def.name()) == 0,
        "Your plan contains networks of the same name \"",
        net_def.name(),
        "\", which should not happen. Check your plan to see "
        "if you made a programming error in creating the plan.");
    const bool netAlreadyExists = (ws->GetNet(net_def.name()) != nullptr);
    // The name is known to be absent at this point, so emplace always inserts.
    net_defs.emplace(net_def.name(), NetDefInfo{&net_def, netAlreadyExists});
  }

  WorkspaceIdInjector ws_id_injector;
  Timer totalTimer;
  for (const ExecutionStep& step : plan.execution_step()) {
    Timer stepTimer;
    ExecutionStepWrapper wrapper(
        &step, ws, shouldContinue, &net_defs, &ws_id_injector);
    const bool stepSucceeded = ExecuteStepRecursive(wrapper);
    if (!stepSucceeded) {
      LOG(ERROR) << "Failed initializing step " << step.name();
      return false;
    }
    LOG(INFO) << "Step " << step.name() << " in plan " << plan.name()
              << " took " << stepTimer.Seconds() << " seconds.";
  }

  LOG(INFO) << "Total plan " << plan.name() << " took "
            << totalTimer.Seconds() << " seconds.";
  LOG(INFO) << "Plan " << plan.name() << " executed successfully.";
  return true;
}
}