mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17481 Usually, feature macros are either defined or undefined and checked accordingly. C10_MOBILE was a weird special case that was always defined but either defined to 1 or to 0. This caused a lot of confusion for me when trying to disable something from mobile build and it also disabled it from the server build (because I was using ifdef). Also, I found a place in the existing code base that made that wrong assumption and used the macro wrongly, see https://fburl.com/y4icohts Reviewed By: dzhulgakov Differential Revision: D14214825 fbshipit-source-id: f3a155b6d43d334e8839e2b2e3c40ed2c773eab6
210 lines
6.4 KiB
C++
210 lines
6.4 KiB
C++
#include "observers/perf_observer.h"
|
|
#include "observers/observer_config.h"
|
|
#ifndef C10_MOBILE
|
|
#include "caffe2/core/flags.h"
|
|
#include "observers/net_observer_reporter_print.h"
|
|
#endif
|
|
|
|
#include <random>
|
|
#include "caffe2/core/common.h"
|
|
#include "caffe2/core/init.h"
|
|
#include "caffe2/core/operator.h"
|
|
|
|
#ifndef C10_MOBILE
|
|
C10_DEFINE_int64(
|
|
aiBench_netInitSampleRate,
|
|
0,
|
|
"One in N sampling rate for net delay");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_netFollowupSampleRate,
|
|
0,
|
|
"One in N sampling rate for net delay");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_netFollowupSampleCount,
|
|
0,
|
|
"control the following c logs");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_operatorNetSampleRatio,
|
|
0,
|
|
"One in N sampling rate for operator delay");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_skipIters,
|
|
0,
|
|
"skip the first N iterations of the net run");
|
|
#endif
|
|
|
|
namespace caffe2 {
|
|
namespace {
|
|
|
|
bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
|
|
AddGlobalNetObserverCreator([](NetBase* subject) {
|
|
return caffe2::make_unique<PerfNetObserver>(subject);
|
|
});
|
|
|
|
#if !defined(C10_MOBILE)
|
|
// for aibench usage
|
|
caffe2::ObserverConfig::setReporter(
|
|
caffe2::make_unique<caffe2::NetObserverReporterPrint>());
|
|
|
|
caffe2::ObserverConfig::initSampleRate(
|
|
FLAGS_aiBench_netInitSampleRate,
|
|
FLAGS_aiBench_netFollowupSampleRate,
|
|
FLAGS_aiBench_netFollowupSampleCount,
|
|
FLAGS_aiBench_operatorNetSampleRatio,
|
|
FLAGS_aiBench_skipIters);
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
} // namespace
|
|
|
|
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
|
|
registerGlobalPerfNetObserverCreator,
|
|
®isterGlobalPerfNetObserverCreator,
|
|
"Caffe2 net global observer creator");
|
|
|
|
PerfNetObserver::PerfNetObserver(NetBase* subject_)
|
|
: NetObserver(subject_), numRuns_(0) {}
|
|
|
|
PerfNetObserver::~PerfNetObserver() {}
|
|
|
|
void PerfNetObserver::Start() {
|
|
static int visitCount = 0;
|
|
// Select whether to log the operator or the net.
|
|
// We have one sample rate for the entire app.
|
|
int netInitSampleRate = ObserverConfig::getNetInitSampleRate();
|
|
int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate();
|
|
int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount();
|
|
int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio();
|
|
int skipIters = ObserverConfig::getSkipIters();
|
|
int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate;
|
|
if (skipIters <= numRuns_ && sampleRate > 0 && rand() % sampleRate == 0) {
|
|
visitCount++;
|
|
if (visitCount == netFollowupSampleCount) {
|
|
visitCount = 0;
|
|
}
|
|
if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) {
|
|
logType_ = PerfNetObserver::OPERATOR_DELAY;
|
|
} else {
|
|
logType_ = PerfNetObserver::NET_DELAY;
|
|
}
|
|
} else {
|
|
logType_ = PerfNetObserver::NONE;
|
|
}
|
|
numRuns_++;
|
|
|
|
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
|
|
/* Always recreate new operator observers
|
|
whenever we measure operator delay */
|
|
const auto& operators = subject_->GetOperators();
|
|
for (auto* op : operators) {
|
|
observerMap_[op] = op->AttachObserver(
|
|
caffe2::make_unique<PerfOperatorObserver>(op, this));
|
|
}
|
|
}
|
|
|
|
if (logType_ != PerfNetObserver::NONE) {
|
|
/* Only start timer when we need to */
|
|
timer_.Start();
|
|
}
|
|
}
|
|
|
|
void PerfNetObserver::Stop() {
|
|
if (logType_ == PerfNetObserver::NONE) {
|
|
return;
|
|
}
|
|
auto currentRunTime = timer_.MilliSeconds();
|
|
std::map<std::string, PerformanceInformation> info;
|
|
PerformanceInformation net_perf;
|
|
net_perf.latency = currentRunTime;
|
|
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
|
|
const auto& operators = subject_->GetOperators();
|
|
for (int idx = 0; idx < operators.size(); ++idx) {
|
|
const auto* op = operators[idx];
|
|
auto name = getObserverName(op, idx);
|
|
PerformanceInformation p;
|
|
|
|
p.latency = static_cast<const PerfOperatorObserver*>(observerMap_[op])
|
|
->getMilliseconds();
|
|
|
|
p.engine = op->engine();
|
|
p.type = op->type();
|
|
p.tensor_shapes =
|
|
static_cast<const PerfOperatorObserver*>(observerMap_[op])
|
|
->getTensorShapes();
|
|
|
|
if (op->has_debug_def()) {
|
|
for (auto arg : op->debug_def().arg()) {
|
|
p.args.emplace_back(arg);
|
|
}
|
|
}
|
|
|
|
info.insert({name, p});
|
|
}
|
|
|
|
/* clear all operator delay after use so that we don't spent time
|
|
collecting the operator delay info in later runs */
|
|
for (auto* op : operators) {
|
|
op->DetachObserver(observerMap_[op]);
|
|
}
|
|
observerMap_.clear();
|
|
}
|
|
info.insert({"NET_DELAY", net_perf});
|
|
ObserverConfig::getReporter()->report(subject_, info);
|
|
}
|
|
|
|
caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
|
|
const {
|
|
string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE";
|
|
string displayName =
|
|
(op->has_debug_def() ? op->debug_def().name().size()
|
|
? op->debug_def().name()
|
|
: (op->debug_def().output_size() ? op->debug_def().output(0)
|
|
: "NO_OUTPUT")
|
|
: "NO_DEF");
|
|
caffe2::string name =
|
|
"ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
|
|
return name;
|
|
}
|
|
|
|
PerfOperatorObserver::PerfOperatorObserver(
|
|
OperatorBase* op,
|
|
PerfNetObserver* netObserver)
|
|
: ObserverBase<OperatorBase>(op),
|
|
netObserver_(netObserver),
|
|
milliseconds_(0) {
|
|
CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
|
|
}
|
|
|
|
PerfOperatorObserver::~PerfOperatorObserver() {}
|
|
|
|
void PerfOperatorObserver::Start() {
|
|
/* Get the time from the start of the net minus the time spent
|
|
in previous invocations. It is the time spent on other operators.
|
|
This way, when the operator finishes, the time from the start of the net
|
|
minus the time spent in all other operators is the total time on this
|
|
operator. This is done to avoid saving a timer in each operator */
|
|
milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_;
|
|
}
|
|
|
|
void PerfOperatorObserver::Stop() {
|
|
/* Time from the start of the net minus the time spent on all other
|
|
operators is the time spent on this operator */
|
|
milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_;
|
|
tensor_shapes_ = subject_->InputTensorShapes();
|
|
}
|
|
|
|
double PerfOperatorObserver::getMilliseconds() const {
|
|
return milliseconds_;
|
|
}
|
|
|
|
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
|
|
return tensor_shapes_;
|
|
}
|
|
|
|
} // namespace caffe2
|