// Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20).
// Formatting updated by newer clang-format; see
// https://github.com/pytorch/pytorch/pull/153889
#include <unordered_map>
#include <unordered_set>

#include <c10/util/error.h>

#include <torch/csrc/profiler/perf-inl.h>
#include <torch/csrc/profiler/perf.h>
namespace torch::profiler::impl::linux_perf {

#if defined(__ANDROID__) || defined(__linux__)
/*
 * PerfEvent
 * ---------
 */
/*
 * Thin wrapper around the raw perf_event_open(2) syscall, which glibc does
 * not expose as a library function.
 */
static inline long perf_event_open(
    struct perf_event_attr* attr,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
// TODO sync with Kineto level abstract events in profiler/events.h
// Maps a user-facing event name to the (perf event type, event config) pair
// that is written into perf_event_attr by PerfEvent::Init().
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles", {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES}},
        {"instructions", {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS}},

        // Non Standard events for testing
        {"pagefaults", {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS}},
        {"backend-stall-cycles",
         {PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND}},
        {"frontend-stall-cycles",
         {PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND}}};
// Release the perf event fd owned by this object, if one was opened.
PerfEvent::~PerfEvent() {
  if (fd_ >= 0) {
    close(fd_);
  }
  // Poison the descriptor so accidental reuse is detectable.
  fd_ = -1;
}
void PerfEvent::Init() {
|
|
TORCH_CHECK(!name_.empty(), "Invalid profiler event name");
|
|
|
|
auto const it = EventTable.find(name_);
|
|
if (it == EventTable.end()) {
|
|
TORCH_CHECK(false, "Unsupported profiler event name: ", name_);
|
|
}
|
|
|
|
struct perf_event_attr attr{};
|
|
|
|
attr.size = sizeof(perf_event_attr);
|
|
attr.type = it->second.first;
|
|
attr.config = it->second.second;
|
|
attr.disabled = 1;
|
|
attr.inherit = 1;
|
|
attr.exclude_kernel = 1; // TBD
|
|
attr.exclude_hv = 1;
|
|
/*
|
|
* These can be used to calculate estimated totals if the PMU is overcommitted
|
|
* and multiplexing is happening
|
|
*/
|
|
attr.read_format =
|
|
PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
|
|
|
|
pid_t pid = getpid(); // this pid
|
|
int cpu = -1; // all cpus
|
|
int group_fd = -1;
|
|
unsigned long flags = 0;
|
|
|
|
fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
|
|
if (fd_ == -1) {
|
|
TORCH_CHECK(
|
|
false,
|
|
"perf_event_open() failed, error: ",
|
|
c10::utils::str_error(errno));
|
|
}
|
|
Reset();
|
|
}
|
|
|
|
uint64_t PerfEvent::ReadCounter() const {
|
|
PerfCounter counter{};
|
|
long n = read(fd_, &counter, sizeof(PerfCounter));
|
|
TORCH_CHECK(
|
|
n == sizeof(counter),
|
|
"Read failed for Perf event fd, event : ",
|
|
name_,
|
|
", error: ",
|
|
c10::utils::str_error(errno));
|
|
TORCH_CHECK(
|
|
counter.time_enabled == counter.time_running,
|
|
"Hardware performance counter time multiplexing is not handled yet",
|
|
", name: ",
|
|
name_,
|
|
", enabled: ",
|
|
counter.time_enabled,
|
|
", running: ",
|
|
counter.time_running);
|
|
return counter.value;
|
|
}
|
|
|
|
#else /* __ANDROID__ || __linux__ */
/*
 * Shim class for unsupported platforms - this will always return 0 counter
 * value
 */
// No-op on unsupported platforms: no fd was ever opened.
PerfEvent::~PerfEvent() = default;
// No-op shim: perf events are only wired up on Linux/Android.
void PerfEvent::Init() {}
// Shim always reports a zero counter value on unsupported platforms.
uint64_t PerfEvent::ReadCounter() const {
  return 0;
}
#endif /* __ANDROID__ || __linux__ */

/*
 * PerfProfiler
 * ------------
 */
void PerfProfiler::Configure(std::vector<std::string>& event_names) {
|
|
TORCH_CHECK(
|
|
event_names.size() <= MAX_EVENTS,
|
|
"Too many events to configure, configured: ",
|
|
event_names.size(),
|
|
", max allowed:",
|
|
MAX_EVENTS);
|
|
std::unordered_set<std::string> s(event_names.begin(), event_names.end());
|
|
TORCH_CHECK(
|
|
s.size() == event_names.size(), "Duplicate event names are not allowed!")
|
|
for (auto name : event_names) {
|
|
events_.emplace_back(name);
|
|
events_.back().Init();
|
|
}
|
|
|
|
// TODO
|
|
// Reset pthreadpool here to make sure we can attach to new children
|
|
// threads
|
|
}
|
|
|
|
void PerfProfiler::Enable() {
|
|
if (!start_values_.empty()) {
|
|
StopCounting();
|
|
}
|
|
|
|
start_values_.emplace(events_.size(), 0);
|
|
|
|
auto& sv = start_values_.top();
|
|
for (unsigned i = 0; i < events_.size(); ++i) {
|
|
sv[i] = events_[i].ReadCounter();
|
|
}
|
|
StartCounting();
|
|
}
|
|
|
|
/*
 * Stop counting and write the per-event deltas since the matching Enable()
 * into vals. If an outer measurement is still on the stack, counting is
 * resumed for it.
 */
void PerfProfiler::Disable(perf_counters_t& vals) {
  StopCounting();
  TORCH_CHECK(
      vals.size() == events_.size(),
      "Can not fit all perf counters in the supplied container");
  TORCH_CHECK(
      !start_values_.empty(), "PerfProfiler must be enabled before disabling");

  /* Always connecting this disable event to the last enable event i.e. using
   * whatever is on the top of the start counter value stack. */
  auto& baseline = start_values_.top();
  for (size_t idx = 0; idx < events_.size(); ++idx) {
    vals[idx] = CalcDelta(baseline[idx], events_[idx].ReadCounter());
  }
  start_values_.pop();

  // Restore it for a parent
  if (!start_values_.empty()) {
    StartCounting();
  }
}
} // namespace torch::profiler::impl::linux_perf