Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/33851
Rationale and context described in #33828.
Script to reproduce the move: https://gist.github.com/suo/16cbefaaeb67ca5a7c6caffd49b7f6e9
ghstack-source-id: 99079645
Test Plan: Make sure CI passes
Reviewed By: jamesr66a
Differential Revision: D20133869
fbshipit-source-id: 390e9241a9c85366d9005c492ac31f10aa96488e
343 lines
9.5 KiB
C++
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/jit/frontend/code_template.h>

#include <cstring>
#include <fstream>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace torch { namespace autograd { namespace profiler {

CUDAStubs default_stubs;
constexpr CUDAStubs* default_stubs_addr = &default_stubs;
// constant initialization, so it is guaranteed to be initialized before
// static initialization calls which may invoke registerCUDAMethods
static CUDAStubs* cuda_stubs = default_stubs_addr;

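// The no-op default stubs are swapped for real CUDA implementations via
// registerCUDAMethods, typically from a static initializer in the CUDA
// build of the profiler (this mechanism is implied by, not shown in, this
// file).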
void registerCUDAMethods(CUDAStubs* stubs) {
  cuda_stubs = stubs;
}

ProfilerState state = ProfilerState::Disabled;
uint16_t next_thread_id = 0;
// Protects access to next_thread_id and all_event_lists_map.
std::mutex all_event_lists_map_mutex;
std::unordered_map<uint16_t, std::shared_ptr<RangeEventList>>
    all_event_lists_map;
thread_local std::shared_ptr<RangeEventList> event_list;
thread_local uint16_t thread_id;

uint16_t getThreadId() {
  return thread_id;
}

ProfilerConfig::~ProfilerConfig() = default;

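// Lazily creates the calling thread's RangeEventList on first use, assigns
// it a fresh thread_id, and registers it in all_event_lists_map so that
// disableProfiler can later consolidate events across all threads.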
RangeEventList& getEventList() {
  if (!event_list) {
    std::lock_guard<std::mutex> guard(all_event_lists_map_mutex);
    event_list = std::make_shared<RangeEventList>();
    thread_id = ++next_thread_id;
    all_event_lists_map.emplace(thread_id, event_list);
  }
  return *event_list;
}

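// Records a single instantaneous marker event. Under NVTX this forwards to
// nvtxMarkA; otherwise the mark is appended to this thread's event list.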
void mark(std::string name, bool include_cuda /* = true */) {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    cuda_stubs->nvtxMarkA(name.c_str());
  } else {
    getEventList().record(
        EventKind::Mark,
        StringView(std::move(name)),
        thread_id,
        include_cuda && state == ProfilerState::CUDA);
  }
}

bool profilerEnabled() {
  return state != ProfilerState::Disabled;
}

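// Opens a profiling range. Under NVTX, the range name is augmented with the
// autograd sequence number and input shapes when available, producing
// annotations such as (illustrative): my_op, seq = 42, sizes = [[64, 3], []].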
void pushRangeImpl(
    const StringView& name,
    const char* msg = "",
    int64_t sequence_nr = -1,
    std::vector<std::vector<int64_t>>&& shapes = {}) {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    if (sequence_nr >= 0 || !shapes.empty()) {
      std::stringstream s;
      if (sequence_nr >= 0) {
        s << name.str() << msg << sequence_nr;
      }
      if (!shapes.empty()) {
        s << ", sizes = [";
        for (size_t i = 0; i < shapes.size(); i++) {
          if (!shapes[i].empty()) {
            s << "[";
            for (size_t dim = 0; dim < shapes[i].size(); dim++) {
              s << shapes[i][dim];
              if (dim < shapes[i].size() - 1) {
                s << ", ";
              }
            }
            s << "]";
          } else {
            s << "[]";
          }
          if (i < shapes.size() - 1) {
            s << ", ";
          }
        }
        s << "]";
      }
      cuda_stubs->nvtxRangePushA(s.str().c_str());
    } else {
      cuda_stubs->nvtxRangePushA(name.str());
    }
  } else {
    getEventList().record(
        EventKind::PushRange,
        name,
        thread_id,
        state == ProfilerState::CUDA,
        std::move(shapes));
  }
}

void pushRange(std::string name) {
  pushRangeImpl(StringView(std::move(name)));
}

void popRange() {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    cuda_stubs->nvtxRangePop();
  } else {
    getEventList().record(
        EventKind::PopRange,
        StringView(""),
        thread_id,
        state == ProfilerState::CUDA);
  }
}

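// Turns on profiling with the given config. This installs RecordFunction
// start/end callbacks that translate op invocations into push/pop range
// events (or NVTX annotations), then records warm-up and start markers.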
void enableProfiler(ProfilerConfig config) {
  ProfilerState new_state = config.state;
  AT_ASSERT(new_state != ProfilerState::Disabled);
  if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled()) {
    throw std::runtime_error(
        "Can't use NVTX profiler - PyTorch was compiled without CUDA");
  }
  if (state != ProfilerState::Disabled && new_state != state) {
    throw std::runtime_error(
        "can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
  }

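  // The start callback opens a range (optionally capturing input shapes);
  // the end callback closes it, taking care to record the pop on the event
  // list of the thread that originally started the range.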
  pushCallback(
      [config](const RecordFunction& fn) {
        auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
        if (config.report_input_shapes) {
          std::vector<std::vector<int64_t>> inputSizes;
          inputSizes.reserve(fn.inputs().size());
          for (const c10::IValue& input : fn.inputs()) {
            if (!input.isTensor()) {
              inputSizes.emplace_back();
              continue;
            }
            const at::Tensor& tensor = input.toTensor();
            if (tensor.defined()) {
              inputSizes.push_back(tensor.sizes().vec());
            } else {
              inputSizes.emplace_back();
            }
          }
          pushRangeImpl(fn.name(), msg, fn.seqNr(), std::move(inputSizes));
        } else {
          pushRangeImpl(fn.name(), msg, fn.seqNr(), {});
        }
      },
      [](const RecordFunction& fn) {
        if (fn.getThreadId() != 0) {
          // If we've overridden the thread_id on the RecordFunction, then
          // find the event list that was created for the original thread_id,
          // and record the end event on that list so that the block is added
          // to the correct list rather than to a new one. This should only
          // run when calling RecordFunction::end() in a different thread.
          if (state == ProfilerState::Disabled) {
            return;
          } else {
            std::lock_guard<std::mutex> guard(all_event_lists_map_mutex);
            const auto& eventListIter =
                all_event_lists_map.find(fn.getThreadId());
            TORCH_INTERNAL_ASSERT(
                eventListIter != all_event_lists_map.end(),
                "Did not find thread_id matching ",
                fn.getThreadId());

            auto& eventList = eventListIter->second;
            eventList->record(
                EventKind::PopRange,
                StringView(""),
                fn.getThreadId(),
                state == ProfilerState::CUDA);
          }
        } else {
          popRange();
        }
      },
      config.report_input_shapes);
  state = new_state;

  if (state == ProfilerState::CUDA) {
    // Event recording appears to have some startup overhead, so we generate
    // some dummy events first before recording synchronization events.
    for (int i = 0; i < 5; i++) {
      cuda_stubs->onEachDevice([](int d) {
        mark("__cuda_startup");
        cuda_stubs->synchronize();
      });
    }

    // CUDA events must be on the same device, so we need a start event
    // recorded for each GPU. We then use this event to synchronize time
    // on the GPU with the CPU clock.
    cuda_stubs->onEachDevice([](int d) {
      mark("__cuda_start_event");
    });
  }
  mark("__start_profile", false);
}

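// Stops profiling and returns the consolidated per-thread event lists.
// Map entries whose RangeEventList is no longer referenced by any live
// thread are garbage-collected on the way out.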
thread_event_lists disableProfiler() {
  if (state == ProfilerState::Disabled) {
    throw std::runtime_error("can't disable profiler when it's not running");
  }
  ProfilerState old_state = state;
  mark("__stop_profile");

  popCallback();
  state = ProfilerState::Disabled;

  if (old_state == ProfilerState::NVTX) {
    return thread_event_lists();
  } else {
    thread_event_lists result;
    std::lock_guard<std::mutex> guard(all_event_lists_map_mutex);
    for (auto it = all_event_lists_map.begin();
         it != all_event_lists_map.end();) {
      auto& list = it->second;
      result.emplace_back(list->consolidate());
      // GC lists that are not held by any threads.
      if (list.use_count() == 1) {
        it = all_event_lists_map.erase(it);
      } else {
        ++it;
      }
    }
    return result;
  }
}

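// Captures a timestamp for this event: a CUDA event plus the CPU time when
// record_cuda is set, otherwise just the CPU clock.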
void Event::record(bool record_cuda) {
  if (record_cuda) {
    cuda_stubs->record(&device_, &event, &cpu_ns_);
    return;
  }
  cpu_ns_ = getTime();
}

double Event::cuda_elapsed_us(const Event& e) {
  if (!e.has_cuda() || !has_cuda()) {
    throw std::logic_error("Events were not recorded for CUDA");
  }
  if (e.device() != device()) {
    throw std::logic_error("Events are not on the same device");
  }
  return cuda_stubs->elapsed(event, e.event);
}

CUDAStubs::~CUDAStubs() = default;

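// Template for a single event in the Chrome Trace Event JSON format
// (viewable in chrome://tracing). "ph": "X" denotes a complete event
// described by a start timestamp ("ts") and a duration ("dur"), both in
// microseconds.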
static jit::CodeTemplate event_template(R"(
{
  "name": "${name}",
  "ph": "X",
  "ts": ${ts},
  "dur": ${dur},
  "tid": ${tid},
  "pid": "CPU Functions",
  "args": {}
})");

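// RecordProfile is an RAII guard: construction enables the CPU profiler and
// destruction disables it and writes the collected events as a Chrome trace.
// A minimal usage sketch (model/inputs are hypothetical placeholders):
//
//   {
//     torch::autograd::profiler::RecordProfile guard("trace.json");
//     auto out = model.forward(inputs);  // profiled region
//   }  // guard's destructor writes trace.json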
RecordProfile::RecordProfile(std::ostream& out) : out_(out) {
  init();
}

RecordProfile::RecordProfile(const std::string& filename)
    : file_(new std::ofstream(filename)), out_(*file_) {
  init();
}

void RecordProfile::init() {
  enableProfiler(ProfilerConfig(ProfilerState::CPU, false /* report shapes */));
}

RecordProfile::~RecordProfile() {
  thread_event_lists event_lists = disableProfiler();
  std::vector<Event*> events;
  for (auto& l : event_lists) {
    for (auto& e : l) {
      events.push_back(&e);
    }
  }
  processEvents(events);
  if (file_) {
    file_->close();
  }
}

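// Converts the flat event stream into Chrome trace JSON. Push/pop events
// are matched with a stack, and timestamps are expressed relative to the
// "__start_profile" marker. Assumes push and pop events are balanced.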
void RecordProfile::processEvents(const std::vector<Event*>& events) {
  TORCH_CHECK(out_, "could not open file");
  Event* start = nullptr;
  for (Event* e : events) {
    if (0 == strcmp(e->name(), "__start_profile")) {
      start = e;
      break;
    }
  }
  TORCH_CHECK(start, "could not find start?");
  std::vector<Event*> stack;
  out_ << "[\n";
  bool first = true;
  for (Event* e : events) {
    if (e->kind() == "push") {
      stack.push_back(e);
    } else if (e->kind() == "pop") {
      if (!first) {
        out_ << ",\n";
      }
      first = false;
      Event* e_start = stack.back();
      stack.pop_back();
      jit::TemplateEnv env;
      env.s("name", e_start->name());
      env.d("ts", start->cpu_elapsed_us(*e_start));
      env.d("dur", e_start->cpu_elapsed_us(*e));
      env.d("tid", e_start->thread_id());
      out_ << event_template.format(env);
    }
  }
  out_ << "]\n";
}

}}} // namespace torch::autograd::profiler