#pragma once

#include <iostream>
#include <mutex>
#include <memory>
#include <vector>
#include <cstdint>
#include <string>
#include <sstream>
#include <forward_list>
#include <tuple>
#include <functional> // for std::function (CUDAStubs::onEachDevice)
#include <chrono>     // for std::chrono (getTime() on Windows)

#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#ifndef _WIN32
#include <ctime>
#endif

#include <torch/csrc/autograd/record_function.h>

typedef struct CUevent_st* CUDAEventStub;

namespace torch { namespace autograd {

struct Function;

namespace profiler {

struct TORCH_API CUDAStubs {
  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
    fail();
  }
  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
    fail();
    return 0.f;
  }
  virtual void nvtxMarkA(const char* name) {
    fail();
  }
  virtual void nvtxRangePushA(const char* name) {
    fail();
  }
  virtual void nvtxRangePop() {
    fail();
  }
  virtual bool enabled() {
    return false;
  }
  virtual void onEachDevice(std::function<void(int)> op) {
    fail();
  }
  virtual void synchronize() {
    fail();
  }
  virtual ~CUDAStubs();

private:
  void fail() {
    AT_ERROR("CUDA used in profiler but not enabled.");
  }
};

TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
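
// Illustrative sketch (not part of this header): a CUDA build is expected to
// provide a concrete CUDAStubs subclass and register it, roughly like:
//
//   struct MyCUDAMethods : public CUDAStubs {   // hypothetical name
//     void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) override { /* record CUDA event + CPU timestamp */ }
//     float elapsed(CUDAEventStub e1, CUDAEventStub e2) override { /* elapsed ms between events */ return 0.f; }
//     bool enabled() override { return true; }
//     // ... remaining virtuals ...
//   };
//   static MyCUDAMethods methods;
//   registerCUDAMethods(&methods);   // called once at library load time
//
// Until such a registration happens, the default stubs above raise
// "CUDA used in profiler but not enabled." for any CUDA-profiling call.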

constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return ((a + b - 1) / b) * b;
}
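
// For example, ceilToMultiple(10, 8) == 16 and ceilToMultiple(16, 8) == 16:
// `a` is rounded up to the next multiple of `b`. RangeEventList below uses
// this to carve a fixed-size block into whole, properly aligned Event slots.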

#if defined(__MACH__) && !defined(CLOCK_REALTIME)
#include <sys/time.h>
// clock_gettime is not implemented on older versions of OS X (< 10.12).
// If implemented, CLOCK_REALTIME will have already been defined.
#endif

inline int64_t getTime() {
#ifdef _WIN32
  using namespace std::chrono;
  using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
#elif defined(__MACH__) && !defined(CLOCK_REALTIME)
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<int64_t>(now.tv_sec) * 1000000000 + static_cast<int64_t>(now.tv_usec) * 1000;
#else
  // clock_gettime is *much* faster than the std::chrono implementation on Linux
  struct timespec t{};
  clock_gettime(CLOCK_MONOTONIC, &t);
  return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
#endif
}
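
// getTime() returns nanoseconds from an arbitrary, platform-dependent epoch,
// so only differences between two calls are meaningful. Illustrative use:
//
//   int64_t start_ns = getTime();
//   // ... work to be timed ...
//   double elapsed_us = (getTime() - start_ns) / 1000.0;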

// Old GCC versions generate warnings incorrectly
// see https://stackoverflow.com/questions/2463113/g-c0x-enum-class-compiler-warnings
#ifndef _MSC_VER
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif
enum class TORCH_API ProfilerState {
  Disabled,
  CPU,  // CPU-only profiling
  CUDA, // CPU + CUDA events
  NVTX, // only emit NVTX markers
};

struct TORCH_API ProfilerConfig {
  ProfilerConfig(ProfilerState state, bool report_input_shapes)
      : state(state), report_input_shapes(report_input_shapes) {}
  ~ProfilerConfig();
  ProfilerState state;
  bool report_input_shapes;
};

enum class TORCH_API EventKind : uint16_t {
  Mark,
  PushRange,
  PopRange
};
#ifndef _MSC_VER
# pragma GCC diagnostic pop
#endif

struct TORCH_API Event final {
  Event(
      EventKind kind,
      StringView name,
      uint16_t thread_id,
      bool record_cuda,
      std::vector<std::vector<int64_t>>&& shapes = {})
      : name_(std::move(name)),
        kind_(kind),
        thread_id_(thread_id),
        shapes_(shapes) {
    record(record_cuda);
  }

  void record(bool record_cuda);
  std::string kind() const {
    switch (kind_) {
      case EventKind::Mark: return "mark";
      case EventKind::PushRange: return "push";
      case EventKind::PopRange: return "pop";
    }
    throw std::runtime_error("unknown EventKind");
  }
  const char* name() const {
    return name_.str();
  }
  uint16_t thread_id() const {
    return thread_id_;
  }
  std::vector<std::vector<int64_t>> shapes() const {
    return shapes_;
  }
  double cpu_elapsed_us(const Event& e) {
    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
  }
  double cuda_elapsed_us(const Event& e);
  bool has_cuda() const {
    return event != nullptr;
  }
  int device() const {
    return device_;
  }

private:
  // signed to allow for negative intervals, initialized for safety.
  int64_t cpu_ns_ = 0;
  StringView name_;
  EventKind kind_;
  uint16_t thread_id_;
  std::vector<std::vector<int64_t>> shapes_;
  int device_ = -1;
  struct CUevent_st* event = nullptr;
};
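
// Elapsed time is computed between two Events, typically the matching
// PushRange/PopRange pair for one profiled scope. Illustrative use, assuming
// `push` and `pop` are such a pair recorded on the same thread:
//
//   double wall_us = push.cpu_elapsed_us(pop);
//   double gpu_us  = (push.has_cuda() && pop.has_cuda())
//       ? push.cuda_elapsed_us(pop) : 0.0;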

// A linked list of fixed-sized vectors, used to avoid a std::vector resize
// from taking a large amount of time inside a profiling event.
struct RangeEventList {
  constexpr static size_t MB = 1024 * 1024;
  constexpr static size_t event_block_size = 16 * MB;
  constexpr static size_t num_block_elements =
      event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
  static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
                "num_block_elements is calculated incorrectly");
  using block_type = std::vector<Event>;

  void allocBlock() {
    blocks.emplace_front();
    auto& new_block = blocks.front();
    new_block.reserve(num_block_elements);
    // Materialize all pages in the new block to reduce jitter when recording events.
    const char* const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
    for (volatile const char* ptr = reinterpret_cast<char*>(new_block.data());
         ptr < end_ptr; ptr += 4 * 1024) {
      (*ptr);
    }
  }

  template <typename... Args>
  void record(Args&&... args) {
    if (blocks.empty() || blocks.front().size() == num_block_elements) {
      allocBlock();
    }
    blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> consolidate() {
    std::vector<Event> result;
    for (auto& block : blocks) {
      result.insert(result.begin(),
                    std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    blocks.clear();
    return result;
  }

  std::forward_list<block_type> blocks;
};
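
// Typical lifecycle (illustrative; the arguments mirror the Event constructor
// above, and the thread id of 0 is just a placeholder):
//
//   RangeEventList list;
//   list.record(EventKind::Mark, StringView("step"), /*thread_id=*/0, /*record_cuda=*/false);
//   std::vector<Event> events = list.consolidate();
//
// record() appends into the front block, allocating a fresh 16 MiB block
// whenever the current one is full; consolidate() moves everything into a
// single vector and clears the list.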

TORCH_API RangeEventList& getEventList();
TORCH_API void mark(std::string name, bool include_cuda = true);
TORCH_API void pushRange(std::string name);
TORCH_API void popRange();
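
// Illustrative use of the range helpers above; Events are only recorded
// while the profiler is enabled (see enableProfiler() below):
//
//   pushRange("forward");
//   // ... profiled work ...
//   popRange();
//   mark("iteration_done");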

using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
// no autograd functions are being executed when these functions are used.
TORCH_API void enableProfiler(ProfilerConfig);
TORCH_API thread_event_lists disableProfiler();
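
// Illustrative end-to-end flow (the local names are hypothetical):
//
//   enableProfiler(ProfilerConfig(ProfilerState::CPU, /*report_input_shapes=*/false));
//   // ... run the workload to be profiled ...
//   thread_event_lists lists = disableProfiler();
//   for (const auto& thread_events : lists) {
//     for (const auto& evt : thread_events) {
//       std::cout << evt.kind() << " " << evt.name() << "\n";
//     }
//   }
//
// RecordProfile below provides a scoped alternative that writes the collected
// events to a chrome://tracing file.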

// Usage:
//   {
//     RecordProfile guard("filename.trace");
//     // code you want to profile
//   }
// Then open filename.trace in chrome://tracing
struct TORCH_API RecordProfile {
  RecordProfile(std::ostream& out);
  RecordProfile(const std::string& filename);

  ~RecordProfile();

private:
  void init();
  std::unique_ptr<std::ofstream> file_;
  std::ostream& out_;
  void processEvents(const std::vector<Event*>& events);
};

} // namespace profiler
}} // namespace torch::autograd