mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
This is follow-up of #164912 to mark unused C++ parameters to improve code readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165121 Approved by: https://github.com/Skylion007
226 lines
8.1 KiB
C++
226 lines
8.1 KiB
C++
#pragma once
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <torch/csrc/profiler/api.h>
|
|
#include <torch/csrc/profiler/events.h>
|
|
#include <torch/csrc/profiler/stubs/base.h>
|
|
#include <torch/csrc/profiler/util.h>
|
|
|
|
namespace torch {
|
|
|
|
namespace profiler::impl {
|
|
struct Result;
|
|
namespace kineto {
|
|
struct ActivityTraceWrapper;
|
|
} // namespace kineto
|
|
} // namespace profiler::impl
|
|
|
|
namespace autograd::profiler {
|
|
using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;
|
|
using extra_meta_t = std::unordered_map<std::string, std::string>;
|
|
|
|
struct TORCH_API KinetoEvent {
|
|
KinetoEvent(
|
|
const std::shared_ptr<const torch::profiler::impl::Result>& /*result*/,
|
|
const bool verbose);
|
|
|
|
uint64_t startThreadId() const;
|
|
uint64_t endThreadId() const;
|
|
uint8_t activityType() const;
|
|
uint64_t fwdThreadId() const;
|
|
bool hasShapes() const;
|
|
const c10::ArrayRef<std::vector<int64_t>> shapes() const;
|
|
bool hasTypes() const;
|
|
const c10::ArrayRef<std::string> dtypes() const;
|
|
bool hasConcreteInputs() const;
|
|
const c10::ArrayRef<c10::IValue> concreteInputs() const;
|
|
bool hasKwinputs() const;
|
|
bool isHiddenEvent() const;
|
|
const std::unordered_map<std::string, c10::IValue> kwinputs() const;
|
|
uint64_t flops() const;
|
|
int64_t sequenceNr() const;
|
|
bool hasStack() const;
|
|
const c10::ArrayRef<std::string> stack() const;
|
|
uint8_t scope() const;
|
|
bool hasModuleHierarchy() const;
|
|
const c10::ArrayRef<std::string> moduleHierarchy() const;
|
|
int64_t debugHandle() const;
|
|
std::string name() const;
|
|
std::string overload_name() const;
|
|
c10::DeviceType deviceType() const;
|
|
int deviceIndex() const;
|
|
int64_t nBytes() const;
|
|
uint64_t startNs() const;
|
|
uint64_t endNs() const;
|
|
uint64_t durationNs() const;
|
|
bool isAsync() const;
|
|
uint64_t correlationId() const;
|
|
uint64_t linkedCorrelationId() const;
|
|
int64_t deviceResourceId() const;
|
|
std::string backend() const;
|
|
bool isPythonFunction() const;
|
|
int64_t cudaElapsedUs() const;
|
|
int64_t privateuse1ElapsedUs() const;
|
|
void getPerfEventCounters(torch::profiler::perf_counters_t& /*in*/) const;
|
|
extra_meta_t extraMeta() const;
|
|
std::string metadataJson() const;
|
|
|
|
private:
|
|
torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const;
|
|
torch::profiler::impl::ProfilerVoidEventStub fallbackEnd() const;
|
|
|
|
std::shared_ptr<const torch::profiler::impl::Result> result_;
|
|
std::vector<std::string> python_stack_;
|
|
|
|
// Copy fields from result so we can return ArrayRefs.
|
|
std::vector<std::vector<int64_t>> shapes_;
|
|
std::vector<std::string> dtypes_;
|
|
std::vector<c10::IValue> concrete_inputs_;
|
|
std::unordered_map<std::string, c10::IValue> kwinputs_;
|
|
};
|
|
|
|
// Consolidating events returned directly from Kineto
|
|
// with events manually created by us (e.g. start/stop marks,
|
|
// memory allocation events)
|
|
struct TORCH_API ProfilerResult {
|
|
ProfilerResult();
|
|
ProfilerResult(
|
|
uint64_t start_time,
|
|
std::vector<KinetoEvent> events,
|
|
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
|
|
trace,
|
|
std::vector<experimental_event_t>&& event_tree);
|
|
~ProfilerResult();
|
|
|
|
uint64_t trace_start_ns() const {
|
|
return trace_start_ns_;
|
|
}
|
|
|
|
const std::vector<KinetoEvent>& events() const {
|
|
return events_;
|
|
}
|
|
|
|
const std::vector<experimental_event_t>& event_tree() const {
|
|
return event_tree_;
|
|
}
|
|
|
|
void save(const std::string& path);
|
|
|
|
private:
|
|
uint64_t trace_start_ns_ = 0;
|
|
std::vector<KinetoEvent> events_;
|
|
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
|
|
std::vector<experimental_event_t> event_tree_;
|
|
};
|
|
|
|
/*
|
|
* This API is used by backends to record latency of events that
|
|
* happened in the backend but were not visible to pytorch runtime.
|
|
* For example, if part of the model is lowered to a dsp backend, then
|
|
* the execution of that part of the model is delegated to the backend.
|
|
* When backend finishes execution it has an option to provide profiling
|
|
* information (latency only at the moment) corresponding to different operators
|
|
* that were executed in the backend.
|
|
* When such events are recorded by backend using this API, the event
|
|
* records will be collected by active kineto profiler. If no kineto profiler
|
|
* is active then the event is ignored.
|
|
* This provides us with a way to generate all the profiling information
|
|
* for a model regardless of where model (or part of it) executed.
|
|
* @param start_time_us: start time in us of the event
|
|
* @param end_time_us: end time in us of the event
|
|
* @param debug_handle: debug handle to correlate this event/op with
|
|
* model level module/source information
|
|
* @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
|
|
* @param event_name: name of the event, e.g. op name
|
|
* @param backend_name: name of the backend where the event took place.
|
|
*/
|
|
TORCH_API void reportBackendEventToActiveKinetoProfiler(
|
|
const int64_t start_time_us,
|
|
const int64_t end_time_us,
|
|
const int64_t debug_handle,
|
|
const at::RecordScope scope,
|
|
const std::string& event_name,
|
|
const std::string& backend_name);
|
|
|
|
TORCH_API void enableProfiler(
|
|
const torch::profiler::impl::ProfilerConfig& config,
|
|
const std::set<torch::profiler::impl::ActivityType>& activities,
|
|
const std::unordered_set<at::RecordScope>& scopes = {});
|
|
|
|
/*
|
|
* Same as enableProfiler but with callback to do post-processing of
|
|
* KinetoEvents.
|
|
* enableProfilerWithEventPostProcess enables profiler to capture
|
|
* specified activities, with specified RecordFunction scope, if any.
|
|
* Additionally, it takes a functor that does in-place post processing of
|
|
* events, e.g. populate stack trace or module hierarchy information lazily
|
|
* using debug_handle.
|
|
* Example usage is with lite interpreter that has recording scope of
|
|
* LITE_INTERPRETER. In this case lite interpreter runtime, records debug
|
|
* handles in RecordFunction, along with other information. Debug handles are
|
|
* eventually passed down to KinetoEvent and recorded as part of the event.
|
|
* KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
|
|
* profiler using post-processing callback, via
|
|
* enableProfilerWithEventPostProcess, that takes these debug handles and
|
|
* generates stack trace and module hierarchy information, once profiling is
|
|
* done.
|
|
*/
|
|
using post_process_t = std::function<void(
|
|
/*debug_handle */ int64_t,
|
|
/*jit_stack */ std::vector<std::string>&,
|
|
/*jit_modules */ std::vector<std::string>&)>;
|
|
TORCH_API void enableProfilerWithEventPostProcess(
|
|
const torch::profiler::impl::ProfilerConfig& config,
|
|
const std::set<torch::profiler::impl::ActivityType>& activities,
|
|
post_process_t&& cb,
|
|
const std::unordered_set<at::RecordScope>& scopes = {});
|
|
|
|
TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
|
|
|
|
TORCH_API void prepareProfiler(
|
|
const torch::profiler::impl::ProfilerConfig& config,
|
|
const std::set<torch::profiler::impl::ActivityType>& activities);
|
|
|
|
TORCH_API void toggleCollectionDynamic(
|
|
const bool enable,
|
|
const std::set<torch::profiler::impl::ActivityType>& activities);
|
|
|
|
TORCH_API void startMemoryProfile();
|
|
TORCH_API void stopMemoryProfile();
|
|
TORCH_API void exportMemoryProfile(const std::string& path);
|
|
|
|
/**
|
|
* When a C++ thread really has no control over how the profiler was enabled,
|
|
* for example, by some unreachable Python code, it can call these functions
|
|
* to test/join/unjoin itself into the collection set of a profiler, if any.
|
|
* Without calling these functions, the symptom may be "not seeing GPU events
|
|
* from some child C++ threads". This is an example on how to use them,
|
|
*
|
|
* using namespace torch::autograd::profiler;
|
|
* bool enabled = isProfilerEnabledInMainThread();
|
|
* if (enabled != saved_enabled_state) {
|
|
* if (enabled) {
|
|
* enableProfilerInChildThread();
|
|
* } else {
|
|
* disableProfilerInChildThread();
|
|
* }
|
|
* saved_enabled_state = enabled;
|
|
* }
|
|
*/
|
|
TORCH_API bool isProfilerEnabledInMainThread();
|
|
TORCH_API void enableProfilerInChildThread();
|
|
TORCH_API void disableProfilerInChildThread();
|
|
|
|
} // namespace autograd::profiler
|
|
|
|
namespace profiler::impl {
|
|
|
|
// Experimental.
|
|
TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);
|
|
|
|
} // namespace profiler::impl
|
|
|
|
} // namespace torch
|