mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
More detailed description of benefits can be found at #41001. This is Intel's counterpart of NVidia’s NVTX (https://pytorch.org/docs/stable/autograd.html#torch.autograd.profiler.emit_nvtx). ITT is a functionality for labeling trace data during application execution across different Intel tools. For integrating Intel(R) VTune Profiler into Kineto, ITT needs to be integrated into PyTorch first. It works with both standalone VTune Profiler [(https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html](https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html)) and Kineto-integrated VTune functionality in the future. It works for both Intel CPU and Intel XPU devices. Pitch Add VTune Profiler's ITT API function calls to annotate PyTorch ops, as well as developer customized code scopes on CPU, like NVTX for NVidia GPU. This PR rebases the code changes at https://github.com/pytorch/pytorch/pull/61335 to the latest master branch. Usage example: ``` with torch.autograd.profiler.emit_itt(): for i in range(10): torch.itt.range_push('step_{}'.format(i)) model(input) torch.itt.range_pop() ``` cc @ilia-cher @robieta @chaekit @gdankel @bitfort @ngimel @orionr @nbcsm @guotuofeng @guyang3532 @gaoteng-git Pull Request resolved: https://github.com/pytorch/pytorch/pull/63289 Approved by: https://github.com/malfet
183 lines
5.6 KiB
C++
183 lines
5.6 KiB
C++
#pragma once
|
|
|
|
#include <ATen/record_function.h>
|
|
#include <torch/csrc/Export.h>
|
|
|
|
struct CUevent_st;
|
|
|
|
namespace torch {
|
|
namespace profiler {
|
|
namespace impl {
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// -- Profiler Config ---------------------------------------------------------
|
|
// ----------------------------------------------------------------------------
|
|
enum class C10_API_ENUM ActivityType {
|
|
CPU = 0,
|
|
CUDA, // CUDA kernels, runtime
|
|
NUM_KINETO_ACTIVITIES, // must be the last one
|
|
};
|
|
|
|
enum class C10_API_ENUM ProfilerState {
|
|
Disabled = 0,
|
|
CPU, // CPU-only profiling
|
|
CUDA, // CPU + CUDA events
|
|
NVTX, // only emit NVTX markers
|
|
ITT, // only emit ITT markers
|
|
KINETO, // use libkineto
|
|
KINETO_GPU_FALLBACK, // use CUDA events when CUPTI is not available
|
|
KINETO_ONDEMAND, // run the profiler in on-demand mode
|
|
NUM_PROFILER_STATES, // must be the last one
|
|
};
|
|
|
|
enum class C10_API_ENUM ActiveProfilerType {
|
|
NONE = 0,
|
|
LEGACY,
|
|
KINETO,
|
|
NVTX,
|
|
ITT
|
|
};
|
|
|
|
struct TORCH_API ExperimentalConfig {
|
|
explicit ExperimentalConfig(
|
|
std::vector<std::string> profiler_metrics = {},
|
|
bool profiler_measure_per_kernel = false)
|
|
: profiler_metrics(std::move(profiler_metrics)),
|
|
profiler_measure_per_kernel(profiler_measure_per_kernel) {}
|
|
~ExperimentalConfig() = default;
|
|
std::vector<std::string> profiler_metrics;
|
|
bool profiler_measure_per_kernel = false;
|
|
|
|
bool hasOptions() const {
|
|
return profiler_metrics.size() > 0;
|
|
}
|
|
};
|
|
|
|
struct TORCH_API ProfilerConfig {
|
|
explicit ProfilerConfig(
|
|
ProfilerState state,
|
|
bool report_input_shapes = false,
|
|
bool profile_memory = false,
|
|
bool with_stack = false,
|
|
bool with_flops = false,
|
|
bool with_modules = false,
|
|
ExperimentalConfig experimental_config = ExperimentalConfig())
|
|
: state(state),
|
|
experimental_config(experimental_config),
|
|
report_input_shapes(report_input_shapes),
|
|
profile_memory(profile_memory),
|
|
with_stack(with_stack),
|
|
with_flops(with_flops),
|
|
with_modules(with_modules) {}
|
|
~ProfilerConfig() = default;
|
|
ProfilerState state;
|
|
ExperimentalConfig experimental_config;
|
|
bool report_input_shapes;
|
|
bool profile_memory;
|
|
bool with_stack;
|
|
bool with_flops;
|
|
bool with_modules;
|
|
|
|
// Returns IValues corresponding to ProfilerConfig struct, to be used for
|
|
// serialization.
|
|
at::IValue toIValue() const;
|
|
|
|
// Reconstructs a ProfilerConfig from IValues given by toIValue.
|
|
static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue);
|
|
};
|
|
|
|
struct TORCH_API ProfilerThreadLocalStateBase
|
|
: public c10::MemoryReportingInfoBase {
|
|
explicit ProfilerThreadLocalStateBase(const ProfilerConfig& config)
|
|
: c10::MemoryReportingInfoBase(), config_(config) {}
|
|
~ProfilerThreadLocalStateBase() override = default;
|
|
|
|
static ProfilerThreadLocalStateBase* getTLS() {
|
|
return static_cast<ProfilerThreadLocalStateBase*>(
|
|
c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE));
|
|
}
|
|
|
|
const ProfilerConfig& config() const {
|
|
return config_;
|
|
}
|
|
|
|
void setCallbackHandle(at::CallbackHandle handle) {
|
|
handle_ = handle;
|
|
}
|
|
|
|
at::CallbackHandle callbackHandle() const {
|
|
return handle_;
|
|
}
|
|
|
|
bool hasCallbackHandle() {
|
|
return handle_ > 0;
|
|
}
|
|
|
|
bool memoryProfilingEnabled() const override {
|
|
return config_.profile_memory;
|
|
}
|
|
|
|
virtual ActiveProfilerType profilerType() = 0;
|
|
|
|
protected:
|
|
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
|
std::mutex state_mutex_;
|
|
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
|
ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled);
|
|
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
|
|
at::CallbackHandle handle_ = 0;
|
|
};
|
|
|
|
// Returns if the profiler is currently enabled in the current thread.
|
|
TORCH_API bool profilerEnabled();
|
|
|
|
TORCH_API ActiveProfilerType profilerType();
|
|
|
|
// Retrieve the thread_local ProfilerConfig.
|
|
TORCH_API ProfilerConfig getProfilerConfig();
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// -- Annotation --------------------------------------------------------------
|
|
// ----------------------------------------------------------------------------
|
|
using ProfilerEventStub = std::shared_ptr<CUevent_st>;
|
|
|
|
struct TORCH_API ProfilerStubs {
|
|
virtual void record(int* device, ProfilerEventStub* event, int64_t* cpu_ns)
|
|
const = 0;
|
|
virtual float elapsed(
|
|
const ProfilerEventStub* event,
|
|
const ProfilerEventStub* event2) const = 0;
|
|
virtual void mark(const char* name) const = 0;
|
|
virtual void rangePush(const char* name) const = 0;
|
|
virtual void rangePop() const = 0;
|
|
virtual bool enabled() const {
|
|
return false;
|
|
}
|
|
virtual void onEachDevice(std::function<void(int)> op) const = 0;
|
|
virtual void synchronize() const = 0;
|
|
virtual ~ProfilerStubs();
|
|
};
|
|
|
|
TORCH_API void registerCUDAMethods(ProfilerStubs* stubs);
|
|
TORCH_API const ProfilerStubs* cudaStubs();
|
|
TORCH_API void registerITTMethods(ProfilerStubs* stubs);
|
|
TORCH_API const ProfilerStubs* ittStubs();
|
|
|
|
} // namespace impl
|
|
} // namespace profiler
|
|
} // namespace torch
|
|
|
|
// There are some components which use these symbols. Until we migrate them
|
|
// we have to mirror them in the old autograd namespace.
|
|
namespace torch {
|
|
namespace autograd {
|
|
namespace profiler {
|
|
using torch::profiler::impl::ActivityType;
|
|
using torch::profiler::impl::getProfilerConfig;
|
|
using torch::profiler::impl::ProfilerConfig;
|
|
using torch::profiler::impl::profilerEnabled;
|
|
using torch::profiler::impl::ProfilerState;
|
|
} // namespace profiler
|
|
} // namespace autograd
|
|
} // namespace torch
|