pytorch/torch/csrc/profiler/collection.h
Commit 9fff8155c3 by Yuanyuan Chen: [2/N] Fix clang-tidy readability checks (#164652)
This PR applies clang-tidy readability checks to jit sources and all headers in the code base.
The `readability-redundant-inline-specifier` check is suppressed because it would incur too many changes: it detects redundant `inline` specifiers on function and variable declarations, and there are many in-class method definitions in the code base that are marked `inline`.
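
For illustration, the pattern that check flags looks roughly like the following (hypothetical struct, not taken from this PR); the explicit `inline` is redundant because an in-class definition is already implicitly inline:

    struct Example {
      // readability-redundant-inline-specifier flags this `inline`:
      // an in-class definition is implicitly inline already.
      inline int value() const { return 42; }
    };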

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164652
Approved by: https://github.com/Skylion007
2025-10-06 01:06:01 +00:00

#pragma once
#include <cstdint>
#include <memory>
#include <mutex>
#include <type_traits>
#include <utility>
#include <variant>
#include <ATen/Context.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/macros/Macros.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/data_flow.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>
namespace torch::profiler::impl {
enum class EventType : uint8_t {
TorchOp = 0,
Backend,
Vulkan,
Allocation,
OutOfMemory,
PyCall,
PyCCall,
Kineto,
PythonGC
};
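// Each enumerator selects one `ExtraFields` specialization (declared below);
// a `Result` holds exactly one of those specializations in its
// `extra_fields_` variant, and `Result::tag()` recovers the enumerator.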
// ============================================================================
// == Value (Tensor, Scalar) summary ==========================================
// ============================================================================
struct TORCH_API RawTensorMetadataBase {
RawTensorMetadataBase() = default;
explicit RawTensorMetadataBase(const at::Tensor& t);
StorageImplData data_;
c10::ScalarType dtype_{c10::ScalarType::Undefined};
c10::Layout layout_{c10::Layout::Strided};
uint32_t size_dim_{0};
};
// Collected during profiling.
struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
RawTensorMetadata() = default;
RawTensorMetadata(const RawTensorMetadata&) = default;
RawTensorMetadata(RawTensorMetadata&&) noexcept = default;
RawTensorMetadata& operator=(const RawTensorMetadata&) = default;
RawTensorMetadata& operator=(RawTensorMetadata&&) noexcept = default;
~RawTensorMetadata() = default;
explicit RawTensorMetadata(const at::Tensor& t);
// Wrap `weak_self_` in `std::optional` and split the device into its
// components to keep the struct default constructible (which the std::array
// initializer requires).
std::optional<WeakTensor> weak_self_;
c10::DeviceType device_type_{c10::DeviceType::CPU};
c10::DeviceIndex device_index_{-1};
};
// Used during post processing.
struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
TensorMetadata(
const RawTensorMetadata& r,
std::vector<int64_t> sizes,
std::vector<int64_t> strides);
TensorImplAddress impl() const {
return weak_self_.get();
}
WeakTensor weak_self_;
c10::Device device_;
std::vector<int64_t> sizes_;
std::vector<int64_t> strides_;
// Set during `calculateUniqueTensorIDs`.
std::optional<TensorID> id_;
std::optional<AllocationID> allocation_id_;
};
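// A `TensorMetadata` is presumably assembled during post processing by joining
// a `RawTensorMetadata` record with the sizes and strides that are captured
// separately (see `InputOutputEncoder::tensor_sizes_strides_` below).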
// Used during post processing.
struct TORCH_API ProfilerStepInfo {
int64_t start_time_ns; // start time of the profiler step
int64_t end_time_ns; // end time of the profiler step
uint64_t out_idx; // index of the profiler step in the profiler "out" var in
// getRecords
ProfilerStepInfo(int64_t start, int64_t end, uint64_t out_idx)
: start_time_ns(start), end_time_ns(end), out_idx(out_idx) {}
};
using op_input_t = std::variant<
TensorMetadata,
std::vector<TensorMetadata>,
c10::IValue,
std::nullopt_t>;
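// One recorded op argument: tensor metadata, metadata for a tensor list, a
// concrete `c10::IValue`, or `std::nullopt_t` when no value is recorded for
// that slot. Illustrative access pattern only, assuming `input` is an
// `op_input_t`:
//
//   if (const auto* t = std::get_if<TensorMetadata>(&input)) {
//     // e.g. t->sizes_, t->dtype_
//   }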
// ============================================================================
// == ExtraFields =============================================================
// ============================================================================
template <EventType>
struct ExtraFields;
struct TorchOpBasicFields {
int64_t sequence_number_{0};
uint64_t forward_tid_{0};
at::RecordScope scope_{};
bool is_async_{false};
uint64_t record_function_id_{0};
int64_t debug_handle_{0};
std::string name_;
std::string overload_name_;
// Set in the exit callback.
uint64_t end_tid_{0};
};
using jit_stack_t = std::vector<std::string>;
using jit_modules_t = std::vector<std::string>;
using extra_args_t = std::unordered_map<std::string, c10::IValue>;
using extra_meta_t = std::unordered_map<std::string, std::string>;
using kwinputs_t = std::unordered_map<std::string, c10::IValue>;
struct FallbackPair {
ProfilerVoidEventStub device_event_start_ = nullptr;
ProfilerVoidEventStub device_event_end_ = nullptr;
};
template <>
struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
ExtraFields(
TorchOpBasicFields&& f,
uint64_t correlation_id,
c10::time_t end_time_ns,
std::vector<op_input_t>&& inputs,
std::vector<op_input_t>&& concrete_inputs,
jit_stack_t&& jit_stack,
jit_modules_t&& jit_modules,
extra_args_t&& extra_args,
extra_meta_t&& extra_meta,
kwinputs_t&& kwinputs,
FallbackPair&& device_fallback,
bool allow_tf32_cublas,
std::unique_ptr<perf_counters_t>&& perf_event_counters)
: TorchOpBasicFields(std::move(f)),
correlation_id_{correlation_id},
end_time_ns_{end_time_ns},
inputs_{std::move(inputs)},
concrete_inputs_{std::move(concrete_inputs)},
jit_stack_{std::move(jit_stack)},
jit_modules_{std::move(jit_modules)},
extra_args_{std::move(extra_args)},
extra_meta_{std::move(extra_meta)},
kwinputs_{std::move(kwinputs)},
device_fallback_{std::move(device_fallback)},
allow_tf32_cublas_{allow_tf32_cublas},
perf_event_counters_{std::move(perf_event_counters)} {}
uint64_t correlation_id_;
c10::time_t end_time_ns_;
std::vector<op_input_t> inputs_;
std::vector<op_input_t> concrete_inputs_;
jit_stack_t jit_stack_;
jit_modules_t jit_modules_;
extra_args_t extra_args_;
extra_meta_t extra_meta_;
kwinputs_t kwinputs_;
FallbackPair device_fallback_;
bool allow_tf32_cublas_;
std::unique_ptr<perf_counters_t> perf_event_counters_;
std::string metadata_json_;
};
template <>
struct ExtraFields<EventType::Backend> {
int64_t start_time_us_;
int64_t end_time_us_;
int64_t debug_handle_;
at::RecordScope scope_;
std::string name_;
std::string backend_;
jit_stack_t jit_stack_;
jit_modules_t jit_modules_;
};
template <>
struct ExtraFields<EventType::PythonGC> {
std::string phase;
int64_t duration_ns_;
};
template <>
struct ExtraFields<EventType::Vulkan> {
using raw_event_t = std::pair<c10::approx_time_t, vulkan_id_t>;
std::string name_;
int64_t duration_ns_{0};
// While building the event tree, we want to report a vulkan event's duration
// as 0 so that its end time doesn't exceed that of its parent cpu op
bool in_tree_building_{false};
};
struct RawAllocation {
c10::approx_time_t start_time_;
void* ptr_;
int64_t alloc_size_;
size_t total_allocated_;
size_t total_reserved_;
c10::DeviceType device_type_;
c10::DeviceIndex device_index_;
};
// For performance.
static_assert(
std::is_trivial_v<RawAllocation>,
"Non-Trivial member of RawAllocation.");
template <>
struct ExtraFields<EventType::Allocation> : RawAllocation {
ExtraFields(const RawAllocation& allocation) : RawAllocation(allocation) {}
c10::Device device() const {
return {device_type_, device_index_};
}
std::optional<TensorID> id_;
std::optional<AllocationID> allocation_id_;
};
template <>
struct ExtraFields<EventType::OutOfMemory> {
c10::approx_time_t start_time_;
int64_t alloc_size_;
size_t total_allocated_;
size_t total_reserved_;
c10::DeviceType device_type_;
c10::DeviceIndex device_index_;
};
// For performance.
static_assert(
std::is_trivial_v<ExtraFields<EventType::OutOfMemory>>,
"Non-Trivial member of ExtraFields<EventType::OutOfMemory>.");
struct PyFrameState {
int line_no_;
at::StringView filename_;
at::StringView funcname_;
};
template <typename T, typename Tag>
using strong_t = strong::
type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>;
using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>;
using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>;
using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>;
using PyOptimizerSelf = strong_t<PyObject*, struct PyOptSelf_>;
using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizer_>;
struct NNModuleInfo {
struct ParameterInfo {
std::string name_;
TensorMetadata metadata_;
std::optional<TensorMetadata> grad_metadata_;
};
PyModuleSelf self_;
PyModuleCls cls_;
at::StringView cls_name_;
std::vector<ParameterInfo> parameters_;
// Indicates that `self_` is the kth instance of `cls_` observed.
size_t id_{std::numeric_limits<size_t>::max()};
};
struct OptimizerInfo {
struct ParameterInfo {
TensorMetadata metadata_;
std::optional<TensorMetadata> grad_metadata_;
std::vector<std::pair<std::string, TensorMetadata>> state_;
};
PyOptimizerSelf self_;
PyOptimizerCls cls_;
at::StringView cls_name_;
std::vector<ParameterInfo> parameters_;
};
struct PyExtraFieldsBase {
PyExtraFieldsBase(
c10::time_t end_time_ns,
size_t python_tid,
PyFrameState caller)
: end_time_ns_{end_time_ns},
python_tid_{python_tid},
caller_{std::move(caller)} {}
c10::time_t end_time_ns_;
size_t python_tid_;
PyFrameState caller_;
// kth python event observed. (Used by TensorBoard)
size_t id_{std::numeric_limits<size_t>::max()};
};
template <>
struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase {
struct args_t {
PyFrameState frame_state_;
std::optional<NNModuleInfo> module_info_;
std::optional<OptimizerInfo> optimizer_info_;
};
ExtraFields(
c10::time_t end_time_ns,
size_t python_tid,
PyFrameState caller,
args_t args)
: PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
callsite_{std::move(args.frame_state_)},
module_{std::move(args.module_info_)},
optimizer_{std::move(args.optimizer_info_)} {}
PyFrameState callsite_;
std::optional<NNModuleInfo> module_;
std::optional<OptimizerInfo> optimizer_;
};
template <>
struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase {
using args_t = at::StringView;
ExtraFields(
c10::time_t end_time_ns,
size_t python_tid,
PyFrameState caller,
args_t args)
: PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
function_name_{std::move(args)} {}
at::StringView function_name_;
};
template <>
struct ExtraFields<EventType::Kineto> {
// Mirrors `libkineto::GenericTraceActivity::Flow`. This information is used
// during post processing to properly embed Kineto events into the broader
// profiler tree structure. End users are not generally expected to use these
// fields directly, but they are available for debugging.
struct Flow {
uint32_t id{0};
uint32_t type{0};
uint32_t start{0};
};
std::string name_;
int64_t duration_ns_{0};
uint64_t correlation_id_{0};
libkineto::ActivityType activity_type_;
Flow flow;
std::weak_ptr<Result> linked_activity_;
std::string metadata_json_;
};
struct TORCH_API Result : public std::enable_shared_from_this<Result> {
template <typename... Args>
[[nodiscard]] static std::shared_ptr<Result> create(Args... args) {
return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...));
}
template <typename T>
decltype(auto) visit(T&& visitor) {
return std::visit(std::forward<T>(visitor), extra_fields_);
}
template <typename T>
decltype(auto) visit(T&& visitor) const {
return std::visit(std::forward<T>(visitor), extra_fields_);
}
template <typename T, typename Fn>
void visit_if_base(const Fn& fn) const {
visit([&](const auto& extra_fields) {
using extra_fields_t = typename std::remove_cv_t<
typename std::remove_reference_t<decltype(extra_fields)>>;
if constexpr (std::is_base_of_v<T, extra_fields_t>) {
fn(extra_fields);
}
});
}
EventType tag() const {
return visit([](const auto& i) { return deduceTag(i); });
}
std::string name() const;
std::string overload_name() const;
libkineto::ActivityType kinetoType() const;
uint64_t correlationID() const;
int64_t endTimeNS() const;
uint64_t endTID() const;
c10::DeviceType deviceType() const;
int64_t start_time_ns_;
uint64_t start_tid_;
kineto::DeviceAndResource kineto_info_;
std::variant<
ExtraFields<EventType::TorchOp>,
ExtraFields<EventType::Backend>,
ExtraFields<EventType::Vulkan>,
ExtraFields<EventType::Allocation>,
ExtraFields<EventType::OutOfMemory>,
ExtraFields<EventType::PyCall>,
ExtraFields<EventType::PyCCall>,
ExtraFields<EventType::Kineto>,
ExtraFields<EventType::PythonGC>>
extra_fields_;
std::weak_ptr<Result> parent_;
std::vector<std::shared_ptr<Result>> children_;
bool finished_{false};
bool hidden_{false};
const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr};
private:
template <EventType E>
Result(
int64_t start_time_ns,
uint64_t start_tid,
kineto::DeviceAndResource kineto_info,
ExtraFields<E>&& extra_fields)
: start_time_ns_{start_time_ns},
start_tid_{start_tid},
kineto_info_{kineto_info},
extra_fields_{std::move(extra_fields)} {}
template <EventType E>
static EventType deduceTag(const ExtraFields<E>&) {
return E;
}
};
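// Illustrative only: given a `std::shared_ptr<Result> r` (e.g. from
// `RecordQueue::getRecords`), event-type-specific fields can be read through
// the visitor interface, for example:
//
//   r->visit([](const auto& fields) {
//     using T = std::remove_cv_t<std::remove_reference_t<decltype(fields)>>;
//     if constexpr (std::is_same_v<T, ExtraFields<EventType::TorchOp>>) {
//       // fields.end_time_ns_, fields.inputs_, ... are available here.
//     }
//   });
//
// `visit_if_base<T>(fn)` performs the analogous `std::is_base_of_v` check so
// that `fn` only runs for alternatives deriving from `T`.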
struct KinetoObserverContext : public at::ObserverContext {
struct Event {
TorchOpBasicFields basic_fields_;
c10::approx_time_t start_time_;
// Set in the exit callback.
c10::approx_time_t end_time_{
std::numeric_limits<c10::approx_time_t>::min()};
bool allow_tf32_cublas_;
std::unique_ptr<perf_counters_t> counters_;
extra_meta_t* extra_nccl_meta_{};
};
explicit KinetoObserverContext(Event* event) : event_{event} {}
Event* event_;
FallbackPair* fallback_{nullptr};
};
constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024;
constexpr int SCALAR_LIST_LENGTH_LIMIT = 30;
// InputOutputEncoder
// Stores each op_event's shapes, dtypes, and concrete values in contiguous
// AppendOnlyLists so that we no longer create vectors for shapes and dtypes
// on every op; those vectors can be created during post-processing instead.
// It splits the data into two categories: input shapes and concrete inputs.
class InputOutputEncoder final {
public:
void push(c10::ArrayRef<const c10::IValue> values);
// Used during post-processing to unpack the encoded data.
// Each method returns a "supplier" lambda which takes no arguments;
// invoking the lambda once will return a list of args that represent
// the inputs for one op.
// The data is split into two streams: "input shapes" and "concrete inputs".
// Note: "auto" only works because these are only used in collection.cpp,
// where they are implemented.
auto getInputShapeGenerator();
auto getConcreteInputGenerator();
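// Illustrative only (the concrete generator types live in collection.cpp):
// post-processing presumably consumes them along the lines of
//
//   auto supplier = encoder.getInputShapeGenerator();
//   auto args_for_one_op = supplier(); // one op's inputs per invocation
//
// where `encoder` is a hypothetical InputOutputEncoder instance.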
bool isSupportedScalarList(const c10::IValue& list_candidate);
void clear();
enum class Tag {
Tensor = 0,
UndefinedTensor,
TensorListBegin, // TODO: generalize to other lists.
ScalarList,
Scalar,
Other,
TERMINATOR
};
enum class IOType { Shapes, ConcreteInputs, None };
private:
void push(const at::Tensor& t);
// Implementation detail for getInputShapeGenerator and
// getConcreteInputGenerator
auto getIValueGenerator(const IOType& io_type);
AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_;
AppendOnlyList<RawTensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE>
tensor_metadata_;
AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_strides_;
AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_;
};
using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler;
class TORCH_API ThreadLocalSubqueue {
public:
ThreadLocalSubqueue(const uint64_t tid, ProfilerConfig config);
std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn);
template <class... Args>
void emplace_backend_event(Args&&... args) {
backend_events_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_vulkan_event(Args&&... args) {
vulkan_events_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_allocation_event(Args&&... args) {
allocations_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_ooms_event(Args&&... args) {
ooms_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_py_call(Args&&... args) {
py_calls_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_gc_call(Args&&... args) {
pythongc_.emplace_back(std::forward<Args>(args)...);
}
uint64_t tid() const {
return tid_;
}
const kineto::DeviceAndResource& kineto_info() const {
return kineto_info_;
}
inline void disable_perf_profiler(perf_counters_t& counters) const {
perf_profiler_->Disable(counters);
}
private:
uint64_t tid_;
ProfilerConfig config_;
kineto::DeviceAndResource kineto_info_;
std::unique_ptr<perf_profiler_t> perf_profiler_;
friend class RecordQueue;
// See `containers.h` for block size benchmarks.
static constexpr size_t BlockSize = 512;
struct TorchOpStorage {
// NB: This is a destructive operation.
void materialize(
std::vector<std::shared_ptr<Result>>& out,
std::vector<ProfilerStepInfo>& step_info,
const std::function<c10::time_t(c10::approx_time_t)>& time_converter,
const uint64_t tid,
const kineto::DeviceAndResource& kineto_info);
template <typename T, size_t ChunkSize>
class EventBlock : public std::array<T, ChunkSize> {
public:
EventBlock();
uint64_t correlation_id(const T* ptr) const;
private:
uint64_t id_start_;
};
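// Each EventBlock presumably covers a contiguous range of correlation ids
// anchored at `id_start_`; `correlation_id(ptr)` recovers an event's id from
// its position within the block (see collection.cpp for details).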
using event_t = KinetoObserverContext::Event;
class OpList : public AppendOnlyList<event_t, BlockSize, EventBlock> {
public:
template <class... Args>
std::pair<event_t*, uint64_t> emplace_back(Args&&... args);
static uint64_t correlationID(const OpList::Iterator& e);
} op_events_;
// report_input_shapes
InputOutputEncoder inputs_outputs_;
// with_stack (JIT)
AppendOnlyList<jit_stack_t, BlockSize> jit_stack_;
// with_modules
AppendOnlyList<jit_modules_t, BlockSize> jit_modules_;
// with_flops
AppendOnlyList<extra_args_t, BlockSize> extra_args_;
// report extra metadata, i.e. collective communication meta
AppendOnlyList<extra_meta_t, BlockSize> extra_meta_;
// report kwinputs
AppendOnlyList<kwinputs_t, BlockSize> kwinputs_;
// ProfilerState::KINETO_GPU_FALLBACK or
// ProfilerState::KINETO_PRIVATEUSE1_FALLBACK
AppendOnlyList<FallbackPair, BlockSize> device_fallback_;
} torch_ops_;
// reportBackendEventToActiveKinetoProfiler
AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_;
// _reportVulkanEventToProfiler
AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize>
vulkan_events_;
// reportMemoryUsage
AppendOnlyList<RawAllocation, BlockSize> allocations_;
// reportOOMs
AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_;
// with_stack (Python)
AppendOnlyList<
std::pair<python_tracer::TraceKey, c10::approx_time_t>,
BlockSize>
py_calls_;
// gc with_stack (Python)
AppendOnlyList<std::pair<std::string, c10::approx_time_t>, BlockSize>
pythongc_;
};
class TORCH_API RecordQueue {
public:
RecordQueue(ProfilerConfig config, std::set<ActivityType> activities);
bool tracePython() const;
bool getPythonGcEvents() const;
ThreadLocalSubqueue* getSubqueue();
void stop();
void restart();
// NB: This is a destructive operation.
std::pair<
std::vector<std::shared_ptr<Result>>,
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
getRecords(
std::function<c10::time_t(c10::approx_time_t)> time_converter,
uint64_t start_time_ns,
uint64_t end_time_ns);
private:
uint32_t id_;
ProfilerConfig config_;
std::set<ActivityType> activities_;
ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>>
sub_queues_;
std::mutex sub_queue_mutex_;
std::unique_ptr<python_tracer::PythonTracerBase> python_tracer_;
};
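// Illustrative lifecycle only (local names are hypothetical): the profiler
// presumably drives a RecordQueue roughly as follows:
//
//   RecordQueue queue(config, activities);   // start collecting
//   auto* subqueue = queue.getSubqueue();    // per-thread event sink
//   ...                                      // callbacks emplace events
//   queue.stop();
//   auto [results, trace] = queue.getRecords(
//       time_converter, start_time_ns, end_time_ns); // destructive: drains queues
//
// `getRecords` returns the collected `Result`s plus the wrapped Kineto trace.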
TORCH_API bool get_record_concrete_inputs_enabled();
TORCH_API void set_record_concrete_inputs_enabled_fn(std::function<bool()>);
TORCH_API void set_record_concrete_inputs_enabled_val(bool);
TORCH_API bool get_fwd_bwd_enabled();
TORCH_API void set_fwd_bwd_enabled_fn(std::function<bool()>);
TORCH_API void set_fwd_bwd_enabled_val(bool);
TORCH_API bool get_cuda_sync_enabled();
TORCH_API void set_cuda_sync_enabled_fn(std::function<bool()>);
TORCH_API void set_cuda_sync_enabled_val(bool);
// Comms related RecordFunctions will record information about tensor storage
// locations.
TORCH_API bool get_record_tensor_addrs_enabled();
TORCH_API void set_record_tensor_addrs_enabled_fn(std::function<bool()>);
TORCH_API void set_record_tensor_addrs_enabled_val(bool);
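// Illustrative only: these toggles can presumably be driven either with a
// fixed value or with a callback that is consulted when the setting is read,
// e.g.
//
//   set_record_concrete_inputs_enabled_val(false);          // fixed value
//   set_fwd_bwd_enabled_fn([] { return some_debug_flag; }); // queried via callback
//
// where `some_debug_flag` is a hypothetical bool.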
} // namespace torch::profiler::impl