mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-20 21:14:14 +08:00 
			
		
		
		
	Prefer dashes over underscores in command-line options. Add `--command-arg-name` to the argument parser. The old arguments with underscores (`--command_arg_name`) are kept for backward compatibility.
Both dashes and underscores are used in the PyTorch codebase. Some argument parsers only have dashes or only have underscores in arguments. For example, the `torchrun` utility for distributed training only accepts underscore arguments (e.g., `--master_port`). The dashes are more common in other command-line tools. And it looks to be the default choice in the Python standard library:
`argparse.BooleanOptionalAction`: 4a9dff0e5a/Lib/argparse.py (L893-L895)
```python
class BooleanOptionalAction(Action):
    def __init__(...):
            if option_string.startswith('--'):
                option_string = '--no-' + option_string[2:]
                _option_strings.append(option_string)
```
It adds `--no-argname`, not `--no_argname`. Also, typing `_` requires pressing the shift or the caps-lock key, whereas typing `-` does not.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94505
Approved by: https://github.com/ezyang, https://github.com/seemethere
		
	
		
			
				
	
	
		
			892 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			892 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Any, Dict, List, Optional
 | |
| from collections import defaultdict
 | |
| from warnings import warn
 | |
| 
 | |
| import torch
 | |
| import torch.cuda
 | |
| from torch._C._profiler import _ExperimentalConfig
 | |
| 
 | |
| from torch.autograd import (
 | |
|     _disable_profiler,
 | |
|     _enable_profiler,
 | |
|     _kineto_step,
 | |
|     _prepare_profiler,
 | |
|     _ProfilerResult,
 | |
|     _supported_activities,
 | |
|     DeviceType,
 | |
|     kineto_available,
 | |
|     ProfilerActivity,
 | |
|     ProfilerConfig,
 | |
|     ProfilerState,
 | |
| )
 | |
| from torch.autograd.profiler_util import (
 | |
|     _filter_name,
 | |
|     _filter_stack_entry,
 | |
|     _rewrite_name,
 | |
|     EventList,
 | |
|     FunctionEvent,
 | |
|     MEMORY_EVENT_NAME,
 | |
|     MemRecordsAcc,
 | |
|     OUT_OF_MEMORY_EVENT_NAME,
 | |
| )
 | |
| from torch.futures import Future
 | |
| 
 | |
| __all__ = ["profile", "record_function", "emit_itt", "emit_nvtx", "load_nvprof", "EnforceUnique",
 | |
|            "parse_nvprof_trace", "KinetoStepTracker", "EventList", "FunctionEvent", "MemRecordsAcc"]
 | |
| 
 | |
try:
    # contextlib.ContextDecorator is part of the standard library in Python >= 3.2.
    from contextlib import ContextDecorator as _ContextDecorator
except ImportError:
    import functools

    class _ContextDecorator:  # type: ignore[no-redef]
        # Fallback used only when contextlib does not provide ContextDecorator.
        # Subclasses must override __enter__/__exit__; __call__ lets an
        # instance double as a function decorator.

        def __enter__(self):
            raise NotImplementedError

        def __exit__(self, exc_type, exc_val, exc_tb):
            raise NotImplementedError

        def __call__(self, func):
            # Run every call of ``func`` inside ``with self:``.
            @functools.wraps(func)
            def _run_in_context(*args, **kwargs):
                with self:
                    return func(*args, **kwargs)

            return _run_in_context
 | |
| 
 | |
class profile:
    """Context manager that manages autograd profiler state and holds a summary of results.

    Under the hood it just records events of functions being executed in C++ and
    exposes those events to Python. You can wrap any code into it and it will
    only report runtime of PyTorch functions.
    Note: profiler is thread local and is automatically propagated into the async tasks

    Args:
        enabled (bool, optional): Setting this to False makes this context manager a no-op.
            A disabled profiler yields ``None`` from ``with ... as prof`` and never
            collects results, so the result accessors raise ``RuntimeError``.

        use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API.
            Adds approximately 4us of overhead to each tensor operation.

        record_shapes (bool, optional): If shapes recording is set, information
            about input dimensions will be collected. This allows one to see which
            dimensions have been used under the hood and further group by them
            using prof.key_averages(group_by_input_shape=True). Please note that
            shape recording might skew your profiling data. It is recommended to
            use separate runs with and without shape recording to validate the timing.
            Most likely the skew will be negligible for bottom most events (in a case
            of nested function calls). But for higher level functions the total
            self cpu time might be artificially increased because of the shape
            collection.

        with_flops (bool, optional): If with_flops is set, the profiler will estimate
            the FLOPs (floating point operations) value using the operator's input shape.
            This allows one to estimate the hardware performance. Currently,
            this option only works for the matrix multiplication and 2D convolution operators.
            Setting it also turns on ``record_shapes``.

        profile_memory (bool, optional): track tensor memory allocation/deallocation.

        with_stack (bool, optional): record source information (file and line number) for the ops.

        with_modules (bool): record module hierarchy (including function names)
            corresponding to the callstack of the op. e.g. If module A's forward calls
            module B's forward which contains an aten::add op,
            then aten::add's module hierarchy is A.B
            Note that this support exists, at the moment, only for TorchScript models
            and not eager mode models.

        use_kineto (bool, optional): experimental, enable profiling with Kineto profiler.

        use_cpu (bool, optional): profile CPU events; setting to ``False`` requires
            ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling.

        experimental_config (_ExperimentalConfig) : A set of experimental options
            used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed.


    .. warning::
        Enabling memory profiling or source attribution incurs additional profiler
        overhead

    .. warning::
        This context manager should not be called recursively, i.e. no nested
        instances are allowed

    .. warning::
        Due to some CUDA multiprocessing limitations (multiprocessing-cuda-note_),
        one cannot use the profiler with ``use_cuda = True`` to benchmark
        DataLoaders with ``num_workers > 0``. If you wish to benchmark data loading,
        please use ``use_cuda = False`` or ``num_workers = 0``.

    Example:
        >>> # xdoctest: +SKIP
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> x = torch.randn((1, 1), requires_grad=True)
        >>> with torch.autograd.profiler.profile() as prof:
        >>>     for _ in range(100):  # any normal python code, really!
        >>>         y = x ** 2
        >>>         y.backward()
        >>> # NOTE: some columns were removed for brevity
        >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        -----------------------------------  ---------------  ---------------  ---------------
        Name                                 Self CPU total   CPU time avg     Number of Calls
        -----------------------------------  ---------------  ---------------  ---------------
        mul                                  32.048ms         32.048ms         200
        pow                                  27.041ms         27.041ms         200
        PowBackward0                         9.727ms          55.483ms         100
        torch::autograd::AccumulateGrad      9.148ms          9.148ms          100
        torch::autograd::GraphRoot           691.816us        691.816us        100
        -----------------------------------  ---------------  ---------------  ---------------

    """
    def __init__(
            self,
            enabled=True,
            *,
            use_cuda=False,
            record_shapes=False,
            with_flops=False,
            profile_memory=False,
            with_stack=False,
            with_modules=False,
            use_kineto=False,
            use_cpu=True,
            experimental_config=None):
        self.enabled: bool = enabled
        # Result/state attributes are initialised before the early return so
        # that a disabled profiler still supports repr()/str() and reports
        # "Profiler didn't finish running" instead of raising AttributeError.
        self.function_events: Optional[EventList] = None
        self.kineto_results: Optional[_ProfilerResult] = None
        self.entered = False
        if not self.enabled:
            # Disabled: skip all option validation and activity selection.
            return
        self.use_cuda = use_cuda
        self.record_shapes = record_shapes
        self.with_flops = with_flops
        # FLOPs are estimated from input shapes, so with_flops implies record_shapes.
        self.record_shapes |= self.with_flops
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_modules = with_modules
        self.use_cpu = use_cpu
        if experimental_config is None:
            experimental_config = _ExperimentalConfig()
        self.experimental_config = experimental_config

        if not self.use_cpu:
            assert use_kineto, \
                "Device-only events supported only with Kineto (use_kineto=True)"

        if self.use_cuda and not torch.cuda.is_available():
            warn("CUDA is not available, disabling CUDA profiling")
            self.use_cuda = False

        self.kineto_activities = set()
        if self.use_cpu:
            self.kineto_activities.add(ProfilerActivity.CPU)

        self.profiler_kind = ProfilerState.KINETO
        if self.use_cuda:
            if (not use_kineto or ProfilerActivity.CUDA not in
                    _supported_activities()):
                # Kineto cannot (or should not) trace CUDA: use the fallback state.
                assert self.use_cpu, "Legacy CUDA profiling requires use_cpu=True"
                self.profiler_kind = ProfilerState.KINETO_GPU_FALLBACK
            else:
                self.kineto_activities.add(ProfilerActivity.CUDA)

        assert len(self.kineto_activities) > 0, \
            "No activities specified for the profiler"

    def config(self):
        """Build the ``ProfilerConfig`` describing the options of this profiler."""
        return ProfilerConfig(
            self.profiler_kind,
            self.record_shapes,
            self.profile_memory,
            self.with_stack,
            self.with_flops,
            self.with_modules,
            self.experimental_config)

    def __enter__(self):
        """Prepare and start profiling; returns ``self`` (``None`` when disabled).

        Raises:
            RuntimeError: if this instance has already been entered.
        """
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("Profiler context manager is not reentrant")
        self._prepare_trace()
        self._start_trace()
        return self

    def _prepare_trace(self):
        # Marks the instance as entered and hands the config to _prepare_profiler.
        self.entered = True
        _prepare_profiler(self.config(), self.kineto_activities)

    def _start_trace(self):
        # Marks the instance as entered and enables the profiler.
        self.entered = True
        _enable_profiler(self.config(), self.kineto_activities)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop profiling and build ``function_events`` from the collected results.

        Returns ``False`` so exceptions raised inside the block propagate
        (``None`` when the profiler is disabled).
        """
        if not self.enabled:
            return
        if self.use_cuda:
            torch.cuda.synchronize()
        self.kineto_results = _disable_profiler()
        parsed_results = self._parse_kineto_results(self.kineto_results)
        self.function_events = EventList(
            parsed_results,
            use_cuda=self.use_cuda,
            profile_memory=self.profile_memory,
            with_flops=self.with_flops)
        self.function_events._build_tree()
        return False

    def __repr__(self):
        if self.function_events is None:
            return '<unfinished torch.autograd.profile>'
        return repr(self.function_events)

    def __str__(self):
        if self.function_events is None:
            return '<unfinished torch.autograd.profile>'
        return str(self.function_events)

    def _check_finish(self):
        # Result accessors call this first: no results exist until __exit__ ran.
        if self.function_events is None:
            raise RuntimeError("Profiler didn't finish running")

    def table(
            self,
            sort_by=None,
            row_limit=100,
            max_src_column_width=75,
            max_name_column_width=55,
            max_shapes_column_width=80,
            header=None,
            top_level_events_only=False
    ):
        # Docstring is copied from EventList.table below.
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.table(
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            top_level_events_only=top_level_events_only
        )
    table.__doc__ = EventList.table.__doc__

    def export_chrome_trace(self, path):
        # Docstring is copied from EventList.export_chrome_trace below.
        self._check_finish()
        if kineto_available():
            self.kineto_results.save(path)  # type: ignore[union-attr]
        else:
            return self.function_events.export_chrome_trace(path)  # type: ignore[union-attr]
    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__

    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
        """Delegate to ``EventList.export_stacks``; requires ``with_stack=True``."""
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        assert self.with_stack, "export_stacks() requires with_stack=True"
        return self.function_events.export_stacks(path, metric)

    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
        # Docstring is copied from EventList.key_averages below.
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
    key_averages.__doc__ = EventList.key_averages.__doc__

    def total_average(self):
        # Docstring is copied from EventList.total_average below.
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.total_average()
    total_average.__doc__ = EventList.total_average.__doc__

    @property
    def self_cpu_time_total(self):
        """ Returns total time spent on CPU obtained as a sum of
        all self times across all the events.
        """
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.self_cpu_time_total

    def _parse_kineto_results(self, result):
        """Convert a profiler result into a list of ``FunctionEvent`` objects.

        Args:
            result: object returned by ``_disable_profiler()``; only its
                ``events()`` and ``trace_start_us()`` methods are used here.

        Returns:
            A list of ``FunctionEvent`` sorted by start time (ascending) and,
            for equal starts, by end time (descending).
        """
        # result.events() has most of the events - PyTorch op-level and device-level events.
        # Fetch the list once instead of re-querying it for every pass below.
        kineto_events = result.events()
        trace_start_us = result.trace_start_us()

        # Each memory record is paired with a flag telling whether it has been
        # attributed to an enclosing CPU event.
        mem_records = [[evt, False] for evt in kineto_events if evt.name() == MEMORY_EVENT_NAME]
        oom_records = [evt for evt in kineto_events if evt.name() == OUT_OF_MEMORY_EVENT_NAME]
        mem_records_acc = MemRecordsAcc(mem_records)

        def _cpu_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \
                else 0

        def _cuda_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \
                else 0

        # Create and return FunctionEvent list
        function_events = []
        cuda_corr_map: Dict[int, List[FunctionEvent]] = {}
        max_evt_id = 0
        for kineto_event in kineto_events:
            if _filter_name(kineto_event.name()):
                continue
            rel_start_us = kineto_event.start_us() - trace_start_us
            rel_end_us = rel_start_us + kineto_event.duration_us()
            abs_end_us = kineto_event.start_us() + kineto_event.duration_us()

            cpu_memory_usage = 0
            cuda_memory_usage = 0
            if kineto_event.device_type() == DeviceType.CPU:
                # find the corresponding memory allocation events
                for mem_record in mem_records_acc.in_interval(kineto_event.start_us(), abs_end_us):
                    cpu_memory_usage += _cpu_memory_usage(mem_record[0])
                    cuda_memory_usage += _cuda_memory_usage(mem_record[0])
                    mem_record[1] = True

            is_async = kineto_event.is_async() or (
                kineto_event.start_thread_id() != kineto_event.end_thread_id()
            )

            fe = FunctionEvent(
                id=kineto_event.correlation_id(),
                name=_rewrite_name(name=kineto_event.name(), with_wildcard=True),
                trace_name=_rewrite_name(name=kineto_event.name(), with_wildcard=False),
                thread=kineto_event.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_end_us,
                fwd_thread=kineto_event.fwd_thread_id(),
                input_shapes=kineto_event.shapes(),
                stack=[entry for entry in kineto_event.stack() if _filter_stack_entry(entry)],
                scope=kineto_event.scope(),
                cpu_memory_usage=cpu_memory_usage,
                cuda_memory_usage=cuda_memory_usage,
                is_async=is_async,
                sequence_nr=kineto_event.sequence_nr(),
                device_type=kineto_event.device_type(),
                device_index=kineto_event.device_index(),
                flops=kineto_event.flops(),
            )
            max_evt_id = max(max_evt_id, fe.id)
            if fe.device_type == DeviceType.CPU and not fe.is_async:
                # Check if we have CUDA time as a fallback
                cuda_time = kineto_event.cuda_elapsed_us()
                if cuda_time > 0:
                    fe.append_kernel(
                        fe.name,
                        fe.device_index,
                        cuda_time)
                    fe.is_legacy = True
            function_events.append(fe)
            corr_id = kineto_event.linked_correlation_id()
            if corr_id > 0:
                cuda_corr_map.setdefault(corr_id, []).append(fe)

        # associate CUDA kernels and CUDA runtime (CPU) with CPU events
        for fe in function_events:
            if (fe.device_type == DeviceType.CPU and not fe.is_async and
                    fe.id in cuda_corr_map):
                for f_evt in cuda_corr_map[fe.id]:
                    if f_evt.device_type == DeviceType.CUDA:
                        fe.append_kernel(
                            f_evt.name,
                            f_evt.device_index,
                            f_evt.time_range.end - f_evt.time_range.start)
                    elif f_evt.device_type == DeviceType.CPU:
                        # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated
                        # with the 'thread' of the corresponding linked PyTorch event to properly track
                        # parents and children
                        f_evt.thread = fe.thread

        def _memory_function_event(evt, evt_id):
            # Wrap a standalone memory/OOM record into a zero-duration FunctionEvent.
            rel_start_us = evt.start_us() - trace_start_us
            return FunctionEvent(
                id=evt_id,
                name=evt.name(),
                trace_name=None,  # not outputting in the trace
                thread=evt.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_start_us,  # no duration
                fwd_thread=evt.start_thread_id(),
                input_shapes=[],
                stack=[],
                scope=0,  # RecordScope::FUNCTION
                cpu_memory_usage=_cpu_memory_usage(evt),
                cuda_memory_usage=_cuda_memory_usage(evt),
                is_async=False,
                sequence_nr=-1,
                device_type=DeviceType.CPU,
                device_index=0,
            )

        # output top-level memory events (those not attributed to any CPU event);
        # each one gets a fresh id above every id seen so far
        for mem_record in mem_records:
            if not mem_record[1]:
                max_evt_id += 1
                function_events.append(_memory_function_event(mem_record[0], max_evt_id))

        for oom_record in oom_records:
            max_evt_id += 1
            function_events.append(_memory_function_event(oom_record, max_evt_id))

        function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
        return function_events
 | |
| 
 | |
| 
 | |
class record_function(_ContextDecorator):
    """Context manager/function decorator that adds a label to a block of
    Python code (or function) when running autograd profiler. It is
    useful when tracing the code profile.

    Args:
        name (str): Label assigned to the block of code.
        args (str, optional): Extra string forwarded, together with ``name``,
            to ``torch.ops.profiler._record_function_enter_new`` on entry.
            Default: ``None``.

    Example:
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> x = torch.randn((1, 1), requires_grad=True)
        >>> with torch.autograd.profiler.profile() as prof:
        ...     y = x ** 2
        ...     with torch.autograd.profiler.record_function("label-z"): # label the block
        ...         z = y ** 3
        ...     y.backward()
        ...
        >>> # xdoctest: +IGNORE_WANT
        >>> # NOTE: some columns were removed for brevity
        >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        -----------------------------------  ---------------  ---------------  ---------------
        Name                                 Self CPU total %  CPU time avg     Number of Calls
        -----------------------------------  ---------------  ---------------  ---------------
        pow                                  60.77%           47.470us         3
        mul                                  21.73%           25.465us         2
        PowBackward0                         12.03%           121.891us        1
        torch::autograd::AccumulateGrad      2.70%            6.324us          1
        label-z                              2.13%            12.421us         1
        torch::autograd::GraphRoot           0.64%            1.503us          1
        -----------------------------------  ---------------  ---------------  ---------------
        Self CPU time total: 234.344us
        CUDA time total: 0.000us

    """
    def __init__(self, name: str, args: Optional[str] = None):
        self.name: str = name
        self.args: Optional[str] = args
        # True until the end callbacks have been handed over to a future;
        # while True, __exit__ is responsible for running them.
        self.run_callbacks_on_exit: bool = True
        # TODO: TorchScript ignores standard type annotation here
        # self.record: Optional["torch.classes.profiler._RecordFunction"] = None
        self.record = torch.jit.annotate(Optional["torch.classes.profiler._RecordFunction"], None)

    def __enter__(self):
        # Open the labelled range and keep its handle for __exit__.
        self.record = torch.ops.profiler._record_function_enter_new(self.name, self.args)
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
        # Nothing to do when the end callbacks were delegated to a future.
        if self.run_callbacks_on_exit:
            # TorchScript needs a local to refine Optional[T] to T.
            rec = self.record
            assert rec is not None

            # TODO: Too slow with __torch_function__ handling enabled
            # See https://github.com/pytorch/pytorch/issues/76410
            if torch.jit.is_scripting():
                torch.ops.profiler._record_function_exit(rec)
            else:
                with torch._C.DisableTorchFunctionSubclass():
                    torch.ops.profiler._record_function_exit._RecordFunction(rec)

    def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
        """
        Attach this record's end callbacks to ``fut`` instead of running them
        on ``__exit__``. Intended for profiling asynchronous calls that return
        a future: recording extends beyond this scope until the future is
        satisfied, which captures the end-to-end time of the call. May be
        called only once per instance.

        Args:
            fut: (torch._C.Future): future for which to schedule
            callback for.

        Returns:
            A future that completes with the value of the passed in future when
            the profiling callbacks have ran.

        Raises:
            RuntimeError: if the callbacks were already attached to a future.

        """
        # A cleared flag means a callback was already attached earlier.
        if not self.run_callbacks_on_exit:
            raise RuntimeError("_call_end_callbacks_on_future can only be called once.")

        # From now on the future owns the end callbacks, so __exit__ must skip them.
        self.run_callbacks_on_exit = False

        # TorchScript needs a local to refine Optional[T] to T.
        rec = self.record
        assert rec is not None

        # TODO: Too slow with __torch_function__ handling enabled
        # See https://github.com/pytorch/pytorch/issues/76410
        if torch.jit.is_scripting():
            profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut(rec, fut)
        else:
            with torch._C.DisableTorchFunctionSubclass():
                profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut._RecordFunction(
                    rec, fut)
        return profiled_future
 | |
| 
 | |
| 
 | |
class emit_itt:
    """Context manager that makes every autograd operation emit an ITT range.

    It is useful when running the program under Intel(R) VTune Profiler::

        vtune <--vtune-flags> <regular command here>

    The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
    control the collection of trace data during its execution across different Intel tools.
    This context manager is to annotate Intel(R) VTune Profiling trace. With help of this context manager,
    you will be able to see labeled ranges in Intel(R) VTune Profiler GUI.

    .. warning::
        This context manager should not be called recursively, i.e. at most one
        instance should be enabled at any given time.

    Args:
        enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
            Default: ``True``.
        record_shapes (bool, optional): If ``record_shapes=True``, the itt range wrapping
            each autograd op will append information about the sizes of Tensor arguments received
            by that op, in the following format:
            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
            Non-tensor arguments will be represented by ``[]``.
            Arguments will be listed in the order they are received by the backend op.
            Please note that this order may not match the order in which those arguments were passed
            on the Python side.  Also note that shape recording may increase the overhead of itt range creation.
            Default: ``False``

    Example:
        >>> # xdoctest: +SKIP("Undefined variables")
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> with torch.autograd.profiler.emit_itt():
        ...     model(x)

    """
    def __init__(self, enabled=True, record_shapes=False):
        self.enabled = enabled
        self.entered = False
        self.record_shapes = record_shapes

    def __enter__(self):
        # A disabled instance is a no-op and yields None to the ``with`` target.
        if not self.enabled:
            return None
        if self.entered:
            raise RuntimeError("ITT annotation context manager is not reentrant")
        self.entered = True
        # Only shape recording is configurable; the remaining four boolean
        # options are passed as False and no activities are requested.
        itt_config = ProfilerConfig(
            ProfilerState.ITT,
            self.record_shapes,
            False,
            False,
            False,
            False,
            _ExperimentalConfig())
        _enable_profiler(itt_config, set())
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns False (never suppresses exceptions); None when disabled.
        if not self.enabled:
            return None
        _disable_profiler()
        return False
 | |
| 
 | |
| 
 | |
class emit_nvtx:
    """Context manager that makes every autograd operation emit an NVTX range.

    It is useful when running the program under nvprof::

        nvprof --profile-from-start off -o trace_name.prof -- <regular command here>

    Unfortunately, there's no way to force nvprof to flush the data it collected
    to disk, so for CUDA profiling one has to use this context manager to annotate
    nvprof traces and wait for the process to exit before inspecting them.
    Then, either NVIDIA Visual Profiler (nvvp) can be used to visualize the timeline, or
    :func:`torch.autograd.profiler.load_nvprof` can load the results for inspection
    e.g. in Python REPL.

    .. warning::
        This context manager should not be called recursively, i.e. at most one
        instance should be enabled at any given time.

    Args:
        enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
            Default: ``True``.
        record_shapes (bool, optional): If ``record_shapes=True``, the nvtx range wrapping
            each autograd op will append information about the sizes of Tensor arguments received
            by that op, in the following format:
            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
            Non-tensor arguments will be represented by ``[]``.
            Arguments will be listed in the order they are received by the backend op.
            Please note that this order may not match the order in which those arguments were passed
            on the Python side.  Also note that shape recording may increase the overhead of nvtx range creation.
            Default: ``False``

    Example:
        >>> # xdoctest: +SKIP("undefined variables")
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> with torch.cuda.profiler.profile():
        ...     model(x)  # Warmup CUDA memory allocator and profiler
        ...     with torch.autograd.profiler.emit_nvtx():
        ...         model(x)

    **Forward-backward correlation**

    When viewing a profile created using :class:`emit_nvtx` in the NVIDIA Visual Profiler,
    correlating each backward-pass op with the corresponding forward-pass op can be difficult.
    To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
    generates.

    During the forward pass, each function range is decorated with ``seq=<N>``.  ``seq`` is a running
    counter, incremented each time a new backward Function object is created and stashed for backward.
    Thus, the ``seq=<N>`` annotation associated with each forward function range tells you that
    if a backward Function object is created by this forward function,
    the backward object will receive sequence number N.
    During the backward pass, the top-level range wrapping each C++ backward Function's
    ``apply()`` call is decorated with ``stashed seq=<M>``.  ``M`` is the sequence number that
    the backward object was created with.  By comparing ``stashed seq`` numbers in backward with ``seq``
    numbers in forward, you can track down which forward op created each backward Function.

    Any functions executed during the backward pass are also decorated with ``seq=<N>``.  During
    default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
    ``N`` may simply be 0 for all such functions.  Only the top-level ranges associated with
    backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
    objects with the earlier forward pass.

    **Double-backward**

    If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
    if you are setting up for a double-backward), each function's execution during backward
    is given a nonzero, useful ``seq=<N>``.  Those functions may themselves create Function objects
    to be executed later during double-backward, just as the original functions in the forward pass did.
    The relationship between backward and double-backward is conceptually the same as the relationship
    between forward and backward: The functions still emit current-sequence-number-tagged ranges,
    the Function objects they create still stash those sequence numbers, and during the eventual
    double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
    numbers, which can be compared to ``seq`` numbers from the backward pass.

    .. warning::
        The sequence number is thread-local, and some forward functions don't create an associated
        backward Function object (instead delegating that to sub-functions further down the call chain).
        For these reasons, the correspondence of stashed sequence numbers in
        backward Function ``apply()`` ranges with ``seq`` numbers in forward-pass ranges is
        not guaranteed to be 1 to 1.  The sequence numbers alone may not be enough to fully
        disambiguate which forward function created which
        backward Function object.  You may need to make a judgment based on analytic knowledge of what
        the expected correspondence should be.
    """
    def __init__(self, enabled=True, record_shapes=False):
        self.enabled = enabled
        # Set on the first ``__enter__`` and never cleared, so an instance can
        # be entered at most once.
        self.entered = False
        self.record_shapes = record_shapes

    def __enter__(self):
        """Enable the profiler in NVTX mode and return ``self``.

        Returns ``None`` without doing anything when ``enabled`` is false.

        Raises:
            RuntimeError: if this instance has already been entered.
        """
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("NVTX annotation context manager is not reentrant")
        self.entered = True
        # Wait for outstanding CUDA work before the profiler is switched on
        # (presumably so earlier work is not attributed to this region).
        torch.cuda.synchronize()
        _enable_profiler(
            ProfilerConfig(
                ProfilerState.NVTX,
                self.record_shapes,
                # The four remaining positional flags are all passed as False.
                False,
                False,
                False,
                False,
                _ExperimentalConfig(),
            ),
            set(),
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Synchronize CUDA, disable the profiler, and never suppress exceptions.

        Returns ``None`` when ``enabled`` is false, ``False`` otherwise.
        """
        if not self.enabled:
            return
        try:
            torch.cuda.synchronize()
        finally:
            # Always switch the profiler off, even when the synchronize raises
            # (e.g. a CUDA error surfaced by the profiled code); otherwise the
            # profiler would stay enabled after the ``with`` block.
            _disable_profiler()
        return False
 | |
| 
 | |
| 
 | |
def load_nvprof(path):
    """Parse the autograd annotations in an nvprof trace file.

    The file is handed to ``parse_nvprof_trace`` and the resulting events are
    wrapped in an ``EventList``.

    Args:
        path (str): path to nvprof trace
    """
    parsed_events = parse_nvprof_trace(path)
    return EventList(parsed_events)
 | |
| 
 | |
| 
 | |
class EnforceUnique:
    """Remember every key passed to :meth:`see` and reject repeats.

    A key is the tuple of positional arguments given to :meth:`see`.
    """

    def __init__(self):
        # Tuples of arguments observed so far.
        self.seen = set()

    def see(self, *key):
        """Register ``key``; raise ``RuntimeError`` if it was registered before."""
        is_duplicate = key in self.seen
        if is_duplicate:
            raise RuntimeError(f'duplicate key: {key!s}')
        self.seen.add(key)
 | |
| 
 | |
| 
 | |
def parse_nvprof_trace(path):
    """Read an nvprof SQLite trace and return its marker ranges as events.

    Each start/end marker pair in ``CUPTI_ACTIVITY_KIND_MARKER`` becomes one
    ``FunctionEvent``.  Every kernel whose launching runtime call lies strictly
    inside a marker range is then attached to that event with its duration
    (``kernel.end - kernel.start``).

    Args:
        path (str): path to the nvprof trace (an SQLite database file).

    Returns:
        list: the ``FunctionEvent`` objects, sorted by ``time_range.start``.

    Raises:
        RuntimeError: if a marker id, or a (marker id, runtime id) pair, is
            returned more than once by the queries.
        AssertionError: if a correlated runtime record has a ``cbid`` other
            than 211.
    """
    import sqlite3
    from contextlib import closing

    # ``closing`` guarantees the database handle is released on every exit
    # path; a sqlite3 connection used directly as a context manager only
    # manages transactions and does not close itself.
    with closing(sqlite3.connect(path)) as conn:
        conn.row_factory = sqlite3.Row

        # Parse strings table
        strings = {}
        for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
            strings[r["id"]] = torch._C._demangle(r["value"])

        # First, find all functions and create FunctionEvents for them
        marker_query = """
    SELECT
        start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time
    FROM
        CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
        ON start.id = end.id
    WHERE
        start.name != 0 AND end.name = 0
    """
        functions = []
        functions_map = {}
        unique = EnforceUnique()
        for row in conn.execute(marker_query):
            unique.see(row['marker_id'])
            # NOTE(review): marker timestamps are passed through unchanged as
            # start_us/end_us - confirm the trace's time unit matches.
            evt = FunctionEvent(id=row['marker_id'],
                                node_id=0,  # missing a node_id when calling FunctionEvent. This is just to ensure
                                            # that pytorch doesn't crash when creating a FunctionEvent() object
                                name=strings[row['name']],
                                start_us=row['start_time'],
                                end_us=row['end_time'],
                                thread=0)  # TODO: find in sqlite database
            functions.append(evt)
            functions_map[evt.id] = evt

        # Now, correlate all kernels with FunctionEvents
        kernel_query = """
    SELECT
        start.id AS marker_id, start.name, start.timestamp, end.timestamp,
        runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end,
        kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name
    FROM
        CUPTI_ACTIVITY_KIND_MARKER AS start
        INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
            ON start.id = end.id
        INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime
            ON (start.timestamp < runtime.start AND runtime.end < end.timestamp)
        INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel
            ON kernel.correlationId = runtime.correlationId
    """
        unique = EnforceUnique()
        for row in conn.execute(kernel_query):
            unique.see(row['marker_id'], row['runtime_id'])
            # 211 is cudaKernelLaunch for cuda >= 9.2
            assert (row['cbid'] == 211)
            evt = functions_map[row['marker_id']]
            # The second positional argument is always 0 here
            # (presumably a device index - TODO confirm).
            evt.append_kernel(row['kernel_name'],
                              0,
                              row['kernel_end'] - row['kernel_start'])

    functions.sort(key=lambda evt: evt.time_range.start)
    return functions
 | |
| 
 | |
| 
 | |
class KinetoStepTracker:
    """Deduplicates step notifications before they reach kineto.

    Originally the only place a step() was marked was the pytorch profiler's
    own step(). Step hooks are now also added to the Optimizer class
    (https://github.com/pytorch/pytorch/issues/88446), which creates two
    hazards:

    - A program that already calls profiler.step() every iteration could
      count each iteration twice.
    - A model driven by several optimizers could count each iteration two or
      more times.

    To avoid that, a step count is kept per requester in a dict:
    ```
    {
       "ProfilerStep": 100,  # triggered by profiler step() call
       "Optimizer1Step": 100,   # Optimizer 1 or 2 are just examples, could be SGD, Adam etc
       "Optimizer2Step": 100,
    }
    ```
    The global step count is the max of the dict values (100 above).

    When one requester moves ahead, the max moves with it:
    ```
    {
       "ProfilerStep": 100,
       "Optimizer1Step": 101,   # Optimizer1 got incremented first say
       "Optimizer2Step": 100,
    }
    ```
    The global step count is now 101. The kineto step() function is invoked
    only when the global count goes up.

    NOTE: Please do not use the KinetoStepTracker in modules beside the Optimizer
    for now. The result could be incorrect increments of the step count.
    """
    # Global step count, i.e. the highest per-requester count reported so far.
    _current_step = -1
    # Per-requester step counts.
    _step_dict: Dict[str, int] = defaultdict(int)

    @classmethod
    def init_step_count(cls, requester: str):
        """Set ``requester``'s count to the current global step."""
        cls._step_dict[requester] = cls._current_step

    @classmethod
    def erase_step_count(cls, requester: str) -> bool:
        """Forget ``requester``; return True if it had a count recorded."""
        removed_count = cls._step_dict.pop(requester, None)
        return removed_count is not None

    @classmethod
    def increment_step(cls, requester: str) -> int:
        """Advance ``requester`` by one step and return the global step count.

        A requester seen for the first time is initialised to the current
        global step before being advanced. If the max over all requesters
        rises above the global step, ``_kineto_step()`` is called once per
        step of difference, with a warning when the difference exceeds one.
        """
        if requester not in cls._step_dict:
            cls.init_step_count(requester)
        cls._step_dict[requester] += 1

        highest = max(cls._step_dict.values())
        advance = highest - cls._current_step
        if advance <= 0:
            # Some other requester already reported this step.
            return cls._current_step

        if advance > 1:
            warn("Profiler step count has increased more than 1 - "
                 f"current_step = {cls._current_step} step dict =  {cls._step_dict}")
        for _ in range(advance):
            _kineto_step()
        cls._current_step = highest
        return cls._current_step

    @classmethod
    def current_step(cls) -> int:
        """Return the global step count."""
        return cls._current_step
 |