[dynamo][guards] Profile guard manager in C++ (#140110)

This removes the pybind call overhead from the guard-eval profiling numbers by moving the benchmark loop into C++.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140110
Approved by: https://github.com/jansel
ghstack dependencies: #139953
This commit is contained in:
Animesh Jain
2024-11-07 22:37:00 -08:00
committed by PyTorch MergeBot
parent a140e65e0f
commit e6c5a77485
3 changed files with 45 additions and 15 deletions

View File

@ -1,5 +1,5 @@
# mypy: allow-untyped-defs
from typing import Any
from typing import Any, Dict
import torch
@ -104,6 +104,10 @@ def install_no_tensor_aliasing_guard(
tensor_names: list[str],
verbose_code_parts: list[str],
): ...
# Stub for the C++ benchmark helper: repeatedly evaluates the guard manager
# against f_locals and returns the average guard-eval latency in
# microseconds (implemented in torch/csrc/dynamo/guards.cpp).
def profile_guard_manager(
    guard_manager: GuardManager,
    f_locals: Dict[str, Any],
) -> float: ...
class TensorGuards:
def __init__(

View File

@ -16,7 +16,6 @@ import math
import re
import sys
import textwrap
import time
import types
import warnings
import weakref
@ -47,6 +46,7 @@ from torch._C._dynamo.guards import (
DictGuardManager,
install_no_tensor_aliasing_guard,
install_object_aliasing_guard,
profile_guard_manager,
RootGuardManager,
)
from torch._dynamo.source import (
@ -2219,7 +2219,10 @@ class CheckFunctionManager:
raise AssertionError(f"Guard check failed: {reasons}")
if guards_log.isEnabledFor(logging.DEBUG):
self.profile_guard_eval(output_graph.local_scope)
latency = profile_guard_manager(
self.guard_manager.root, output_graph.local_scope
)
guards_log.debug("Guard eval latency = %s us", f"{latency:.2f}")
# NB - We have to very careful of cleaning up here. Because of the
# invalidate function, we can create a weakref finalizer that keeps
@ -2232,18 +2235,6 @@ class CheckFunctionManager:
self._weakrefs.clear()
self.output_graph = None
def profile_guard_eval(self, f_locals):
    """Benchmark guard evaluation against *f_locals* and log the result.

    Runs the compiled guard check repeatedly for ~1 second, then logs the
    average per-check latency in microseconds to the guards logger.

    Args:
        f_locals: frame locals dict passed to ``guard_manager.check``.
    """
    profile_duration = 1.0  # seconds to spend sampling
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # (NTP adjustments) and is the wrong clock for interval measurement.
    start_time = time.perf_counter()
    iterations = 0
    while time.perf_counter() - start_time < profile_duration:
        self.guard_manager.check(f_locals)
        iterations += 1
    # Use the *actual* elapsed time rather than assuming exactly
    # profile_duration seconds passed: the loop overshoots the deadline by
    # up to one full guard check, so 10**6 / iterations overstates speed.
    elapsed = time.perf_counter() - start_time
    guard_latency = (elapsed * 1e6) / iterations  # microseconds per check
    guards_log.debug("Guard eval latency = %s us", f"{guard_latency:.2f}")
def compile_check_fn(self, builder, guards_out, guard_fail_fn):
# see parallel handling of ".0" / "___implicit0" in _eval_frame.c
largs = builder.argnames

View File

@ -27,6 +27,7 @@
#include <ATen/xpu/EmptyTensor.h>
#endif
#include <chrono>
#include <sstream>
#include <tuple>
#include <utility>
@ -1627,6 +1628,7 @@ class GuardAccessor {
* entries.
*/
// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
class GuardManager {
public:
GuardManager() = delete;
@ -3681,6 +3683,38 @@ void install_no_tensor_aliasing_guard(
}
}
// Benchmark guard evaluation: repeatedly run the root guard manager's
// check against `f_locals` for ~`profile_duration` seconds and return the
// average latency per check in microseconds.
double profile_guard_manager(RootGuardManager* root, py::object f_locals) {
  PyObject* locals = f_locals.ptr();

  // Warmup to amortize one-time costs (lazy initialization, caches).
  for (int i = 0; i < 10; i++) {
    root->check_nopybind(locals);
  }

  // steady_clock is guaranteed monotonic and is the correct clock for
  // interval timing; high_resolution_clock may alias the system clock.
  constexpr double profile_duration = 1.0; // seconds
  int count = 0;
  auto start = std::chrono::steady_clock::now();
  std::chrono::duration<double> elapsed{0.0};
  // Run checks until profile_duration seconds have elapsed. do/while keeps
  // count >= 1, so the division below is always well-defined.
  do {
    root->check_nopybind(locals);
    count++;
    elapsed = std::chrono::steady_clock::now() - start;
  } while (elapsed.count() < profile_duration);

  // Average time per iteration in microseconds. Note: the previous version
  // multiplied by profile_duration here — a no-op at 1.0 but it would skew
  // the result for any other duration — and re-read the clock redundantly.
  return (elapsed.count() * 1e6) / count;
}
} // namespace
static void* _torchinductor_pyobject_tensor_data_ptr(PyObject* obj) {
@ -4506,6 +4540,7 @@ PyObject* torch_c_dynamo_guards_init() {
py_m.def("install_object_aliasing_guard", install_object_aliasing_guard);
py_m.def(
"install_no_tensor_aliasing_guard", install_no_tensor_aliasing_guard);
py_m.def("profile_guard_manager", profile_guard_manager);
// initialize dict_version_map watcher for 3.12
#if IS_PYTHON_3_12_PLUS