[Profiler][PrivateUse1] Profiler support PrivateUse1 key (#120556)

Summary:
1. Package Kineto's public headers when USE_KINETO is enabled, so that PrivateUse1 users can consume them.
2. Add a PrivateUse1 key to ActivityType.
3. Support the PrivateUse1 key in deviceTypeFromActivity and _supported_activities.
4. Fix some bugs in the processing of profiler results.
Co-authored-by: albanD <desmaison.alban@gmail.com>
Co-authored-by: Aaron Shi <enye.shi@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120556
Approved by: https://github.com/aaronenyeshi
Author: Florian
Date: 2024-04-12 14:28:19 +00:00
Committed by: PyTorch MergeBot
Parent: 6f4c7eeb08
Commit: 41613a0803
9 changed files with 61 additions and 11 deletions
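Taken together, these changes let an out-of-tree PrivateUse1 backend drive the Kineto profiler end to end. A minimal usage sketch, assuming a backend that has been renamed to the hypothetical name "foo" and has registered a device module:

import torch
from torch.profiler import ProfilerActivity, profile, supported_activities

# With a renamed PrivateUse1 backend loaded, PrivateUse1 is reported here.
print(supported_activities())

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.PrivateUse1]) as prof:
    x = torch.randn(32, 32, device="foo")  # hypothetical device name
    (x @ x).sum()

# Device-side kernels are attributed to the custom backend in the summary.
print(prof.key_averages().table(sort_by="self_cpu_time_total"))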

setup.py

@@ -1386,6 +1386,12 @@ def main():
                 "include/tensorpipe/transport/uv/*.h",
             ]
         )
+    if get_cmake_cache_vars()["USE_KINETO"]:
+        torch_package_data.extend(
+            [
+                "include/kineto/*.h",
+            ]
+        )
     torchgen_package_data = [
         # Recursive glob doesn't work in setup.py,
         # https://github.com/pypa/setuptools/issues/1806
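With the headers shipped under include/kineto, a third-party Kineto plugin build can pick them up from the installed wheel. A hedged sketch for locating them (the exact header names in the packaged wheel are an assumption):

import os

import torch

# Kineto's public headers are packaged next to the other bundled headers.
kineto_include = os.path.join(os.path.dirname(torch.__file__), "include", "kineto")
print(os.path.isdir(kineto_include))  # True on wheels built with USE_KINETO=1
print(sorted(os.listdir(kineto_include)))  # e.g. libkineto.h and friends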

torch/autograd/profiler.py

@@ -270,7 +270,6 @@ class profile:
                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
            else:
                self.kineto_activities.add(ProfilerActivity.PrivateUse1)
                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1
        assert (
            len(self.kineto_activities) > 0
@@ -317,6 +316,10 @@ class profile:
             return
         if self.use_cuda:
             torch.cuda.synchronize()
+        elif self.use_device and hasattr(torch, self.use_device):
+            privateuse1_module = getattr(torch, self.use_device)
+            if hasattr(privateuse1_module, "synchronize"):
+                privateuse1_module.synchronize()
 
         t0 = perf_counter_ns()
         self.kineto_results = _disable_profiler()
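The elif branch above resolves the backend's Python module via getattr(torch, self.use_device) and only calls synchronize() if the module provides one. A hedged sketch of the backend side, again using the hypothetical name "foo":

import torch

class _FooDeviceModule:
    # Minimal device module for a hypothetical PrivateUse1 backend.
    @staticmethod
    def synchronize():
        # Block until all queued work on the custom device has finished, so
        # device events are complete before _disable_profiler() collects them.
        ...

# Makes the module reachable as torch.foo, which the profiler looks up.
torch._register_device_module("foo", _FooDeviceModule)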
@@ -542,7 +545,10 @@ class profile:
                     and fe.id in device_corr_map
                 ):
                     for f_evt in device_corr_map[fe.id]:
-                        if f_evt.device_type == DeviceType.CUDA:
+                        if (
+                            f_evt.device_type == DeviceType.CUDA
+                            or f_evt.device_type == DeviceType.PrivateUse1
+                        ):
                             fe.append_kernel(
                                 f_evt.name,
                                 f_evt.device_index,

torch/autograd/profiler_util.py

@@ -598,7 +598,7 @@ class FunctionEvent(FormattedTimesMixin):
                 [child.privateuse1_time_total for child in self.cpu_children]
             )
         else:
-            assert self.device_type == DeviceType.CUDA
+            assert self.device_type == DeviceType.PrivateUse1
             return self.privateuse1_time_total
 
     @property
@@ -867,7 +867,10 @@ def _build_table(
         event.self_privateuse1_memory_usage > 0 for event in events
     )
     use_device = events[0].use_device
-    if not use_device and (has_privateuse1_mem or has_privateuse1_time):
+    # Running on a PrivateUse1 device with the profiler enabled but without
+    # ProfilerActivity.PrivateUse1 can still capture privateuse1 memory usage,
+    # so when use_device is None we only need to check has_privateuse1_time.
+    if not use_device and has_privateuse1_time:
         raise RuntimeError(
             "use_device is None, but there is private device performance data."
         )
@@ -951,7 +954,7 @@ def _build_table(
                 "Self CUDA Mem",
             ]
         )
-    if has_privateuse1_mem:
+    if use_device and has_privateuse1_mem:
         privateuse1 = use_device.upper()
         headers.extend(
             [
@@ -1132,7 +1135,7 @@ def _build_table(
                     _format_memory(evt.self_cuda_memory_usage),
                 ]
             )
-        if has_privateuse1_mem:
+        if use_device and has_privateuse1_mem:
             row_values.extend(
                 [
                     # PrivateUse1 Mem Total

torch/csrc/autograd/init.cpp

@@ -332,6 +332,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
     if (at::hasMTIA()) {
       activities.insert(torch::profiler::impl::ActivityType::MTIA);
     }
+    if (c10::get_privateuse1_backend() != "privateuseone") {
+      activities.insert(torch::profiler::impl::ActivityType::PrivateUse1);
+    }
 #endif
     return activities;
   });
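c10::get_privateuse1_backend() returns "privateuseone" until a backend registers a custom name, so the check above effectively asks whether a renamed PrivateUse1 backend is present. From Python the rename is done with torch.utils.rename_privateuse1_backend; a brief sketch with the hypothetical name "foo":

import torch

# After renaming, c10::get_privateuse1_backend() returns "foo", so
# _supported_activities() starts reporting ProfilerActivity.PrivateUse1.
torch.utils.rename_privateuse1_backend("foo")
print(torch.profiler.supported_activities())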

torch/csrc/autograd/profiler_kineto.cpp

@@ -555,7 +555,9 @@ void prepareProfiler(
       config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK,
       "Supported only in Kineto profiler");
   torch::profiler::impl::kineto::prepareTrace(
-      /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
+      /*cpuOnly=*/!(
+          at::hasCUDA() || at::hasXPU() || at::hasMTIA() ||
+          c10::get_privateuse1_backend() != "privateuseone"),
       activities,
       config.experimental_config);

torch/csrc/profiler/kineto_shim.cpp

@@ -25,6 +25,8 @@ const std::set<libkineto::ActivityType> kCpuTypes{
     libkineto::ActivityType::CUDA_RUNTIME,
     libkineto::ActivityType::CUDA_DRIVER,
     libkineto::ActivityType::PYTHON_FUNCTION,
+    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
+    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
 };
 
 const std::set<libkineto::ActivityType> kCudaTypes = {
@@ -47,6 +49,15 @@ const std::set<libkineto::ActivityType> kMtiaTypes = {
     libkineto::ActivityType::MTIA_CCP_EVENTS,
     libkineto::ActivityType::MTIA_RUNTIME,
 };
+const std::set<libkineto::ActivityType> kPrivateUse1Types = {
+    libkineto::ActivityType::GPU_MEMCPY,
+    libkineto::ActivityType::GPU_MEMSET,
+    libkineto::ActivityType::GPU_USER_ANNOTATION,
+    libkineto::ActivityType::CONCURRENT_KERNEL,
+    // PRIVATEUSE1_RUNTIME and PRIVATEUSE1_DRIVER appear in both kCpuTypes and kPrivateUse1Types.
+    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
+    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
+};
 } // namespace
 #endif // USE_KINETO
@@ -248,6 +259,9 @@ void prepareTrace(
   if (collectivesProfilerExists()) {
     k_activities.insert(libkineto::ActivityType::COLLECTIVE_COMM);
   }
+  if (activities.count(torch::autograd::profiler::ActivityType::PrivateUse1)) {
+    k_activities.insert(kPrivateUse1Types.begin(), kPrivateUse1Types.end());
+  }
 
   ExperimentalConfigWrapper configWrap(config);
@@ -336,8 +350,18 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     case libkineto::ActivityType::GPU_USER_ANNOTATION:
     case libkineto::ActivityType::CUDA_PROFILER_RANGE:
     // TODO: T151322015
-    case libkineto::ActivityType::MTIA_CCP_EVENTS:
-      return c10::DeviceType::CUDA;
+    case libkineto::ActivityType::MTIA_CCP_EVENTS: {
+      // A PrivateUse1 Kineto backend reuses the ActivityTypes above, so when
+      // a PrivateUse1 backend is registered these activities should map to
+      // c10::DeviceType::PrivateUse1 instead of CUDA.
+      c10::DeviceType device_type = []() {
+        if (c10::get_privateuse1_backend() != "privateuseone") {
+          return c10::DeviceType::PrivateUse1;
+        }
+        return c10::DeviceType::CUDA;
+      }();
+      return device_type;
+    }
     case libkineto::ActivityType::CPU_OP:
     case libkineto::ActivityType::USER_ANNOTATION:
     case libkineto::ActivityType::EXTERNAL_CORRELATION:
@@ -347,6 +371,8 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     case libkineto::ActivityType::MTIA_RUNTIME:
     case libkineto::ActivityType::PYTHON_FUNCTION:
     case libkineto::ActivityType::CUDA_DRIVER:
+    case libkineto::ActivityType::PRIVATEUSE1_RUNTIME:
+    case libkineto::ActivityType::PRIVATEUSE1_DRIVER:
       return c10::DeviceType::CPU;
     default: {
       TORCH_WARN(

torch/csrc/profiler/orchestration/observer.h

@@ -17,6 +17,7 @@ enum class C10_API_ENUM ActivityType {
   XPU, // XPU kernels, runtime
   CUDA, // CUDA kernels, runtime
   MTIA, // MTIA kernels, runtime
+  PrivateUse1, // PrivateUse1 kernels, runtime
   NUM_KINETO_ACTIVITIES, // must be the last one
 };

torch/csrc/profiler/python/init.cpp

@@ -325,7 +325,8 @@ void initPythonBindings(PyObject* module) {
       .value("CPU", ActivityType::CPU)
       .value("XPU", ActivityType::XPU)
       .value("MTIA", ActivityType::MTIA)
-      .value("CUDA", ActivityType::CUDA);
+      .value("CUDA", ActivityType::CUDA)
+      .value("PrivateUse1", ActivityType::PrivateUse1);
 
   py::class_<ExperimentalConfig>(m, "_ExperimentalConfig")
       .def(

torch/profiler/profiler.py

@@ -143,7 +143,9 @@ class _KinetoProfile:
             use_cuda=(ProfilerActivity.CUDA in self.activities),
             use_cpu=(ProfilerActivity.CPU in self.activities),
             use_mtia=(ProfilerActivity.MTIA in self.activities),
-            use_device=None,
+            use_device=self.use_device
+            if (ProfilerActivity.PrivateUse1 in self.activities)
+            else None,
             record_shapes=self.record_shapes,
             with_flops=self.with_flops,
             profile_memory=self.profile_memory,
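Because use_device is forwarded only when ProfilerActivity.PrivateUse1 was requested, table building can distinguish CPU-only runs from runs carrying custom-device data. Continuing the hypothetical "foo" example, the parsed events then expose the privateuse1_* fields used above:

# Inspect per-op custom-device totals on the profile from the earlier sketch.
for evt in prof.key_averages():
    if evt.privateuse1_time_total > 0 or evt.self_privateuse1_memory_usage > 0:
        print(evt.key, evt.privateuse1_time_total, evt.self_privateuse1_memory_usage)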