[Profiler][PrivateUse1] Profiler support PrivateUse1 key (#120556)
Summary:
1. Package the public kineto headers when USE_KINETO is set so that they can be used by PrivateUse1 users.
2. Add a PrivateUse1 key to ActivityType.
3. Support the PrivateUse1 key in deviceTypeFromActivity and _supported_activities.
4. Fix some bugs when processing profiler results.

Co-authored-by: albanD <desmaison.alban@gmail.com>
Co-authored-by: Aaron Shi <enye.shi@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120556
Approved by: https://github.com/aaronenyeshi
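A minimal usage sketch of what this enables (the backend name "foo" and the workload are placeholders; it assumes an out-of-tree PrivateUse1 backend is installed and registered):

    import torch
    from torch.profiler import ProfilerActivity, profile, supported_activities

    activities = [ProfilerActivity.CPU]
    # Backends that rename PrivateUse1 are reported here after this change.
    if ProfilerActivity.PrivateUse1 in supported_activities():
        activities.append(ProfilerActivity.PrivateUse1)

    with profile(activities=activities, profile_memory=True) as prof:
        x = torch.randn(128, 128, device="foo")  # "foo" is a placeholder backend name
        y = x @ x

    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))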
Committed by: PyTorch MergeBot
Parent: 6f4c7eeb08
Commit: 41613a0803
 setup.py | 6 ++++++
@@ -1386,6 +1386,12 @@ def main():
                 "include/tensorpipe/transport/uv/*.h",
             ]
         )
+    if get_cmake_cache_vars()["USE_KINETO"]:
+        torch_package_data.extend(
+            [
+                "include/kineto/*.h",
+            ]
+        )
     torchgen_package_data = [
         # Recursive glob doesn't work in setup.py,
         # https://github.com/pypa/setuptools/issues/1806
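With USE_KINETO builds, the kineto headers now ship inside the torch wheel. A sketch of how an out-of-tree PrivateUse1 extension's setup.py might locate them, assuming they land under <torch>/include/kineto as the include/kineto/*.h glob above suggests:

    import os
    from torch.utils.cpp_extension import include_paths

    # Hypothetical: collect the packaged kineto header directory, if present.
    kineto_include_dirs = [
        os.path.join(p, "kineto")
        for p in include_paths()
        if os.path.isdir(os.path.join(p, "kineto"))
    ]
    print(kineto_include_dirs)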
@@ -270,7 +270,6 @@ class profile:
                 self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
             else:
                 self.kineto_activities.add(ProfilerActivity.PrivateUse1)
-                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1

         assert (
             len(self.kineto_activities) > 0
@@ -317,6 +316,10 @@ class profile:
             return
         if self.use_cuda:
             torch.cuda.synchronize()
+        elif self.use_device and hasattr(torch, self.use_device):
+            privateuse1_module = getattr(torch, self.use_device)
+            if hasattr(privateuse1_module, "synchronize"):
+                privateuse1_module.synchronize()

         t0 = perf_counter_ns()
         self.kineto_results = _disable_profiler()
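The new elif branch resolves the device module via getattr(torch, self.use_device) and calls synchronize() on it if present. A sketch of the backend side that would satisfy this lookup (the backend name "foo" and the no-op module are assumptions for illustration, not part of this commit):

    import types
    import torch

    torch.utils.rename_privateuse1_backend("foo")  # expose PrivateUse1 under the name "foo"

    foo = types.ModuleType("foo")
    def synchronize():
        # A real backend would block until all queued device work has finished.
        pass
    foo.synchronize = synchronize
    torch._register_device_module("foo", foo)  # profiler can now find torch.foo.synchronize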
@@ -542,7 +545,10 @@ class profile:
                     and fe.id in device_corr_map
                 ):
                     for f_evt in device_corr_map[fe.id]:
-                        if f_evt.device_type == DeviceType.CUDA:
+                        if (
+                            f_evt.device_type == DeviceType.CUDA
+                            or f_evt.device_type == DeviceType.PrivateUse1
+                        ):
                             fe.append_kernel(
                                 f_evt.name,
                                 f_evt.device_index,
@@ -598,7 +598,7 @@ class FunctionEvent(FormattedTimesMixin):
                 [child.privateuse1_time_total for child in self.cpu_children]
             )
         else:
-            assert self.device_type == DeviceType.CUDA
+            assert self.device_type == DeviceType.PrivateUse1
             return self.privateuse1_time_total

     @property
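Per-op device time is exposed through the privateuse1_time_total property touched above. A small helper sketch (the helper name is hypothetical) that pulls it out of collected events, assuming key_averages() entries carry the same attribute:

    def privateuse1_summary(prof):
        """Return {op name: total PrivateUse1 time in us} for ops that ran on the device."""
        return {
            evt.key: evt.privateuse1_time_total
            for evt in prof.key_averages()
            if evt.privateuse1_time_total > 0
        }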
@@ -867,7 +867,10 @@ def _build_table(
         event.self_privateuse1_memory_usage > 0 for event in events
     )
     use_device = events[0].use_device
-    if not use_device and (has_privateuse1_mem or has_privateuse1_time):
+    # Running on PrivateUse1 device with profiler but not enable
+    # ProfilerActivity.PrivateUse1 can also catch privateuse1 memory usage.
+    # Here only need to check has_privateuse1_time if not use_device.
+    if not use_device and has_privateuse1_time:
         raise RuntimeError(
             "use_device is None, but there is private device performance data."
         )
@@ -951,7 +954,7 @@ def _build_table(
                 "Self CUDA Mem",
             ]
         )
-    if has_privateuse1_mem:
+    if use_device and has_privateuse1_mem:
         privateuse1 = use_device.upper()
         headers.extend(
             [
@@ -1132,7 +1135,7 @@ def _build_table(
                     _format_memory(evt.self_cuda_memory_usage),
                 ]
             )
-        if has_privateuse1_mem:
+        if use_device and has_privateuse1_mem:
             row_values.extend(
                 [
                     # PrivateUse1 Mem Total
@@ -332,6 +332,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
     if (at::hasMTIA()) {
       activities.insert(torch::profiler::impl::ActivityType::MTIA);
     }
+    if (c10::get_privateuse1_backend() != "privateuseone") {
+      activities.insert(torch::profiler::impl::ActivityType::PrivateUse1);
+    }
 #endif
     return activities;
   });
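The gate c10::get_privateuse1_backend() != "privateuseone" means PrivateUse1 is only reported once the backend has been renamed. A sketch of observing this from Python in a kineto-enabled build (the name "foo" is a placeholder):

    import torch
    from torch.profiler import ProfilerActivity, supported_activities

    print(ProfilerActivity.PrivateUse1 in supported_activities())  # False before renaming

    torch.utils.rename_privateuse1_backend("foo")
    print(ProfilerActivity.PrivateUse1 in supported_activities())  # expected True after the rename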
@@ -555,7 +555,9 @@ void prepareProfiler(
           config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK,
       "Supported only in Kineto profiler");
   torch::profiler::impl::kineto::prepareTrace(
-      /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
+      /*cpuOnly=*/!(
+          at::hasCUDA() || at::hasXPU() || at::hasMTIA() ||
+          c10::get_privateuse1_backend() != "privateuseone"),
       activities,
       config.experimental_config);

@@ -25,6 +25,8 @@ const std::set<libkineto::ActivityType> kCpuTypes{
     libkineto::ActivityType::CUDA_RUNTIME,
     libkineto::ActivityType::CUDA_DRIVER,
     libkineto::ActivityType::PYTHON_FUNCTION,
+    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
+    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
 };

 const std::set<libkineto::ActivityType> kCudaTypes = {
@@ -47,6 +49,15 @@ const std::set<libkineto::ActivityType> kMtiaTypes = {
     libkineto::ActivityType::MTIA_CCP_EVENTS,
     libkineto::ActivityType::MTIA_RUNTIME,
 };
+const std::set<libkineto::ActivityType> kPrivateUse1Types = {
+    libkineto::ActivityType::GPU_MEMCPY,
+    libkineto::ActivityType::GPU_MEMSET,
+    libkineto::ActivityType::GPU_USER_ANNOTATION,
+    libkineto::ActivityType::CONCURRENT_KERNEL,
+    // PRIVATEUSE1_RUNTIME appears in both kCpuTypes and kPrivateUse1Types.
+    libkineto::ActivityType::PRIVATEUSE1_RUNTIME,
+    libkineto::ActivityType::PRIVATEUSE1_DRIVER,
+};
 } // namespace
 #endif // USE_KINETO

@@ -248,6 +259,9 @@ void prepareTrace(
   if (collectivesProfilerExists()) {
     k_activities.insert(libkineto::ActivityType::COLLECTIVE_COMM);
   }
+  if (activities.count(torch::autograd::profiler::ActivityType::PrivateUse1)) {
+    k_activities.insert(kPrivateUse1Types.begin(), kPrivateUse1Types.end());
+  }

   ExperimentalConfigWrapper configWrap(config);

|
|||||||
case libkineto::ActivityType::GPU_USER_ANNOTATION:
|
case libkineto::ActivityType::GPU_USER_ANNOTATION:
|
||||||
case libkineto::ActivityType::CUDA_PROFILER_RANGE:
|
case libkineto::ActivityType::CUDA_PROFILER_RANGE:
|
||||||
// TODO: T151322015
|
// TODO: T151322015
|
||||||
case libkineto::ActivityType::MTIA_CCP_EVENTS:
|
case libkineto::ActivityType::MTIA_CCP_EVENTS: {
|
||||||
return c10::DeviceType::CUDA;
|
// PrivateUse1 kineto backend reuse above ActivityTypes,
|
||||||
|
// If PrivateUse1 backend enabled, this should return
|
||||||
|
// c10::DeviceType::PrivateUse1.
|
||||||
|
c10::DeviceType device_type = []() {
|
||||||
|
if (c10::get_privateuse1_backend() != "privateuseone") {
|
||||||
|
return c10::DeviceType::PrivateUse1;
|
||||||
|
}
|
||||||
|
return c10::DeviceType::CUDA;
|
||||||
|
}();
|
||||||
|
return device_type;
|
||||||
|
}
|
||||||
case libkineto::ActivityType::CPU_OP:
|
case libkineto::ActivityType::CPU_OP:
|
||||||
case libkineto::ActivityType::USER_ANNOTATION:
|
case libkineto::ActivityType::USER_ANNOTATION:
|
||||||
case libkineto::ActivityType::EXTERNAL_CORRELATION:
|
case libkineto::ActivityType::EXTERNAL_CORRELATION:
|
||||||
@@ -347,6 +371,8 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
     case libkineto::ActivityType::MTIA_RUNTIME:
     case libkineto::ActivityType::PYTHON_FUNCTION:
     case libkineto::ActivityType::CUDA_DRIVER:
+    case libkineto::ActivityType::PRIVATEUSE1_RUNTIME:
+    case libkineto::ActivityType::PRIVATEUSE1_DRIVER:
       return c10::DeviceType::CPU;
     default: {
       TORCH_WARN(
@@ -17,6 +17,7 @@ enum class C10_API_ENUM ActivityType {
   XPU, // XPU kernels, runtime
   CUDA, // CUDA kernels, runtime
   MTIA, // MTIA kernels, runtime
+  PrivateUse1, // PrivateUse1 kernels, runtime
   NUM_KINETO_ACTIVITIES, // must be the last one
 };

@@ -325,7 +325,8 @@ void initPythonBindings(PyObject* module) {
       .value("CPU", ActivityType::CPU)
       .value("XPU", ActivityType::XPU)
       .value("MTIA", ActivityType::MTIA)
-      .value("CUDA", ActivityType::CUDA);
+      .value("CUDA", ActivityType::CUDA)
+      .value("PrivateUse1", ActivityType::PrivateUse1);

   py::class_<ExperimentalConfig>(m, "_ExperimentalConfig")
       .def(
@@ -143,7 +143,9 @@ class _KinetoProfile:
             use_cuda=(ProfilerActivity.CUDA in self.activities),
             use_cpu=(ProfilerActivity.CPU in self.activities),
             use_mtia=(ProfilerActivity.MTIA in self.activities),
-            use_device=None,
+            use_device=self.use_device
+            if (ProfilerActivity.PrivateUse1 in self.activities)
+            else None,
             record_shapes=self.record_shapes,
             with_flops=self.with_flops,
             profile_memory=self.profile_memory,