Summary:
CUDA Post: https://fb.workplace.com/groups/ai.efficiency.tools.users/permalink/2020094788475989/

# Context
This diff enables the on-demand mode of the memory snapshot so that users can trace any remote process via the dyno command line.

# Design decisions
**How do we send the on-demand signal to a remote process?**
We leverage the dyno-Kineto approach. Since dyno runs on every machine at Meta, it can send a request to the remote machine to start Kineto. Kineto then starts a separate thread for the memory profiler (https://fburl.com/code/dxsmmrok).

**Why do we use a different approach than CUDA?**
On the CUDA side, we use pybind to load the torch module and invoke the Python API to start/stop profiling. However, this requires compiling the whole torch binary into the predictor, which the runtime team (andruwang) does not recommend. We therefore call the C++ API directly to avoid the unnecessary dependency.

**Why is the snapshot saved directly as a JSON string instead of pickle?**
Pickle is designed primarily for Python and is not well supported in C++. It is also hard for users to download the snapshot file and open it locally: due to dependency issues, importing the gzip/pickle libraries to decode the data is difficult. So we use JSON for now. I will later update the visualizer to speed up rendering and to support other formats.

**Plan:**
* For now, encode the file as gzip for MTIA on-demand only, and update the visualizer to support both formats.
* Update auto-trace and the CUDA side to encode as gzip as well.
* Fully remove the pickle dependency.

Test Plan:
# Remote cogwheel test
Servicelab: https://fburl.com/servicelab/pckux7a3
Snapshot file in manifold: https://fburl.com/manifold/fnotk18c
Snapshot file in pastry: P1805522232
Visualization on D74399684
{F1977786422}

# Local Predictor Test
url: https://fburl.com/pytorch_memory_visualizer/y06kskkm
{F1977787329}

Differential Revision: D74179606

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153171
Approved by: https://github.com/sraikund16
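For orientation, here is a minimal sketch (illustration only, not part of this diff) of the call sequence the daemon-driven path exercises against the client interface implemented below. The method names match the `LibKinetoClient` overrides in this file; the wrapper function and output path are hypothetical:

```cpp
// Illustration only: the on-demand memory-snapshot sequence as driven by
// the Kineto daemon. Method names match the LibKinetoClient overrides in
// this file; the wrapper function and the output path are hypothetical.
void demo_memory_snapshot(libkineto::ClientInterface& client) {
  client.init();                  // registers the MTIA memory profiler
  client.start_memory_profile();  // begin recording allocations
  // ... target workload runs in the traced process ...
  client.stop_memory_profile();   // stop recording
  client.export_memory_profile("/tmp/snapshot.json");  // JSON, not pickle
}
```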
#ifdef USE_KINETO
#include <ATen/Context.h>
#include <libkineto.h>
#include <torch/csrc/autograd/profiler_kineto.h>
#include <torch/csrc/mtia/profiler/MTIAMemoryProfiler.h>
#include <torch/csrc/profiler/kineto_client_interface.h>
#include <chrono>
#include <thread>

// On-demand tracing is not supported on Apple or edge platforms
#if defined(__APPLE__) || defined(EDGE_PROFILER_USE_KINETO)
#define ENABLE_GLOBAL_OBSERVER (0)
#else
#define ENABLE_GLOBAL_OBSERVER (1)
#endif

namespace torch {

namespace profiler::impl {

namespace {

using namespace torch::autograd::profiler;

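// LibKinetoClient implements libkineto's ClientInterface so the Kineto
// daemon (dyno) can drive on-demand CPU and memory profiling from inside
// the target process, without going through the Python API.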
class LibKinetoClient : public libkineto::ClientInterface {
 public:
  void init() override {
    ::torch::mtia::initMemoryProfiler();
  }

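  // Caches the requested profiler options; they take effect when start()
  // builds the ProfilerConfig.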
  void prepare(
      bool report_input_shapes = false,
      bool profile_memory = false,
      bool with_stack = false,
      bool with_flops = false,
      bool with_modules = false) override {
    reportInputShapes_ = report_input_shapes;
    profileMemory_ = profile_memory;
    withStack_ = with_stack;
    withFlops_ = with_flops;
    withModules_ = with_modules;
  }

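  // Starts an on-demand Kineto profile using the cached options, observing
  // CPU activity for function, user, and backward-function record scopes.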
  void start() override {
    ProfilerConfig cfg{
        ProfilerState::KINETO_ONDEMAND,
        /*report_input_shapes=*/reportInputShapes_,
        /*profile_memory=*/profileMemory_,
        /*with_stack=*/withStack_,
        /*with_flops=*/withFlops_,
        /*with_modules=*/withModules_};
    std::set<ActivityType> activities{ActivityType::CPU};
    std::unordered_set<at::RecordScope> scopes;
    scopes.insert(at::RecordScope::FUNCTION);
    scopes.insert(at::RecordScope::USER_SCOPE);
    scopes.insert(at::RecordScope::BACKWARD_FUNCTION);
    enableProfiler(cfg, activities, scopes);
  }

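  // Tears down the profiler started in start(); the result object is
  // intentionally discarded here.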
  void stop() override {
    (void)disableProfiler();
  }

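  // The memory-profile hooks below drive the MTIA memory profiler that
  // init() registered; the Kineto daemon invokes them to take on-demand
  // memory snapshots (see the summary above).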
  void start_memory_profile() override {
    LOG(INFO) << "Starting on-demand memory profile";
    startMemoryProfile();
  }

  void stop_memory_profile() override {
    LOG(INFO) << "Stopping on-demand memory profile";
    stopMemoryProfile();
  }

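  // Writes the captured snapshot to `path`; per the summary above, the
  // on-demand snapshot is serialized as JSON rather than pickle.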
  void export_memory_profile(const std::string& path) override {
    exportMemoryProfile(path);
  }

 private:
  // Temporarily disable shape collection until
  // we re-roll out the feature for on-demand cases
  bool reportInputShapes_{false};
  bool profileMemory_{false};
  bool withStack_{false};
  bool withFlops_{false};
  bool withModules_{false};
};

} // namespace

} // namespace profiler::impl

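// Initializes libkineto at startup when the KINETO_USE_DAEMON environment
// variable is set, so the process can accept on-demand tracing requests.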
void global_kineto_init() {
#if ENABLE_GLOBAL_OBSERVER
  if (c10::utils::get_env("KINETO_USE_DAEMON").has_value()) {
    libkineto_init(
        /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
        /*logOnError=*/true);
    libkineto::api().suppressLogMessages();
  }
#endif
}

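// Registers a process-wide LibKinetoClient with libkineto at static
// initialization time.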
#if ENABLE_GLOBAL_OBSERVER
namespace {

struct RegisterLibKinetoClient {
  RegisterLibKinetoClient() {
    static profiler::impl::LibKinetoClient client;
    libkineto::api().registerClient(&client);
  }
} register_libkineto_client;

} // namespace
#endif

} // namespace torch

#endif // USE_KINETO