#ifdef _WIN32 #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #include #include #else #include #endif // _WIN32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_DISTRIBUTED #include #endif // USE_DISTRIBUTED using namespace at; // Collective property attributes // https://github.com/pytorch/pytorch/issues/124674 #ifdef USE_DISTRIBUTED constexpr auto kETCommsName = "collective_name"; constexpr auto kETInMsgNelems = "in_msg_nelems"; constexpr auto kETOutMsgNelems = "out_msg_nelems"; constexpr auto kETInSplit = "in_split_size"; constexpr auto kETOutSplit = "out_split_size"; constexpr auto kETGlobalRankStart = "global_rank_start"; constexpr auto kETGlobalRankStride = "global_rank_stride"; constexpr auto kETGroupSize = "pg_size"; constexpr auto kETProcessGroupName = "pg_name"; constexpr auto kETProcessGroupDesc = "pg_desc"; #endif // USE_DISTRIBUTED namespace torch::profiler::impl { //****************************************************************************** // JSON output utility functions. To be merged with PyTorch profiler. //****************************************************************************** template static std::string vectorToString(const std::vector& v) { return fmt::format("[{}]", fmt::join(v, ",")); } static std::string json_str_escape(const std::string& str); constexpr size_t kMaxNumElements = 4096; static std::string getScalarValue(const c10::IValue& val) { if (val.isDouble()) { double d_val = val.toDouble(); if (std::isinf(d_val) || std::isnan(d_val)) { return fmt::format("\"{}\"", std::to_string(d_val)); } else { return std::to_string(d_val); } } else if (val.isInt()) { return std::to_string(val.toInt()); } else if (val.isBool()) { return val.toBool() ? "true" : "false"; } else if (val.isString()) { const std::string& str_val = val.toStringRef(); return fmt::format("\"{}\"", json_str_escape(str_val)); } else if (val.isDevice()) { return fmt::format("\"{}\"", val.toDevice().str()); } return fmt::format("\"<{}>\"", val.tagKind()); } static int32_t processId() { #ifndef _WIN32 return static_cast(getpid()); #else return static_cast(GetCurrentProcessId()); #endif } //****************************************************************************** // Main ExecutionTraceObserver implementation. //****************************************************************************** // ExecutionTraceObserver contains all the states of the observer. Some of them // are shared between the enter and exit RecordFunction call backs, some data // like the `opStack` may be accessed across different threads. So we should be // careful about data races. A global mutex `gMutex` is used avoid these races // at the cost of performance in large number of threads situations. We may // optimize this further to thread local, fine-grained locking, or use thread // safe containers. struct TORCH_API ExecutionTraceObserver { // NOLINT using ID = size_t; // Mapping of each thread to its own operator stack std::map> opStack{}; // Uses the underlying TensorImpl object pointer as the key and map to its // unique id. std::map objectId{}; using weak_storage_ptr = c10::weak_intrusive_ptr; std::unordered_map data_ptr_to_storage_id{}; std::unordered_map data_ptr_to_weak_storage_ptr{}; ID get_tensor_storage_ID(const c10::Storage& t_storage) { const std::lock_guard lock(gMutex); const void* raw_data_ptr = t_storage.data(); auto iter = data_ptr_to_weak_storage_ptr.find(raw_data_ptr); if (iter == data_ptr_to_weak_storage_ptr.end()) { ID id = storage_id_++; data_ptr_to_storage_id.emplace(raw_data_ptr, id); data_ptr_to_weak_storage_ptr.emplace( raw_data_ptr, t_storage.getWeakStorageImpl()); return id; } else { // check if the storage is still alive if (iter->second.expired()) { ID id = storage_id_++; // std::unorder_map does not change if the key is already in the map. // So we need to remove the key and insert the key with the new value. data_ptr_to_storage_id.erase(raw_data_ptr); data_ptr_to_storage_id[raw_data_ptr] = id; data_ptr_to_weak_storage_ptr.erase(raw_data_ptr); data_ptr_to_weak_storage_ptr.emplace( raw_data_ptr, t_storage.getWeakStorageImpl()); return id; } else { return data_ptr_to_storage_id[raw_data_ptr]; } } } // Observer run state. enum class RunState { uninitialized, disabled, enabled }; // Mutex for multithreaded access to the shared containers. std::recursive_mutex gMutex{}; // Stream to write output JSON. std::ofstream out{}; // Full path to the output file. std::string fileName{}; std::string resourceDir{}; // RecordFunction callback handle for this observer. CallbackHandle cbHandle{INVALID_CALLBACK_HANDLE}; // Process ID. int32_t pid{-1}; std::string recordTime{}; ExecutionTraceObserver() = default; // Returns a new unique ID. ID getNewID() { return id_++; } RunState getState() const { return state_; } void setState(RunState newState) { if (state_ == RunState::uninitialized || callbackShouldBeEnabled(state_) != callbackShouldBeEnabled(newState)) { if (callbackShouldBeEnabled(newState)) { reenableCallback(cbHandle); } else { disableCallback(cbHandle); } } state_ = newState; } bool record_integral_tensor_range{false}; std::unordered_set nodeListForSavingIntegerTensor{}; private: static bool callbackShouldBeEnabled(RunState run_state) { return run_state == ExecutionTraceObserver::RunState::enabled; } // Must use accessors to change this so that we can keep the // RecordFunction callback in sync with the state. RunState state_{RunState::uninitialized}; // All tensors and operators have an unique id assigned. Increment id for each // new tensor or operator node. // 0 -> uninitialized // 1 -> root ID // 2 ... -> regular node ID std::atomic id_{2}; std::atomic storage_id_{1}; }; // Using a singleton manager here to allow init and delete the observer object. using ObserverManager = GlobalStateManager; // Uninitialized node has id = 0 const ExecutionTraceObserver::ID kUninitializedId{0}; // Root node has id = 1 const ExecutionTraceObserver::ID kRootId{1}; struct FunctionCallContext : public ObserverContext { // NOLINT std::string name; std::string kernelBackend; std::string kernelFile; ExecutionTraceObserver::ID opId{kUninitializedId}; ExecutionTraceObserver::ID parentId{kUninitializedId}; ExecutionTraceObserver::ID fwParentId{kUninitializedId}; std::vector inputTypes; std::vector inputShapes; std::vector inputStrides; std::vector inputValues; std::map> tensor_index_min_max_map; std::string get_string_for_tensor_range() { if (tensor_index_min_max_map.empty()) { return ""; } std::string result = "{"; unsigned int i = 0; for (auto const& [key, value] : tensor_index_min_max_map) { if (i == tensor_index_min_max_map.size() - 1) { result += json_str_escape( fmt::format("\"{}\":[{},{}]", key, value.first, value.second)); } else { result += json_str_escape( fmt::format("\"{}\":[{},{}],", key, value.first, value.second)); } i++; } result += "}"; return result; } }; // Opens the json file to write the execution trace. static std::ofstream openOutputFile(const std::string& name) { std::ofstream stream; stream.open(name, std::ofstream::out | std::ofstream::trunc); if (!stream) { LOG(ERROR) << "Failed to open '" << name << "'"; } else { VLOG(1) << "PyTorch Execution Trace: writing to " << name; } return stream; } #ifdef USE_DISTRIBUTED static std::string getAttrJson( const std::string& name, const std::string& type, const std::string& value) { // note name and type are not quoted but value should be if it is a string. return fmt::format( R"JSON( {{"name": "{}", "type": "{}", "value": {}}})JSON", name, type, value); } #endif static void writeJsonNode( std::ofstream& out, const std::string& name, const uint64_t id, const uint64_t rf_id, const uint64_t parent, const uint64_t fw_parent, const int64_t seq_id, const uint64_t scope, const uint64_t tid, const uint64_t fw_tid, const std::string& inputs = "[]", const std::string& inputShapes = "[]", const std::string& inputStrides = "[]", const std::string& inputTypes = "[]", const std::string& outputs = "[]", const std::string& output_shapes = "[]", const std::string& output_strides = "[]", const std::string& output_types = "[]", const std::string& operator_schema = "", const std::string& kernelBackend = "", const std::string& kernelFile = "", const std::string& tensor_range = "", const std::string& additiona_attrs = "") { if (!out.is_open() || out.fail() || out.bad()) { return; } try { out << fmt::format( R"JSON( {{ "id": {}, "name": "{}", "ctrl_deps": {}, "inputs": {{"values": {}, "shapes": {}, "types": {}, "strides": {}}}, "outputs": {{"values": {}, "shapes": {}, "types": {}, "strides": {}}}, "attrs": [{{"name": "rf_id", "type": "uint64", "value": {}}},{{"name": "fw_parent", "type": "uint64", "value": {}}},{{"name": "seq_id", "type": "int64", "value": {}}},{{"name": "scope", "type": "uint64", "value": {}}},{{"name": "tid", "type": "uint64", "value": {}}},{{"name": "fw_tid", "type": "uint64", "value": {}}},{{"name": "op_schema", "type": "string", "value": "{}"}},{{"name": "kernel_backend", "type": "string", "value": "{}"}},{{"name": "kernel_file", "type": "string", "value": "{}"}},{{"name": "tensor_range", "type": "string", "value": "{}"}}{}] }})JSON", id, name, parent, inputs, inputShapes, inputTypes, inputStrides, outputs, output_shapes, output_types, output_strides, rf_id, fw_parent, seq_id, scope, tid, fw_tid, operator_schema, kernelBackend, kernelFile, tensor_range, additiona_attrs); } catch (const std::exception& e) { LOG(ERROR) << "Failed to write json node to execution trace: " << e.what(); } } static std::string timeString(const std::time_t timepoint) { std::ostringstream oss; oss << std::put_time(std::localtime(&timepoint), "%Y-%m-%d %X"); // NOLINT return oss.str(); } static bool initExecutionTraceStart(ExecutionTraceObserver& ob) { ob.out = openOutputFile(ob.fileName); // If somehow the output stream failed to open, finish observer here. if (!ob.out) { LOG(WARNING) << "Failed to open output file: " << ob.fileName; return false; } // Wall clock time for the first op collection time. const auto current_time = std::chrono::system_clock::now(); ob.recordTime = timeString(std::chrono::system_clock::to_time_t(current_time)); // Start timestamp using steady_clock for measurement. const auto timestamp = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count(); ob.out << fmt::format( R"JSON({{ "schema": "1.1.1-chakra.0.0.4", "pid": {}, "time": "{}", "start_ts": {}, "nodes": [)JSON", ob.pid, ob.recordTime, timestamp); return true; } // Write out Execution Trace to file static void finalizeExecutionTraceOutput(ExecutionTraceObserver& ob) { writeJsonNode( ob.out, "[pytorch|profiler|execution_trace|process]", kRootId, 0, // rf_id kRootId, // parent is self 0, // fw_parent -1, // seq_id static_cast>(RecordScope::USER_SCOPE), 0, // tid 0); // fw_tid // Finish timestamp using steady_clock for measurement. const auto timestamp = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count(); ob.out << fmt::format( R"JSON( ], "finish_ts": {} }})JSON", timestamp); ob.out.close(); VLOG(1) << "PyTorch Execution Trace: written to file " << ob.fileName; } static ExecutionTraceObserver::ID getObjectID( ExecutionTraceObserver& ob, const void* t) { const std::lock_guard lock(ob.gMutex); auto iter = ob.objectId.find(t); if (iter == ob.objectId.end()) { ExecutionTraceObserver::ID objectId = ob.getNewID(); ob.objectId[t] = objectId; return objectId; } return iter->second; } static void dumpTensorData2File( std::string& tensor_dump_file_name, at::Tensor& tensor_on_host) { std::fstream fs; fs.open(tensor_dump_file_name, std::fstream::out | std::fstream::binary); if (fs.is_open()) { auto* tensor_impl = tensor_on_host.unsafeGetTensorImpl(); size_t tensor_offset = tensor_impl->storage_offset(); size_t tensor_nbyte = tensor_impl->numel() * tensor_impl->itemsize(); fs.write( (const char*)tensor_impl->storage().data() + tensor_offset, (long)tensor_nbyte); } } static std::tuple convertIValue( ExecutionTraceObserver& ob, const std::string& functionName, ExecutionTraceObserver::ID opId, int& tensorIndex, std::map>& tensor_index_min_max_map, bool isInput, const c10::IValue& val, const bool baseType = true, const size_t maxArrayLen = kMaxNumElements) { std::string type = val.tagKind(); if (val.isTensor()) { std::string tensor_shape, tensor_stride, tensor_type, tensor_value; const auto& tensor = val.toTensor(); const auto tensor_impl = tensor.unsafeGetTensorImpl(); if (tensor.defined() && !tensor_impl->has_symbolic_sizes_strides()) { // tensor shape tensor_shape = vectorToString(tensor.sizes().vec()); // tensor strides tensor_stride = vectorToString(tensor.strides().vec()); } else { tensor_shape = "[]"; tensor_stride = "[]"; } // tensor dtype type = type + fmt::format("({})", std::string(tensor.dtype().name())); tensor_type = baseType ? fmt::format("\"{}\"", type) : type; ExecutionTraceObserver::ID tensor_id = getObjectID(ob, tensor_impl); ExecutionTraceObserver::ID storage_id = 0; size_t offset = 0; size_t numel = 0; size_t itemsize = 0; std::string device_str; // symbolic sizes/strides implies t->storage_offset() will fail if (tensor_impl->has_storage() && !tensor_impl->has_symbolic_sizes_strides()) { const c10::Storage& t_storage = tensor_impl->storage(); storage_id = ob.get_tensor_storage_ID(t_storage); offset = tensor_impl->storage_offset(); numel = tensor_impl->numel(); itemsize = tensor_impl->itemsize(); device_str = tensor_impl->device().str(); if (isInput && at::isIntegralType(tensor.scalar_type(), false) && tensor.numel() != 0) { enableRecordFunction(false); if (ob.nodeListForSavingIntegerTensor.find(functionName) != ob.nodeListForSavingIntegerTensor.end() && !ob.resourceDir.empty()) { std::string tensor_dump_file_name = ob.resourceDir + "/nid_" + std::to_string(opId) + "_tid_" + std::to_string(tensorIndex) + ".dat"; auto tensor_on_host = tensor.cpu(); dumpTensorData2File(tensor_dump_file_name, tensor_on_host); } if (ob.record_integral_tensor_range) { long min = tensor.min().item().toLong(); long max = tensor.max().item().toLong(); tensor_index_min_max_map[tensorIndex] = std::make_pair(min, max); } enableRecordFunction(true); } } tensorIndex++; tensor_value = fmt::format( "[{},{},{},{},{},\"{}\"]", tensor_id, storage_id, offset, numel, itemsize, device_str); return std::make_tuple( tensor_shape, tensor_stride, tensor_type, tensor_value); } else if (val.isTuple()) { const auto& val_tuple = val.toTupleRef().elements(); size_t tuple_size = val_tuple.size(); std::vector shape_array; std::vector stride_array; std::vector type_array; std::vector value_array; for (const auto j : c10::irange(tuple_size)) { auto tuple = convertIValue( ob, functionName, opId, tensorIndex, tensor_index_min_max_map, isInput, val_tuple[j], false, maxArrayLen); shape_array.push_back(std::get<0>(tuple)); stride_array.push_back(std::get<1>(tuple)); type_array.push_back(std::get<2>(tuple)); value_array.push_back(std::get<3>(tuple)); } type = type + vectorToString(type_array); std::string tensor_type = baseType ? fmt::format("\"{}\"", type) : type; return std::make_tuple( vectorToString(shape_array), vectorToString(stride_array), tensor_type, vectorToString(value_array)); } else if (val.isList()) { const auto& val_list = val.toList(); size_t list_size = val_list.size(); std::vector shape_array; std::vector stride_array; std::vector type_array; std::vector value_array; for (const auto j : c10::irange(list_size)) { auto tuple = convertIValue( ob, functionName, opId, tensorIndex, tensor_index_min_max_map, isInput, val_list.get(j), false, maxArrayLen); shape_array.push_back(std::get<0>(tuple)); stride_array.push_back(std::get<1>(tuple)); type_array.push_back(std::get<2>(tuple)); value_array.push_back(std::get<3>(tuple)); if (j >= maxArrayLen) { LOG(WARNING) << "list size=" << val_list.size() << " exceeded maxArrayLen=" << maxArrayLen; break; } } type = type + vectorToString(type_array); std::string tensor_type = baseType ? fmt::format("\"{}\"", type) : type; return std::make_tuple( vectorToString(shape_array), vectorToString(stride_array), tensor_type, vectorToString(value_array)); } else { std::string tensor_shape = "[]"; std::string tensor_stride = "[]"; std::string tensor_type = baseType ? fmt::format("\"{}\"", type) : type; std::string tensor_value = getScalarValue(val); return std::make_tuple( tensor_shape, tensor_stride, tensor_type, tensor_value); } } static void appendValueInfo( ExecutionTraceObserver& ob, const std::string& functionName, ExecutionTraceObserver::ID opId, int& tensorIndex, std::map>& tensor_index_min_max_map, bool isInput, const c10::IValue& val, std::vector& shapes, std::vector& strides, std::vector& types, std::vector& values) { auto tuple = convertIValue( ob, functionName, opId, tensorIndex, tensor_index_min_max_map, isInput, val, true); shapes.push_back(std::get<0>(tuple)); strides.push_back(std::get<1>(tuple)); types.push_back(std::get<2>(tuple)); values.push_back(std::get<3>(tuple)); } static void handleKernelBackendInfo( FunctionCallContext& fc, const RecordFunction& fn) { // triton kernel related information are in kwinputs const auto& kwinputs = fn.kwinputs(); if (kwinputs.find("kernel_backend") != kwinputs.end()) { fc.kernelBackend = kwinputs.at("kernel_backend").toStringRef(); if (fc.kernelBackend == "triton") { fc.kernelFile = kwinputs.at("kernel_file").toStringRef(); TORCH_INTERNAL_ASSERT( kwinputs.find("kernel_file") != kwinputs.end(), "kernel file is missing in triton kernel"); // Remove the path of the file name if (fc.kernelFile.find_last_of('/') != std::string::npos) { fc.kernelFile = fc.kernelFile.substr(fc.kernelFile.find_last_of('/') + 1); } // get stream information TORCH_INTERNAL_ASSERT( kwinputs.find("stream") != kwinputs.end(), "stream is missing in triton kernel"); fc.inputValues.emplace_back( std::to_string(kwinputs.at("stream").toInt())); fc.inputTypes.emplace_back("\"Int\""); fc.inputShapes.emplace_back("[]"); } } } // Additional attributes for commounication collectives inline std::string getCommsNodeAttrs(const RecordFunction& fn) { // NOLINT std::vector attrs; #ifdef USE_DISTRIBUTED // We rely on paramcommsdebug object that is available in thread local info auto debugInfo = dynamic_cast( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO)); if (debugInfo == nullptr) { LOG(WARNING) << "ParamCommsDebugInfo not available for function: " << fn.name(); return ", " + getAttrJson("debug", "string", "\"missing comms info\""); } // get NcclMeta from record function, this used ParamCommsDebugInfo above // since we currently have this read called in onFunctionExit flow, we // should only introspect output tensors to prevent an INTERNAL ASSERT // FAILED in RecordFunction when we try to read input in RecordFunction exit // methods. auto meta = saveNcclMeta(fn, SaveNcclMetaConfig(false, true, false, true)); auto addAttr = [&](const char* commsMetaName, const char* etMetaName, const char* type) { auto it = meta.find(commsMetaName); if (it != meta.end()) { attrs.push_back(getAttrJson(etMetaName, type, it->second)); } }; addAttr(kCommsName, kETCommsName, "string"); addAttr(kDtype, kDtype, "string"); addAttr(kInMsgNelems, kETInMsgNelems, "uint64"); addAttr(kOutMsgNelems, kETOutMsgNelems, "uint64"); // following two metadata are lists. addAttr(kInSplit, kETInSplit, "string"); addAttr(kOutSplit, kETOutSplit, "string"); addAttr(kGlobalRankStart, kETGlobalRankStart, "uint64"); addAttr(kGlobalRankStride, kETGlobalRankStride, "uint64"); // pg_name is a string. addAttr(kProcessGroupName, kETProcessGroupName, "string"); addAttr(kProcessGroupDesc, kETProcessGroupDesc, "string"); addAttr(kGroupSize, kETGroupSize, "uint64"); #endif // USE_DISTRIBUTED // XXX consider using as string stream? return attrs.empty() ? "" : fmt::format(", {}", fmt::join(attrs, ", ")); } static void recordOperatorStart( ExecutionTraceObserver& ob, FunctionCallContext& fc, const RecordFunction& fn) { auto tid = fn.threadId(); try { { const std::lock_guard lock(ob.gMutex); // if current thread stack is empty, push the root node to the stack // first if (ob.opStack[tid].empty()) { auto thread_node_id = ob.getNewID(); ob.opStack[tid].push(thread_node_id); writeJsonNode( ob.out, "[pytorch|profiler|execution_trace|thread]", thread_node_id, 0, // rf_id kRootId, 0, // fw_parent -1, // seq_id static_cast>( RecordScope::USER_SCOPE), tid, 0); // fw_tid ob.out << ","; } } // all input nodes should have id > opId fc.opId = ob.getNewID(); fc.name = fn.name(); if (!checkFunctionInputsForLogging(fn)) { return; } auto num_inputs = fn.num_inputs(); const auto inputs = fn.inputs(); // need to account for Stack mode where the inputs are at the end. size_t input_start = inputs.size() - num_inputs; // tensor_index is the index of the flattened tensor list for all input // tensors int tensor_index = 0; for (const auto i : c10::irange(input_start, inputs.size())) { appendValueInfo( ob, fc.name, fc.opId, tensor_index, fc.tensor_index_min_max_map, true, inputs[i], fc.inputShapes, fc.inputStrides, fc.inputTypes, fc.inputValues); } handleKernelBackendInfo(fc, fn); { const std::lock_guard lock(ob.gMutex); fc.parentId = ob.opStack[tid].top(); // get parent id from the forward stack, this can be different for // autograd ops, which may execute on a different thread than the // original thread (which should have the parent op on the stack). auto fw_tid = fn.forwardThreadId(); if (fw_tid != 0) { fc.fwParentId = ob.opStack[fw_tid].top(); } ob.opStack[tid].push(fc.opId); } } catch (const std::exception& e) { LOG(WARNING) << "Exception in execution trace observer: " << e.what(); } } static std::unique_ptr onFunctionEnter( const RecordFunction& fn) { using RunState = ExecutionTraceObserver::RunState; auto ob = ObserverManager::get(); if (ob != nullptr && ob->getState() == RunState::enabled) { // record op auto fc_ptr = std::make_unique(); recordOperatorStart(*ob, *fc_ptr.get(), fn); return fc_ptr; } return nullptr; } static std::string json_str_escape(const std::string& str) { std::ostringstream ostream; for (char ch : str) { if (ch == '"') { ostream << "\\\""; } else if (ch == '\\') { ostream << "\\\\"; } else if (ch == '\b') { ostream << "\\b"; } else if (ch == '\f') { ostream << "\\f"; } else if (ch == '\n') { ostream << "\\n"; } else if (ch == '\r') { ostream << "\\r"; } else if (ch == '\t') { ostream << "\\t"; } else if (ch <= '\x1f') { ostream << "\\u" << std::hex << std::setw(4) << std::setfill('0') << static_cast(ch); } else { ostream << ch; } } return ostream.str(); } static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) { using RunState = ExecutionTraceObserver::RunState; auto ob = ObserverManager::get(); if (ob == nullptr || ctx_ptr == nullptr) { return; } if (ob->getState() == RunState::enabled) { auto fc_ptr = dynamic_cast(ctx_ptr); // TORCH_INTERNAL_ASSERT(fc_ptr != nullptr); if (fc_ptr == nullptr) { LOG(WARNING) << "FunctionCallContext is nullptr."; return; } auto& fc = *fc_ptr; if (!checkFunctionOutputsForLogging(fn)) { return; } auto outputs = fn.outputs(); auto num_outputs = fn.num_outputs(); // need to account for Stack mode where the outputs are at the end. size_t output_start = outputs.size() - num_outputs; std::vector output_types; std::vector output_strides; std::vector output_shapes; std::vector output_values; try { int tensor_index = 0; for (const auto i : c10::irange(output_start, outputs.size())) { appendValueInfo( *ob, fc.name, fc.opId, tensor_index, fc.tensor_index_min_max_map, false, outputs.at(i), output_shapes, output_strides, output_types, output_values); } std::string op_schema_str{}; const auto op_schema = fn.operator_schema(); if (op_schema.has_value()) { op_schema_str = json_str_escape(c10::toString(op_schema.value())); } const std::string additiona_attrs = fn.isNcclMeta() ? getCommsNodeAttrs(fn) : ""; { const std::lock_guard lock(ob->gMutex); // remove current op id from stack ob->opStack[fn.threadId()].pop(); writeJsonNode( ob->out, fc.name, fc.opId, fn.handle(), fc.parentId, fc.fwParentId, fn.seqNr(), static_cast>(fn.scope()), fn.threadId(), fn.forwardThreadId(), vectorToString(fc.inputValues), vectorToString(fc.inputShapes), vectorToString(fc.inputStrides), vectorToString(fc.inputTypes), vectorToString(output_values), vectorToString(output_shapes), vectorToString(output_strides), vectorToString(output_types), op_schema_str, fc.kernelBackend, fc.kernelFile, fc.get_string_for_tensor_range(), additiona_attrs); ob->out << ","; } } catch (const std::exception& e) { LOG(WARNING) << "Exception in execution trace observer: [" << fc.name << " (" << fc.opId << ")] " << e.what(); } } } // Add execution trace observer callback functions to the RecordFunction // global observers. bool addExecutionTraceObserver(const std::string& output_file_path) { // Check if the observer is already initialized. if (ObserverManager::get() == nullptr) { ObserverManager::push(std::make_shared()); auto& ob = *ObserverManager::get(); ob.pid = processId(); // Set output ob.fileName = output_file_path; if (!initExecutionTraceStart(ob)) { return false; } // check if the environment variable is set to force recording integer // tensors auto env_variable = c10::utils::get_env( "ENABLE_PYTORCH_EXECUTION_TRACE_SAVE_INTEGRAL_TENSOR_RANGE"); if (env_variable.has_value()) { ob.record_integral_tensor_range = true; } // check if the environment variable is set to force recording integer // tensors env_variable = c10::utils::get_env( "ENABLE_PYTORCH_EXECUTION_TRACE_SAVE_INTEGRAL_TENSOR_DATA"); if (env_variable.has_value()) { std::istringstream stream(env_variable.value()); std::string token; while (std::getline(stream, token, ',')) { ob.nodeListForSavingIntegerTensor.insert(token); } } std::size_t ext_pos = ob.fileName.rfind(".json"); if (ext_pos != std::string::npos) { ob.resourceDir = ob.fileName; // 5 is the length of ".json" ob.resourceDir.replace(ext_pos, 5, "_resources/"); VLOG(1) << "Execution trace resource directory: " << ob.resourceDir << "\n"; } else { LOG(WARNING) << "Execution trace output file does not end with \".json\"."; } ob.cbHandle = addGlobalCallback( RecordFunctionCallback(&onFunctionEnter, &onFunctionExit) .needsInputs(true) .needsOutputs(true) .needsIds(true)); // Default to disabled. ob.setState(ExecutionTraceObserver::RunState::disabled); VLOG(1) << "PyTorch Execution Trace: added observer, output=" << output_file_path; } else if (ObserverManager::get()->cbHandle != INVALID_CALLBACK_HANDLE) { LOG(WARNING) << "Execution trace observer is already registered."; } return true; } void removeExecutionTraceObserver() { auto ob = ObserverManager::get(); if (ob != nullptr) { if (ob->getState() != ExecutionTraceObserver::RunState::disabled) { disableExecutionTraceObserver(); } if (ob->cbHandle != INVALID_CALLBACK_HANDLE) { finalizeExecutionTraceOutput(*ob); removeCallback(ob->cbHandle); ob->cbHandle = INVALID_CALLBACK_HANDLE; // Release the current ET observer object and reset. TORCH_INTERNAL_ASSERT( ObserverManager::pop() != nullptr, "Global state ptr cannot be null before resetting"); VLOG(1) << "PyTorch Execution Trace: removed observer"; } else { LOG(WARNING) << "Execution trace observer was not registered."; } } else { LOG(WARNING) << "Execution trace observer was not initialized."; } } void enableExecutionTraceObserver() { LOG(WARNING) << "Enabling Execution Trace Observer"; auto& ob = *ObserverManager::get(); // Make sure we are not already enabled. if (ob.getState() == ExecutionTraceObserver::RunState::enabled) { LOG(WARNING) << "Trying to enable Execution Trace Observer when it's already enabled."; } else { ob.setState(ExecutionTraceObserver::RunState::enabled); } } void disableExecutionTraceObserver() { LOG(WARNING) << "Disabling Execution Trace Observer"; auto& ob = *ObserverManager::get(); if (ob.getState() != ExecutionTraceObserver::RunState::disabled) { ob.setState(ExecutionTraceObserver::RunState::disabled); } else { LOG(WARNING) << "Trying to disable Execution Trace Observer when it's already disabled."; } } } // namespace torch::profiler::impl