From d3059b9c493827fe5825ea7485fc21bad83469ba Mon Sep 17 00:00:00 2001 From: Dmytro Dzhulgakov Date: Sun, 19 May 2019 23:01:09 -0700 Subject: [PATCH] Lightweight logging for once-only API usage --- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 1 + c10/core/TensorImpl.cpp | 8 +++- c10/macros/Macros.h | 17 +++++++-- c10/util/Logging.cpp | 50 +++++++++++++++++++++---- c10/util/Logging.h | 26 +++++++++++++ c10/util/Registry.h | 12 ------ c10/util/typeid.h | 34 ++++++++--------- caffe2/core/context_gpu.cu | 1 + caffe2/core/init.cc | 1 + caffe2/core/net.cc | 1 + caffe2/core/operator.cc | 1 + caffe2/core/operator.h | 2 +- caffe2/python/pybind_state.cc | 2 + torch/csrc/Module.cpp | 17 ++++++++- torch/csrc/distributed/c10d/init.cpp | 1 + torch/csrc/jit/export.cpp | 1 + torch/csrc/jit/graph_executor.cpp | 3 +- torch/csrc/jit/import.cpp | 1 + torch/csrc/jit/python_tracer.cpp | 2 + torch/csrc/jit/script/init.cpp | 2 + torch/lib/c10d/ProcessGroup.cpp | 6 ++- torch/nn/modules/module.py | 1 + torch/optim/optimizer.py | 1 + torch/utils/data/dataloader.py | 1 + 24 files changed, 146 insertions(+), 46 deletions(-) diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 5bef27a29f1a..a53f6d215165 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -39,6 +39,7 @@ namespace detail { // compilation unit (alt is to have another method in hooks, but // let's not if we don't need to!) std::unique_ptr CUDAHooks::initCUDA() const { + C10_LOG_API_USAGE_ONCE("aten.init.cuda"); THCState* thc_state = THCState_alloc(); THCudaInit(thc_state); diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 84fcb466133b..6e5bf981967c 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -48,8 +48,12 @@ TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::Ty data_type_(data_type), device_opt_(device_opt), type_id_(type_id) { - AT_ASSERT(type_id == UndefinedTensorId() || data_type.id() == caffe2::TypeIdentifier::uninitialized() || - device_opt_.has_value()); + if (type_id != UndefinedTensorId()) { + AT_ASSERT(data_type.id() == caffe2::TypeIdentifier::uninitialized() || + device_opt_.has_value()); + // UndefinedTensorImpl is a singleton, so we skip logging it + C10_LOG_API_USAGE_ONCE("tensor.create"); + } // we would also like to check that non-cpu devices have an index, but some Caffe2 operators create // Storages with default devices. strides_.push_back(1); diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 09e4b46fcfee..fc3ad273a516 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -29,10 +29,21 @@ classname(const classname&) = delete; \ classname& operator=(const classname&) = delete -#define CONCAT_IMPL(x, y) x##y -#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y) +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) + +#define C10_MACRO_EXPAND(args) args + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#ifdef __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif -#define MACRO_EXPAND(args) args /// C10_NODISCARD - Warn if a type or return value is discarded. 
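The Macros.h hunk above centralizes C10_CONCATENATE and C10_ANONYMOUS_VARIABLE so that C10_LOG_API_USAGE_ONCE (added to Logging.h further down) can mint a unique static variable name at each expansion site. Below is a standalone sketch of how that expansion behaves; the macro definitions are copied from the hunk, while the main() driver is illustrative only and not part of the patch.

```cpp
// Standalone sketch (not part of the patch) of the anonymous-variable macro.
#include <iostream>

#define C10_CONCATENATE_IMPL(s1, s2) s1##s2
#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2)

#ifdef __COUNTER__
#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__)
#else
#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__)
#endif

int main() {
  // With __COUNTER__ available these expand to two distinct identifiers
  // (e.g. logFlag0 and logFlag1), so both definitions can coexist in one
  // scope -- exactly what a once-per-call-site guard variable needs.
  [[maybe_unused]] int C10_ANONYMOUS_VARIABLE(logFlag) = 1;
  [[maybe_unused]] int C10_ANONYMOUS_VARIABLE(logFlag) = 2;
  std::cout << "two anonymous variables defined" << std::endl;
  return 0;
}
```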
diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 25405769b6e3..33ef083dab9e 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -1,9 +1,10 @@ #include "c10/util/Logging.h" -#include "c10/util/Flags.h" #include "c10/util/Backtrace.h" +#include "c10/util/Flags.h" #include #include +#include #include #include @@ -24,7 +25,9 @@ namespace enforce_detail { namespace { std::function* GetFetchStackTrace() { - static std::function func = []() { return get_backtrace(/*frames_to_skip=*/ 1); }; + static std::function func = []() { + return get_backtrace(/*frames_to_skip=*/1); + }; return &func; }; } // namespace @@ -49,12 +52,43 @@ void ThrowEnforceNotMet( // PyTorch-style error message // (This must be defined here for access to GetFetchStackTrace) Error::Error(SourceLocation source_location, const std::string& msg) - : Error( - msg, - str(" (", - source_location, - ")\n", - (*GetFetchStackTrace())())) {} + : Error(msg, str(" (", source_location, ")\n", (*GetFetchStackTrace())())) { +} + +using APIUsageLoggerType = std::function; + +namespace { +bool IsAPIUsageDebugMode() { + return getenv("PYTORCH_API_USAGE_STDERR"); +} + +void APIUsageDebug(const string& event) { + // use stderr to avoid messing with glog + std::cerr << "PYTORCH_API_USAGE " << event << std::endl; +} + +APIUsageLoggerType* GetAPIUsageLogger() { + static APIUsageLoggerType func = + IsAPIUsageDebugMode() ? &APIUsageDebug : [](const string&) {}; + return &func; +}; +} // namespace + +void SetAPIUsageLogger(std::function logger) { + AT_ASSERT(logger); + *GetAPIUsageLogger() = logger; +} + +void LogAPIUsage(const std::string& event) { + (*GetAPIUsageLogger())(event); +} + +namespace detail { +bool LogAPIUsageFakeReturn(const std::string& event) { + (*GetAPIUsageLogger())(event); + return true; +} +} } // namespace c10 diff --git a/c10/util/Logging.h b/c10/util/Logging.h index 5833f355c87e..ce081e8ebc5b 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -257,6 +257,32 @@ BINARY_COMP_HELPER(LessEquals, <=) #define CAFFE_ENFORCE_GT_WITH_CALLER(x, y, ...) \ CAFFE_ENFORCE_THAT_IMPL_WITH_CALLER( \ Greater((x), (y)), #x " > " #y, __VA_ARGS__) + +/** + * Very lightweight logging for the first time API usage. It's beneficial for + * tracking of individual functionality usage in larger applications. + * + * In order to ensure light-weightness of logging, we utilize static variable + * trick - LogAPIUsage will be invoked only once and further invocations will + * just do an atomic check. + * + * Example: + * // Logs caller info with an arbitrary text event, if there is a usage. + * C10_LOG_API_USAGE_ONCE("my_api"); + */ +#define C10_LOG_API_USAGE_ONCE(...) \ + C10_UNUSED static bool C10_ANONYMOUS_VARIABLE(logFlag) = \ + ::c10::detail::LogAPIUsageFakeReturn(__VA_ARGS__); + +// API usage logging capabilities +C10_API void SetAPIUsageLogger(std::function logger); +C10_API void LogAPIUsage(const std::string& context); + +namespace detail { +// Return value is needed to do the static variable initialization trick +C10_API bool LogAPIUsageFakeReturn(const std::string& context); +} + } // namespace c10 #endif // C10_UTIL_LOGGING_H_ diff --git a/c10/util/Registry.h b/c10/util/Registry.h index 060f80d5fa0d..e9defd160285 100644 --- a/c10/util/Registry.h +++ b/c10/util/Registry.h @@ -185,18 +185,6 @@ class Registerer { } }; -/** - * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. 
- */ -#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 -#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) -#else -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) -#endif - /** * C10_DECLARE_TYPED_REGISTRY is a macro that expands to a function * declaration, as well as creating a convenient typename for its corresponding diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 36e39458ac0f..09bd806850e9 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -23,10 +23,10 @@ #include #include #include +#include +#include #include #include -#include -#include /* * TypeIdentifier is a small type containing an id. @@ -498,15 +498,15 @@ inline std::ostream& operator<<( #define EXPORT_IF_NOT_GCC #endif -#define _CAFFE_KNOWN_TYPE_DEFINE_TYPEMETADATA_INSTANCE(T, Counter) \ - namespace detail { \ - const TypeMetaData MACRO_CONCAT(_typeMetaDataInstance_, Counter) = \ - _makeTypeMetaDataInstance(_typeName(#T)); \ - } \ - template <> \ - EXPORT_IF_NOT_GCC const detail::TypeMetaData* \ - TypeMeta::_typeMetaDataInstance() noexcept { \ - return &MACRO_CONCAT(detail::_typeMetaDataInstance_, Counter); \ +#define _CAFFE_KNOWN_TYPE_DEFINE_TYPEMETADATA_INSTANCE(T, Counter) \ + namespace detail { \ + const TypeMetaData C10_CONCATENATE(_typeMetaDataInstance_, Counter) = \ + _makeTypeMetaDataInstance(_typeName(#T)); \ + } \ + template <> \ + EXPORT_IF_NOT_GCC const detail::TypeMetaData* \ + TypeMeta::_typeMetaDataInstance() noexcept { \ + return &C10_CONCATENATE(detail::_typeMetaDataInstance_, Counter); \ } #define CAFFE_KNOWN_TYPE(T) \ template <> \ @@ -529,20 +529,20 @@ inline std::ostream& operator<<( return TypeIdentifier(PreallocatedId); \ } \ namespace detail { \ - C10_API extern const TypeMetaData MACRO_CONCAT( \ + C10_API extern const TypeMetaData C10_CONCATENATE( \ _typeMetaDataInstance_preallocated_, \ PreallocatedId); \ } #define CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(PreallocatedId, T) \ namespace detail { \ - C10_EXPORT const TypeMetaData MACRO_CONCAT( \ + C10_EXPORT const TypeMetaData C10_CONCATENATE( \ _typeMetaDataInstance_preallocated_, \ PreallocatedId) = _makeTypeMetaDataInstance(_typeName(#T)); \ } \ template <> \ C10_EXPORT const detail::TypeMetaData* \ TypeMeta::_typeMetaDataInstance() noexcept { \ - return &MACRO_CONCAT( \ + return &C10_CONCATENATE( \ detail::_typeMetaDataInstance_preallocated_, PreallocatedId); \ } #else // _MSC_VER @@ -552,19 +552,19 @@ inline std::ostream& operator<<( return TypeIdentifier(PreallocatedId); \ } \ namespace detail { \ - C10_EXPORT extern const TypeMetaData MACRO_CONCAT( \ + C10_EXPORT extern const TypeMetaData C10_CONCATENATE( \ _typeMetaDataInstance_preallocated_, \ PreallocatedId); \ } \ template <> \ inline const detail::TypeMetaData* \ TypeMeta::_typeMetaDataInstance() noexcept { \ - return &MACRO_CONCAT( \ + return &C10_CONCATENATE( \ detail::_typeMetaDataInstance_preallocated_, PreallocatedId); \ } #define CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(PreallocatedId, T) \ namespace detail { \ - const TypeMetaData MACRO_CONCAT( \ + const TypeMetaData C10_CONCATENATE( \ _typeMetaDataInstance_preallocated_, \ PreallocatedId) = _makeTypeMetaDataInstance(_typeName(#T)); \ } diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index a360a0ea64ac..98f6b12258d0 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -201,6 +201,7 @@ static void Caffe2InitializeCuda() { VLOG(1) << "No cuda gpu present. 
Skipping."; return; } + C10_LOG_API_USAGE_ONCE("caffe2.init.cuda"); // Check if the number of GPUs matches the expected compile-time max number // of GPUs. CAFFE_ENFORCE_LE( diff --git a/caffe2/core/init.cc b/caffe2/core/init.cc index 572f39933d17..5e8f746cfec7 100644 --- a/caffe2/core/init.cc +++ b/caffe2/core/init.cc @@ -42,6 +42,7 @@ bool GlobalInitAlreadyRun() { } bool GlobalInit(int* pargc, char*** pargv) { + C10_LOG_API_USAGE_ONCE("caffe2.global_init"); static std::recursive_mutex init_mutex; std::lock_guard guard(init_mutex); internal::State& init_state = internal::GlobalInitState(); diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index 41a314403489..ec679673c1b0 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -37,6 +37,7 @@ NetBase::NetBase( name_(def->name()), net_def_(def) { static GlobalInitIsCalledGuard guard; + C10_LOG_API_USAGE_ONCE("caffe2.net.create"); // Check that node_name is empty for all ops for (const OperatorDef& op : def->op()) { if (op.has_device_option()) { diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 9087d3ac2fa7..fe5c278e3d4f 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -386,6 +386,7 @@ C10_DEFINE_REGISTRY( GradientOpsMeta GetGradientForOp( const OperatorDef& def, const vector& g_output) { + C10_LOG_API_USAGE_ONCE("caffe2.gradient_maker"); std::unique_ptr maker( GradientRegistry()->Create(def.type(), def, g_output)); CAFFE_ENFORCE(maker, diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 2e2353368e9b..70f888ba362e 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -1267,7 +1267,7 @@ C10_DECLARE_REGISTRY( #define REGISTER_CPU_GRADIENT_OPERATOR(...) /* No gradients. */ #else #define REGISTER_CPU_GRADIENT_OPERATOR(...) \ - MACRO_EXPAND(REGISTER_CPU_OPERATOR(__VA_ARGS__)) + C10_MACRO_EXPAND(REGISTER_CPU_OPERATOR(__VA_ARGS__)) #endif C10_DECLARE_REGISTRY( diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 66434ff4ef15..c6f626196e15 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -1819,6 +1819,8 @@ void addGlobalMethods(py::module& m) { PYBIND11_MODULE(caffe2_pybind11_state, m) { m.doc() = "pybind11 stateful interface to Caffe2 workspaces"; + C10_LOG_API_USAGE_ONCE("caffe2.python.import"); + addGlobalMethods(m); addObjectMethods(m); for (const auto& addition : PybindAdditionRegistry()->Keys()) { diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 3675a146b189..1f80f86c9841 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -565,6 +566,16 @@ static void warning_handler( } } +// In Python we can't use the trick of C10_LOG_API_USAGE_ONCE +// Guaranteed to be invoked from Python under GIL, no locking on map needed +static void LogAPIUsageOnceFromPython(const std::string& event) { + static std::unordered_set seen; + if (!seen.count(event)) { + seen.insert(event); + c10::LogAPIUsage(event); + } +} + #ifdef _WIN32 __declspec(dllexport) @@ -573,6 +584,8 @@ PyObject* initModule() { HANDLE_TH_ERRORS at::init_num_threads(); + C10_LOG_API_USAGE_ONCE("torch.python.import"); + #define ASSERT_TRUE(cmd) if (!(cmd)) return nullptr THPUtils_addPyMethodDefs(methods, TorchMethods); @@ -675,7 +688,9 @@ PyObject* initModule() { // setting up TH Errors so that they throw C++ exceptions at::init(); - py::reinterpret_borrow(module).def("_demangle", &c10::demangle); + auto py_module = py::reinterpret_borrow(module); + 
py_module.def("_demangle", &c10::demangle); + py_module.def("_log_api_usage_once", &LogAPIUsageOnceFromPython); // Set ATen warnings to issue Python warnings ::c10::Warning::set_warning_handler(&warning_handler); diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index b8ce586ded42..018ee65057bb 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -36,6 +36,7 @@ template using shared_ptr_class_ = py::class_>; PyObject* c10d_init(PyObject* _unused) { + C10_LOG_API_USAGE_ONCE("c10d.python.import"); auto c10d_module = THPObjectPtr(PyImport_ImportModule("torch.distributed")); if (!c10d_module) { throw python_error(); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 0d64e0399a63..7542edadb3d8 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -550,6 +550,7 @@ ScriptModuleSerializer::ScriptModuleSerializer(std::ostream* ofs) void ScriptModuleSerializer::serialize( const script::Module& module, const script::ExtraFilesMap& extra_files) { + C10_LOG_API_USAGE_ONCE("torch.script.save"); torch::ModelDef model_def; convertModel(module, &model_def, extra_files); std::string output; diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index dd12a8220a56..0fc492410eb8 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -488,6 +488,7 @@ struct GraphExecutorImpl : public GraphExecutorImplBase { " inputs, but got only ", stack.size()); + C10_LOG_API_USAGE_ONCE("torch.graph_executor.run"); logging::getLogger()->addStatValue( logging::runtime_counters::GRAPH_EXECUTOR_INVOCATIONS, 1.0); @@ -579,7 +580,7 @@ struct GraphExecutorImpl : public GraphExecutorImplBase { for (Node* dnode : diff_nodes) { auto diff_graph = std::move(dnode->g(attr::Subgraph)); Gradient gradient = differentiate(diff_graph); - // Run post differentiation optimizations, Autodiff will replace some + // Run post differentiation optimizations, Autodiff will replace some // parts of graph with new graph, these new graphs usually consists of // control flows and miss shape information on nodes, so we run shape // prop and differentiable optimizations to ensure the graph is optimized diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index f6d35b190cd5..9a83f17d74ba 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -95,6 +95,7 @@ void ScriptModuleDeserializer::deserialize( script::ModuleLookup module_lookup, c10::optional device, script::ExtraFilesMap& extra_files) { + C10_LOG_API_USAGE_ONCE("torch.script.load"); torch::ModelDef model_def; at::DataPtr data_ptr; size_t data_size; diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 478262a86cc1..bf53bb014f5b 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -42,6 +42,8 @@ std::shared_ptr createGraphByTracing( const py::function& var_name_lookup_fn, bool force_outplace, const std::shared_ptr& self) { + C10_LOG_API_USAGE_ONCE("torch.tracer"); + auto enter_info = tracer::enter(std::move(trace_inputs), self); auto graph = enter_info.first->graph; diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index a7dad0290a33..e88acae9fa41 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -531,6 +531,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_jit_script_compile", [](const Def& def, ResolutionCallback rcb, FunctionDefaults defaults) { 
+ C10_LOG_API_USAGE_ONCE("torch.script.compile"); CompilationUnit cu; cu.define({def}, {pythonResolver(rcb)}, nullptr); std::shared_ptr defined = cu.get_functions().at(0); @@ -561,6 +562,7 @@ void initJitScriptBindings(PyObject* module) { [](const std::string& qualifiedName, const ClassDef& classDef, ResolutionCallback rcb) { + C10_LOG_API_USAGE_ONCE("torch.script.class"); auto cu = std::make_shared(); auto classType = ClassType::create(c10::QualifiedName(qualifiedName), cu); diff --git a/torch/lib/c10d/ProcessGroup.cpp b/torch/lib/c10d/ProcessGroup.cpp index 353ef887e95a..99b2e2b6a3a1 100644 --- a/torch/lib/c10d/ProcessGroup.cpp +++ b/torch/lib/c10d/ProcessGroup.cpp @@ -1,5 +1,7 @@ #include +#include + namespace c10d { ProcessGroup::Work::~Work() {} @@ -44,7 +46,9 @@ void ProcessGroup::Work::finish(std::exception_ptr exception) { cv_.notify_all(); } -ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) {} +ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) { + C10_LOG_API_USAGE_ONCE("c10d.process_group"); +} ProcessGroup::~ProcessGroup() {} diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index eb82b8363fa0..5047926b2793 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -63,6 +63,7 @@ class Module(object): _version = 1 def __init__(self): + torch._C._log_api_usage_once("python.nn_module") self._backend = thnn_backend self._parameters = OrderedDict() self._buffers = OrderedDict() diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 1d657b141bb8..9420a8e9eb2b 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -30,6 +30,7 @@ class Optimizer(object): """ def __init__(self, params, defaults): + torch._C._log_api_usage_once("python.optimizer") self.defaults = defaults if isinstance(params, torch.Tensor): diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index b0b70f30b4c3..5368b42f1f76 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -142,6 +142,7 @@ class DataLoader(object): batch_sampler=None, num_workers=0, collate_fn=default_collate, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None): + torch._C._log_api_usage_once("python.data_loader") self.dataset = dataset self.batch_size = batch_size self.num_workers = num_workers
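The C10_LOG_API_USAGE_ONCE macro added to Logging.h works by initializing a function-scope static bool from the return value of c10::detail::LogAPIUsageFakeReturn, so the logger runs once per expansion site and every later execution only hits the guard that C++11 thread-safe static initialization already provides (the "atomic check" the comro's doc comment refers to). Here is a minimal standalone sketch of that pattern; it is not the real c10 implementation, and the SKETCH_/sketch:: names are made up for illustration.

```cpp
// Minimal standalone sketch of the once-only trick behind C10_LOG_API_USAGE_ONCE.
#include <iostream>
#include <string>

namespace sketch {
bool LogAPIUsageFakeReturn(const std::string& event) {
  std::cerr << "API_USAGE " << event << std::endl;  // stand-in logger
  return true;  // value is irrelevant; it only feeds the static initializer
}
}  // namespace sketch

// The real macro uses C10_ANONYMOUS_VARIABLE and C10_UNUSED; this sketch
// inlines the same idea with its own helpers.
#define SKETCH_CONCAT_IMPL(a, b) a##b
#define SKETCH_CONCAT(a, b) SKETCH_CONCAT_IMPL(a, b)
#define SKETCH_LOG_API_USAGE_ONCE(event)                           \
  [[maybe_unused]] static bool SKETCH_CONCAT(logFlag, __LINE__) =  \
      sketch::LogAPIUsageFakeReturn(event)

void create_tensor() {
  SKETCH_LOG_API_USAGE_ONCE("tensor.create");
  // ... real work would go here ...
}

int main() {
  create_tensor();
  create_tensor();
  create_tensor();  // "API_USAGE tensor.create" is printed exactly once
  return 0;
}
```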
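On the Python side the static-variable trick is not available, so the patch adds LogAPIUsageOnceFromPython in torch/csrc/Module.cpp, exposes it as torch._C._log_api_usage_once, and calls it from Module.__init__, Optimizer.__init__, and DataLoader.__init__. Unlike the C++ macro, which guards per expansion site, it de-duplicates per event string. A standalone sketch of that shape follows; the real helper relies on the GIL for synchronization, and the mutex here is only an addition to keep the sketch self-contained and thread-safe.

```cpp
// Sketch of per-event de-duplication, mirroring LogAPIUsageOnceFromPython.
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_set>

namespace sketch {
void LogAPIUsage(const std::string& event) {  // stand-in for c10::LogAPIUsage
  std::cerr << "API_USAGE " << event << std::endl;
}

void log_api_usage_once(const std::string& event) {
  static std::unordered_set<std::string> seen;
  static std::mutex guard;
  std::lock_guard<std::mutex> lock(guard);
  if (seen.insert(event).second) {
    LogAPIUsage(event);  // forwarded only the first time this event is seen
  }
}
}  // namespace sketch

int main() {
  sketch::log_api_usage_once("python.nn_module");
  sketch::log_api_usage_once("python.nn_module");  // duplicate, ignored
  sketch::log_api_usage_once("python.optimizer");  // new event, logged
  return 0;
}
```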
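Consumers can pick the events up in two ways: exporting PYTORCH_API_USAGE_STDERR (any value) makes the default logger print "PYTORCH_API_USAGE <event>" to stderr, or an embedding application can install a process-wide callback through c10::SetAPIUsageLogger. The sketch below shows the callback route; it assumes the logger type is std::function<void(const std::string&)>, since the exact template arguments are not visible in this copy of the patch, and the include path simply follows the header the declarations are added to.

```cpp
// Sketch of a downstream consumer installing a custom API-usage logger.
#include <c10/util/Logging.h>

#include <iostream>
#include <string>

int main() {
  // Every C10_LOG_API_USAGE_ONCE site and every
  // torch._C._log_api_usage_once() call from Python funnels into this
  // callback, at most once per site (C++) or per event string (Python).
  c10::SetAPIUsageLogger([](const std::string& event) {
    std::cout << "pytorch api usage: " << event << std::endl;
  });

  // Events can also be emitted directly from C++ application code:
  c10::LogAPIUsage("my_app.startup");
  return 0;
}
```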