Add soft error reporting to capture all inference runtime failures. (#44078)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44078

When PyTorch mobile inference fails and throws an exception, and the caller catches it so the app does not crash, we currently have no way to track the inference failure.

So we are adding native soft error reporting to capture all failures that occur during module loading and running, covering both crashing and non-crashing failures. Since c10::Error has good error message and stack handling (D21202891 (a058e938f9)), we use it for error handling and message printout.
ghstack-source-id: 111307080
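
The pattern applied at each entry point is sketched below. This is a condensed illustration of the diffs that follow: `runGuarded` and `reportSoftError` are stand-in names for the observer callbacks (`onFailLoadModel` / `onFailRunMethod`), and the real catch-all handler shown in the diffs also covers exceptions that do not derive from `std::exception`.

```
#include <c10/util/Exception.h>
#include <functional>

// Sketch only: `reportSoftError` stands in for the MobileModuleObserver
// callbacks used in the actual change.
void runGuarded(
    const std::function<void()>& body,
    const std::function<void(const char*)>& reportSoftError) {
  try {
    body(); // load the module or run the method
  } catch (c10::Error& error) {
    // c10::Error already carries the message plus stack/context information,
    // so its what() is forwarded to soft error reporting unchanged.
    reportSoftError(error.what());
    TORCH_RETHROW(error); // add context and propagate to the caller
  } catch (const std::exception& e) {
    // Other exceptions are converted into a c10::Error (TORCH_CHECK throws
    // one) so they are reported and rethrown the same way.
    try {
      TORCH_CHECK(false, e.what());
    } catch (c10::Error& error) {
      reportSoftError(error.what());
      TORCH_RETHROW(error);
    }
  }
}
```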

Test Plan:
Verified that a soft error report is sent through module.cpp when an operator is missing, and that a logview mid is generated with the stack trace: https://www.internalfb.com/intern/logview/details/facebook_android_softerrors/5dd347d1398c1a9a73c804b20f7c2179/?selected-logview-tab=latest.

The error message with context is logged below:

```
soft_error.cpp		[PyTorchMobileInference] : Error occured during model running entry point: Could not run 'aten::embedding' with arguments from the 'CPU' backend. 'aten::embedding' is only available for these backends: [BackendSelect, Named, Autograd, Autocast, Batched, VmapMode].

BackendSelect: fallthrough registered at xplat/caffe2/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Named: registered at xplat/caffe2/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Autograd: fallthrough registered at xplat/caffe2/aten/src/ATen/core/VariableFallbackKernel.cpp:31 [backend fallback]
Autocast: fallthrough registered at xplat/caffe2/aten/src/ATen/autocast_mode.cpp:253 [backend fallback]
Batched: registered at xplat/caffe2/aten/src/ATen/BatchingRegistrations.cpp:317 [backend fallback]
VmapMode: fallthrough registered at xplat/caffe2/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]

Exception raised from reportError at xplat/caffe2/aten/src/ATen/core/dispatch/OperatorEntry.cpp:261 (m
```

Reviewed By: iseeyuan

Differential Revision: D23428636

fbshipit-source-id: 82d5d9c054300dff18d144f264389402d0b55a8a
Author: Xingying Cheng
Date: 2020-09-03 10:52:04 -07:00
Committed by: Facebook GitHub Bot
Parent: 5973b44d9e
Commit: c59e11bfbb
4 changed files with 59 additions and 24 deletions

@@ -400,17 +400,29 @@ mobile::Module _load_for_mobile(
       observer->onExitLoadModel(result.metadata());
     }
     return result;
-  } catch (const std::exception& ex) {
+  } catch (c10::Error& error) {
     if (observer) {
-      observer->onFailLoadModel(
-          "Error occured during loading model: " + (std::string)ex.what());
+      observer->onFailLoadModel(error.what());
     }
-    TORCH_CHECK(false, ex.what());
+    TORCH_RETHROW(error);
   } catch (...) {
-    if (observer) {
-      observer->onFailLoadModel("unknown exception");
+    auto currentException = std::current_exception();
+    try {
+      if (!currentException) {
+        TORCH_CHECK(false, "Unknown exception");
+      } else {
+        try {
+          std::rethrow_exception(currentException);
+        } catch (const std::exception& e) {
+          TORCH_CHECK(false, e.what());
+        }
+      }
+    } catch (c10::Error& error) {
+      if (observer) {
+        observer->onFailLoadModel(error.what());
+      }
+      TORCH_RETHROW(error);
     }
-    TORCH_CHECK(false, "unknown exception");
   }
 }

@@ -184,17 +184,29 @@ mobile::Module _load_data(
       observer->onExitLoadModel(result.metadata());
     }
     return result;
-  } catch (const std::exception& ex) {
+  } catch (c10::Error& error) {
     if (observer) {
-      observer->onFailLoadModel(
-          "Error occured during loading model: " + (std::string)ex.what());
+      observer->onFailLoadModel(error.what());
     }
-    TORCH_CHECK(false, ex.what());
+    TORCH_RETHROW(error);
   } catch (...) {
-    if (observer) {
-      observer->onFailLoadModel("unknown exception");
+    auto currentException = std::current_exception();
+    try {
+      if (!currentException) {
+        TORCH_CHECK(false, "Unknown exception");
+      } else {
+        try {
+          std::rethrow_exception(currentException);
+        } catch (const std::exception& e) {
+          TORCH_CHECK(false, e.what());
+        }
+      }
+    } catch (c10::Error& error) {
+      if (observer) {
+        observer->onFailLoadModel(error.what());
+      }
+      TORCH_RETHROW(error);
     }
-    TORCH_CHECK(false, "unknown exception");
   }
 }

@@ -65,18 +65,29 @@ c10::IValue Module::run_method(const std::string& method_name, Stack stack) {
       observer->onExitRunMethod();
     }
     return result;
-  } catch (const std::exception& ex) {
+  } catch (c10::Error& error) {
     if (observer) {
-      observer->onFailRunMethod(
-          "Error occured during model running entry point: " +
-          (std::string)ex.what());
+      observer->onFailRunMethod(error.what());
     }
-    TORCH_CHECK(false, ex.what());
+    TORCH_RETHROW(error);
   } catch (...) {
-    if (observer) {
-      observer->onFailRunMethod("unknown exception");
+    auto currentException = std::current_exception();
+    try {
+      if (!currentException) {
+        TORCH_CHECK(false, "Unknown exception");
+      } else {
+        try {
+          std::rethrow_exception(currentException);
+        } catch (const std::exception& e) {
+          TORCH_CHECK(false, e.what());
+        }
+      }
+    } catch (c10::Error& error) {
+      if (observer) {
+        observer->onFailRunMethod(error.what());
+      }
+      TORCH_RETHROW(error);
     }
-    TORCH_CHECK(false, "unknown exception");
   }
 }
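
For the caller-side behaviour described in the summary, a hedged sketch is below: the app can keep catching the rethrown error without crashing, and by the time the exception reaches its handler the observer callbacks above have already reported the failure. The model path, input handling, helper name, and include paths are illustrative assumptions.

```
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>

#include <string>
#include <utility>
#include <vector>

// Hypothetical app-side wrapper around mobile inference.
bool tryRunModel(const std::string& modelPath, std::vector<c10::IValue> inputs) {
  try {
    torch::jit::mobile::Module module = torch::jit::_load_for_mobile(modelPath);
    c10::IValue output = module.run_method("forward", std::move(inputs));
    (void)output; // a real app would consume the result here
    return true;
  } catch (const c10::Error& e) {
    // The app swallows the error so inference failures do not crash it.
    // Previously this also hid the failure; with this change, _load_for_mobile
    // and run_method have already invoked onFailLoadModel / onFailRunMethod
    // with e.what() before the exception reached this handler.
    return false;
  }
}
```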

@@ -73,11 +73,11 @@ class MobileModuleObserver {
       const std::string&) {}
   virtual void onExitRunMethod() {}
   virtual void onCancelRunMethod(const std::string&) {}
-  virtual void onFailRunMethod(const std::string&) {}
+  virtual void onFailRunMethod(const char*) {}
   virtual void onEnterLoadModel() {}
   virtual void onExitLoadModel(
       const std::unordered_map<std::string, std::string>&) {}
-  virtual void onFailLoadModel(const std::string&) {}
+  virtual void onFailLoadModel(const char*) {}
 };
 
 class MobileObserverConfig {
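
As a usage sketch for the interface change above, a hypothetical observer that forwards the new `const char*` payloads (which come from `c10::Error::what()`) to a soft error backend might look like the following. `logSoftError` and the category strings are stand-ins for a platform-specific reporting hook, and registration through `MobileObserverConfig` is assumed rather than shown.

```
#include <torch/csrc/jit/mobile/observer.h>

#include <string>

// Stand-in for the platform-specific soft error reporting hook.
void logSoftError(const std::string& category, const char* message);

class SoftErrorModuleObserver : public torch::MobileModuleObserver {
 public:
  void onFailLoadModel(const char* what) override {
    // `what` is c10::Error::what(): the message plus its context/backtrace.
    logSoftError("PyTorchMobileInference.load", what);
  }
  void onFailRunMethod(const char* what) override {
    logSoftError("PyTorchMobileInference.run", what);
  }
};
// An instance would be installed through MobileObserverConfig so the mobile
// runtime can invoke its callbacks at load/run time.
```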