Add soft error reporting to capture all inference runtime failures. (#44078)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44078

When PyTorch mobile inference fails and throws an exception, and the caller catches it so the app does not crash, we currently have no way to track the inference failure.

So we are adding native soft error reporting to capture all failures that occur during module loading and running, covering both crashing and non-crashing failures. Since c10::Error has good error message and stack handling (D21202891 (a058e938f9)), we use it for error handling and message printout.
ghstack-source-id: 111307080
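
The pattern applied at each entry point is sketched below. This is a condensed illustration of the diffs that follow: `runGuarded` and `reportSoftError` are stand-in names for the observer callbacks (`onFailLoadModel` / `onFailRunMethod`), and the real catch-all handler shown in the diffs also covers exceptions that do not derive from `std::exception`.

```
#include <c10/util/Exception.h>
#include <functional>

// Sketch only: `reportSoftError` stands in for the MobileModuleObserver
// callbacks used in the actual change.
void runGuarded(
    const std::function<void()>& body,
    const std::function<void(const char*)>& reportSoftError) {
  try {
    body(); // load the module or run the method
  } catch (c10::Error& error) {
    // c10::Error already carries the message plus stack/context information,
    // so its what() is forwarded to soft error reporting unchanged.
    reportSoftError(error.what());
    TORCH_RETHROW(error); // add context and propagate to the caller
  } catch (const std::exception& e) {
    // Other exceptions are converted into a c10::Error (TORCH_CHECK throws
    // one) so they are reported and rethrown the same way.
    try {
      TORCH_CHECK(false, e.what());
    } catch (c10::Error& error) {
      reportSoftError(error.what());
      TORCH_RETHROW(error);
    }
  }
}
```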

Test Plan:
Verified that a soft error report is sent through module.cpp when an operator is missing, and that a logview mid is generated with the stack trace: https://www.internalfb.com/intern/logview/details/facebook_android_softerrors/5dd347d1398c1a9a73c804b20f7c2179/?selected-logview-tab=latest.

The error message with context is logged below:

```
soft_error.cpp		[PyTorchMobileInference] : Error occured during model running entry point: Could not run 'aten::embedding' with arguments from the 'CPU' backend. 'aten::embedding' is only available for these backends: [BackendSelect, Named, Autograd, Autocast, Batched, VmapMode].

BackendSelect: fallthrough registered at xplat/caffe2/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Named: registered at xplat/caffe2/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Autograd: fallthrough registered at xplat/caffe2/aten/src/ATen/core/VariableFallbackKernel.cpp:31 [backend fallback]
Autocast: fallthrough registered at xplat/caffe2/aten/src/ATen/autocast_mode.cpp:253 [backend fallback]
Batched: registered at xplat/caffe2/aten/src/ATen/BatchingRegistrations.cpp:317 [backend fallback]
VmapMode: fallthrough registered at xplat/caffe2/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]

Exception raised from reportError at xplat/caffe2/aten/src/ATen/core/dispatch/OperatorEntry.cpp:261 (m
```

Reviewed By: iseeyuan

Differential Revision: D23428636

fbshipit-source-id: 82d5d9c054300dff18d144f264389402d0b55a8a
Author: Xingying Cheng
Date: 2020-09-03 10:52:04 -07:00
Committed by: Facebook GitHub Bot
Parent: 5973b44d9e
Commit: c59e11bfbb
4 changed files with 59 additions and 24 deletions

@@ -400,17 +400,29 @@ mobile::Module _load_for_mobile(
       observer->onExitLoadModel(result.metadata());
     }
     return result;
-  } catch (const std::exception& ex) {
+  } catch (c10::Error& error) {
     if (observer) {
-      observer->onFailLoadModel(
-          "Error occured during loading model: " + (std::string)ex.what());
+      observer->onFailLoadModel(error.what());
     }
-    TORCH_CHECK(false, ex.what());
+    TORCH_RETHROW(error);
   } catch (...) {
-    if (observer) {
-      observer->onFailLoadModel("unknown exception");
+    auto currentException = std::current_exception();
+    try {
+      if (!currentException) {
+        TORCH_CHECK(false, "Unknown exception");
+      } else {
+        try {
+          std::rethrow_exception(currentException);
+        } catch (const std::exception& e) {
+          TORCH_CHECK(false, e.what());
+        }
+      }
+    } catch (c10::Error& error) {
+      if (observer) {
+        observer->onFailLoadModel(error.what());
+      }
+      TORCH_RETHROW(error);
     }
-    TORCH_CHECK(false, "unknown exception");
   }
 }

@@ -184,17 +184,29 @@ mobile::Module _load_data(
       observer->onExitLoadModel(result.metadata());
     }
     return result;
-  } catch (const std::exception& ex) {
+  } catch (c10::Error& error) {
     if (observer) {
-      observer->onFailLoadModel(
-          "Error occured during loading model: " + (std::string)ex.what());
+      observer->onFailLoadModel(error.what());
     }
-    TORCH_CHECK(false, ex.what());
+    TORCH_RETHROW(error);
   } catch (...) {
-    if (observer) {
-      observer->onFailLoadModel("unknown exception");
+    auto currentException = std::current_exception();
+    try {
+      if (!currentException) {
+        TORCH_CHECK(false, "Unknown exception");
+      } else {
+        try {
+          std::rethrow_exception(currentException);
+        } catch (const std::exception& e) {
+          TORCH_CHECK(false, e.what());
+        }
+      }
+    } catch (c10::Error& error) {
+      if (observer) {
+        observer->onFailLoadModel(error.what());
+      }
+      TORCH_RETHROW(error);
     }
-    TORCH_CHECK(false, "unknown exception");
   }
 }

@@ -65,18 +65,29 @@ c10::IValue Module::run_method(const std::string& method_name, Stack stack) {
       observer->onExitRunMethod();
     }
     return result;
-  } catch (const std::exception& ex) {
+  } catch (c10::Error& error) {
     if (observer) {
-      observer->onFailRunMethod(
-          "Error occured during model running entry point: " +
-          (std::string)ex.what());
+      observer->onFailRunMethod(error.what());
     }
-    TORCH_CHECK(false, ex.what());
+    TORCH_RETHROW(error);
   } catch (...) {
-    if (observer) {
-      observer->onFailRunMethod("unknown exception");
+    auto currentException = std::current_exception();
+    try {
+      if (!currentException) {
+        TORCH_CHECK(false, "Unknown exception");
+      } else {
+        try {
+          std::rethrow_exception(currentException);
+        } catch (const std::exception& e) {
+          TORCH_CHECK(false, e.what());
+        }
+      }
+    } catch (c10::Error& error) {
+      if (observer) {
+        observer->onFailRunMethod(error.what());
+      }
+      TORCH_RETHROW(error);
     }
-    TORCH_CHECK(false, "unknown exception");
   }
 }
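
For the caller-side behaviour described in the summary, a hedged sketch is below: the app can keep catching the rethrown error without crashing, and by the time the exception reaches its handler the observer callbacks above have already reported the failure. The model path, input handling, helper name, and include paths are illustrative assumptions.

```
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>

#include <string>
#include <utility>
#include <vector>

// Hypothetical app-side wrapper around mobile inference.
bool tryRunModel(const std::string& modelPath, std::vector<c10::IValue> inputs) {
  try {
    torch::jit::mobile::Module module = torch::jit::_load_for_mobile(modelPath);
    c10::IValue output = module.run_method("forward", std::move(inputs));
    (void)output; // a real app would consume the result here
    return true;
  } catch (const c10::Error& e) {
    // The app swallows the error so inference failures do not crash it.
    // Previously this also hid the failure; with this change, _load_for_mobile
    // and run_method have already invoked onFailLoadModel / onFailRunMethod
    // with e.what() before the exception reached this handler.
    return false;
  }
}
```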

@@ -73,11 +73,11 @@ class MobileModuleObserver {
       const std::string&) {}
   virtual void onExitRunMethod() {}
   virtual void onCancelRunMethod(const std::string&) {}
-  virtual void onFailRunMethod(const std::string&) {}
+  virtual void onFailRunMethod(const char*) {}
   virtual void onEnterLoadModel() {}
   virtual void onExitLoadModel(
       const std::unordered_map<std::string, std::string>&) {}
-  virtual void onFailLoadModel(const std::string&) {}
+  virtual void onFailLoadModel(const char*) {}
 };
 
 class MobileObserverConfig {
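
As a usage sketch for the interface change above, a hypothetical observer that forwards the new `const char*` payloads (which come from `c10::Error::what()`) to a soft error backend might look like the following. `logSoftError` and the category strings are stand-ins for a platform-specific reporting hook, and registration through `MobileObserverConfig` is assumed rather than shown.

```
#include <torch/csrc/jit/mobile/observer.h>

#include <string>

// Stand-in for the platform-specific soft error reporting hook.
void logSoftError(const std::string& category, const char* message);

class SoftErrorModuleObserver : public torch::MobileModuleObserver {
 public:
  void onFailLoadModel(const char* what) override {
    // `what` is c10::Error::what(): the message plus its context/backtrace.
    logSoftError("PyTorchMobileInference.load", what);
  }
  void onFailRunMethod(const char* what) override {
    logSoftError("PyTorchMobileInference.run", what);
  }
};
// An instance would be installed through MobileObserverConfig so the mobile
// runtime can invoke its callbacks at load/run time.
```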