Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-29 19:24:55 +08:00)
[PG NCCL] catch cuda lib runtime error - driver shutting down (#74258)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/74258

There is a case where the PG cleanup thread checks CUDA event status after the CUDA runtime library has been unloaded. When that happens, the query leads to a "driver shutting down" error. This issue usually occurs when a CUDA API is called in a global or static object's destructor.

Test Plan: wait for user

Reviewed By: jiayisuse, osalpekar

Differential Revision: D34904896

fbshipit-source-id: 705c0812132dae97ea55fcb22730557880ca35e1
(cherry picked from commit ecb5f14a022319402c509b86209f6205212956b7)
Committed by: PyTorch MergeBot
Parent: d6edb8473e
Commit: ec071a0815
@@ -373,11 +373,20 @@ bool ProcessGroupNCCL::WorkNCCL::startedGPUExecutionInternal() const {
 }
 
 bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const {
-  for (const auto i : c10::irange(devices_.size())) {
-    // Checking the work's corresponding CUDA events' status
-    if (!(*ncclEndEvents_)[i].query()) {
-      return false;
+  try {
+    for (const auto i : c10::irange(devices_.size())) {
+      // Checking the work's corresponding CUDA events' status
+      if (!(*ncclEndEvents_)[i].query()) {
+        return false;
+      }
     }
+  } catch (const std::exception& e) {
+    if (std::string(e.what()).find("driver shutting down") == std::string::npos) {
+      throw;
+    }
+    LOG(INFO) << "[Rank " << rank_
+              << "] Event query failed with exception: "
+              << e.what();
   }
   return true;
 }
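For context, here is a minimal standalone sketch (not part of the commit) of the failure mode the summary describes: a static object's destructor queries a CUDA event during process teardown, after the CUDA runtime may already have been unloaded. It uses the raw CUDA runtime API rather than PyTorch's event wrapper, and the EventGuard type and all names are hypothetical. The destructor tolerates cudaErrorCudartUnloading, whose error string is exactly "driver shutting down", the same way the catch block above tolerates the wrapped exception.

// Hypothetical sketch, not from the commit: a CUDA event queried from a
// static object's destructor, i.e. during static teardown when the CUDA
// runtime may already be unloaded.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct EventGuard {
  cudaEvent_t ev{};
  EventGuard() { cudaEventCreate(&ev); }
  ~EventGuard() {
    cudaError_t err = cudaEventQuery(ev);
    if (err != cudaSuccess && err != cudaErrorNotReady) {
      // cudaErrorCudartUnloading stringifies to "driver shutting down";
      // treat it as benign during teardown, like the catch block above.
      if (std::strstr(cudaGetErrorString(err), "driver shutting down")) {
        std::fprintf(stderr, "Event query failed during teardown: %s\n",
                     cudaGetErrorString(err));
        return;
      }
      std::fprintf(stderr, "Unexpected CUDA error: %s\n",
                   cudaGetErrorString(err));
      std::abort();
    }
    cudaEventDestroy(ev);
  }
};

// Destroyed after main() returns -- the same window in which the PG
// cleanup thread may still be polling events.
static EventGuard g_guard;

int main() {
  cudaEventRecord(g_guard.ev, /*stream=*/0);
  cudaDeviceSynchronize();
  return 0;
}

Matching on the error string mirrors the commit's approach: by the time the failure reaches finishedGPUExecutionInternal() it has presumably been surfaced as a C++ exception whose message embeds the driver string, and any other error is still rethrown so real failures are not masked.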