mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Update GraphTask::owner_ in a single thread for DistEngine. (#58625)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/58625. Several TSAN tests were failing for distributed because `owner_` was not atomic and was being accessed by several threads; for an example, see https://github.com/pytorch/pytorch/blob/master/torch/csrc/distributed/autograd/engine/dist_engine.cpp#L333. To fix this, `owner_` is now set only once, from a single thread, when the GraphTask is created. Test Plan: 1) Validated that the change fixes the failing TSAN test. 2) waitforbuildbot. Reviewed By: albanD. Differential Revision: D28496878. fbshipit-source-id: 473f4f6d859595749a02563a204ba7aa35ea19e3
This commit is contained in:
committed by
Facebook GitHub Bot
parent
d9aa0b53eb
commit
1d885fbd0e
@@ -308,6 +308,10 @@ void DistEngine::computeDependencies(
|
||||
}
|
||||
}
|
||||
|
||||
// Set graph task owner in a single thread since concurrent access to
|
||||
// 'owner_' field is not permitted.
|
||||
graphTask->owner_ = torch::autograd::CPU_DEVICE;
|
||||
|
||||
// Let autograd context take ownership of the GraphTask.
|
||||
autogradContext->setGraphTask(std::move(graphTask));
|
||||
}
|
||||
@@ -330,7 +334,6 @@ void DistEngine::execute_graph_task_until_ready_queue_empty(
|
||||
cpu_ready_queue->push(std::move(node_task), incrementOutstandingTasks);
|
||||
|
||||
torch::autograd::set_device(torch::autograd::CPU_DEVICE);
|
||||
graph_task->owner_ = torch::autograd::CPU_DEVICE;
|
||||
while (!cpu_ready_queue->empty()) {
|
||||
std::shared_ptr<GraphTask> local_graph_task;
|
||||
{
|
||||
|
Reference in New Issue
Block a user