mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[c10d] Add hashing as a debug feature for before and after NCCL collective call (#113238)
For now, we use `TORCH_DISTRIBUTED_DEBUG = DETAIL` to turn a debug feature which calculate the hashing for input tensors and output results of c10d collective in NCCL. This is a debugging feature so that we can rule out the bug from c10d level. <img width="840" alt="image" src="https://github.com/pytorch/pytorch/assets/6937752/cdc70b0b-ae3c-4efd-86ff-adc5c5ba505f"> Pull Request resolved: https://github.com/pytorch/pytorch/pull/113238 Approved by: https://github.com/wconstab, https://github.com/fegin
This commit is contained in:
@ -8,6 +8,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <nccl.h>
|
||||
@ -163,6 +164,7 @@
|
||||
|
||||
namespace c10d {
|
||||
|
||||
TORCH_API size_t hashTensors(const std::vector<at::Tensor>& tensors);
|
||||
std::string getNcclVersion();
|
||||
std::string ncclGetErrorWithVersion(ncclResult_t error);
|
||||
bool nccl_use_nonblocking();
|
||||
|
||||
Reference in New Issue
Block a user