TCPStore: fix remote address (#131773) (#131913)

Summary:
This fixes corrupted remote address log output caused by dangling pointers to addrinfo_storage inside the stored addrinfo.

This relands the change, which was reverted because of an internal fmt::format issue.

Original Pull Request: https://github.com/pytorch/pytorch/pull/131773
Approved by: https://github.com/kurman

Test Plan:
Enable debug logs and verify that the logged remote addresses are correct.

```
TORCH_CPP_LOG_LEVEL=INFO TORCH_DISABLE_SHARE_RDZV_TCP_STORE=1 TORCH_DISTRIBUTED_DEBUG=DETAIL LOGLEVEL=INFO python test/distributed/test_store.py -v
buck2 test @//mode/dev-nosan //caffe2/test/distributed:store
```

Differential Revision: D60296583

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131913
Approved by: https://github.com/kurman, https://github.com/rsdcastro, https://github.com/Skylion007
This commit is contained in:
Tristan Rice
2024-07-30 17:27:33 +00:00
committed by PyTorch MergeBot
parent 3864a2d834
commit 9027db1ab8
5 changed files with 57 additions and 10 deletions

View File

@ -141,10 +141,9 @@ class SocketImpl {
static constexpr Handle invalid_socket = -1;
#endif
explicit SocketImpl(
Handle hnd,
std::optional<::addrinfo> remote = std::nullopt) noexcept
: hnd_{hnd}, remote_(remote) {}
explicit SocketImpl(Handle hnd) noexcept : hnd_{hnd} {}
explicit SocketImpl(Handle hnd, const ::addrinfo& remote);
SocketImpl(const SocketImpl& other) = delete;
@ -182,7 +181,7 @@ class SocketImpl {
return hnd_;
}
const std::optional<::addrinfo>& remote() const noexcept {
const std::optional<std::string>& remote() const noexcept {
return remote_;
}
@ -192,7 +191,7 @@ class SocketImpl {
bool setSocketFlag(int level, int optname, bool value) noexcept;
Handle hnd_;
const std::optional<::addrinfo> remote_;
const std::optional<std::string> remote_;
};
} // namespace c10d::detail
@ -278,7 +277,7 @@ struct formatter<c10d::detail::SocketImpl> {
addr.ai_addrlen = addr_len;
auto remote = socket.remote();
std::string remoteStr = remote ? fmt::format("{}", *remote) : "none";
std::string remoteStr = remote ? *remote : "none";
return fmt::format_to(
ctx.out(),
@ -293,6 +292,9 @@ struct formatter<c10d::detail::SocketImpl> {
namespace c10d::detail {
// Formats the remote address into a string at construction time, so the stored
// value owns its text and keeps no pointers into the caller's ::addrinfo.
SocketImpl::SocketImpl(Handle hnd, const ::addrinfo& remote)
: hnd_{hnd}, remote_{fmt::format("{}", remote)} {}
SocketImpl::~SocketImpl() {
#ifdef _WIN32
::closesocket(hnd_);