Summary: Original: D81957844 and D81957923. Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well. #buildall
Test Plan: sandcastle and oss ci
Rollback Plan:
Reviewed By: H-Huang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594
Approved by: https://github.com/H-Huang, https://github.com/dcci
# mypy: disable-error-code="assignment"
# noqa: F401
"""
Centralized module for importing and re-exporting torch._C._distributed_c10d components.

IMPORTANT PATTERN:
Never access torch._C._distributed_c10d directly in code. Always import from and use
torch.distributed._distributed_c10d, which is guaranteed to have all functions available.

Example:
    # WRONG: torch._C._distributed_c10d._set_global_rank(rank)
    # RIGHT:
    from torch.distributed._distributed_c10d import _set_global_rank
    _set_global_rank(rank)
"""

from typing import TYPE_CHECKING


# Import all core distributed components from the C extension
# NB: This list has to be spelled out because the _C module doesn't have __all__
from torch._C._distributed_c10d import (
    _allow_inflight_collective_as_graph_input,
    _broadcast_coalesced,
    _compute_bucket_assignment_by_size,
    _ControlCollectives,
    _current_process_group,
    _DEFAULT_FIRST_BUCKET_BYTES,
    _DEFAULT_PG_TIMEOUT,
    _DistributedBackendOptions,
    _make_nccl_premul_sum,
    _register_builtin_comm_hook,
    _register_comm_hook,
    _register_process_group,
    _register_work,
    _resolve_process_group,
    _set_allow_inflight_collective_as_graph_input,
    _set_global_rank,
    _set_process_group,
    _StoreCollectives,
    _test_python_store,
    _unregister_all_process_groups,
    _unregister_process_group,
    _verify_params_across_processes,
    _WorkerServer,
    AllgatherOptions,
    AllreduceCoalescedOptions,
    AllreduceOptions,
    AllToAllOptions,
    Backend,
    BarrierOptions,
    BroadcastOptions,
    BuiltinCommHookType,
    DebugLevel,
    FakeProcessGroup,
    FakeWork,
    FileStore,
    GatherOptions,
    get_debug_level,
    GradBucket,
    Logger,
    PrefixStore,
    ProcessGroup,
    ReduceOp,
    ReduceOptions,
    Reducer,
    ReduceScatterOptions,
    ScatterOptions,
    set_debug_level,
    set_debug_level_from_env,
    Store,
    TCPStore,
    Work,
)


# Backend-specific components that may not be available
_MPI_AVAILABLE = False
_NCCL_AVAILABLE = False
_GLOO_AVAILABLE = False
_UCC_AVAILABLE = False
_XCCL_AVAILABLE = False
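
# Illustrative sketch (hypothetical caller code, not executed by this module):
# downstream code is expected to guard backend-specific usage on these flags
# rather than assuming the backend classes are the real implementations, e.g.
#
#     from torch.distributed._distributed_c10d import _NCCL_AVAILABLE, ProcessGroupNCCL
#
#     if _NCCL_AVAILABLE:
#         ...  # ProcessGroupNCCL refers to the real backend here, not a stub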

# HashStore
try:
    from torch._C._distributed_c10d import HashStore
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import HashStore

# NVSHMEM/SymmetricMemory components

# There are multiple backends for SymmetricMemory; as a result,
# _SymmetricMemory should not be imported together with NVSHMEM-related modules.
try:
    from torch._C._distributed_c10d import _SymmetricMemory
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import _SymmetricMemory

try:
    from torch._C._distributed_c10d import (
        _is_nvshmem_available,
        _nvshmemx_cumodule_init,
    )
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import (
            _is_nvshmem_available,
            _nvshmemx_cumodule_init,
        )
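
# Illustrative sketch (hypothetical caller code, not executed by this module):
# NVSHMEM support can be probed at runtime before taking NVSHMEM-specific paths;
# on builds without NVSHMEM the stub is assumed to report it as unavailable.
#
#     from torch.distributed._distributed_c10d import _is_nvshmem_available
#
#     if _is_nvshmem_available():
#         ...  # NVSHMEM-backed SymmetricMemory paths may be used here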

# MPI backend
try:
    from torch._C._distributed_c10d import ProcessGroupMPI

    _MPI_AVAILABLE = True
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import ProcessGroupMPI

# NCCL backend
try:
    from torch._C._distributed_c10d import (
        _DEFAULT_PG_NCCL_TIMEOUT,
        _dump_nccl_trace,
        _dump_nccl_trace_json,
        _hash_tensors,
        ProcessGroupNCCL,
    )

    _NCCL_AVAILABLE = True
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import (
            _DEFAULT_PG_NCCL_TIMEOUT,
            _dump_nccl_trace,
            _dump_nccl_trace_json,
            _hash_tensors,
            ProcessGroupNCCL,
        )

# Gloo backend
try:
    from torch._C._distributed_c10d import _ProcessGroupWrapper, ProcessGroupGloo

    _GLOO_AVAILABLE = True
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import _ProcessGroupWrapper, ProcessGroupGloo

# UCC backend
try:
    from torch._C._distributed_c10d import ProcessGroupUCC

    _UCC_AVAILABLE = True
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import ProcessGroupUCC

# XCCL backend
try:
    from torch._C._distributed_c10d import ProcessGroupXCCL

    _XCCL_AVAILABLE = True
except ImportError:
    if not TYPE_CHECKING:
        from torch.distributed._C_stubs import ProcessGroupXCCL


# Provide backwards compatibility by making all symbols available at module level
__all__ = [
    # Basic components
    "_broadcast_coalesced",
    "_compute_bucket_assignment_by_size",
    "_ControlCollectives",
    "_DEFAULT_FIRST_BUCKET_BYTES",
    "_DEFAULT_PG_TIMEOUT",
    "_DEFAULT_PG_NCCL_TIMEOUT",
    "_make_nccl_premul_sum",
    "_register_builtin_comm_hook",
    "_register_comm_hook",
    "_StoreCollectives",
    "_test_python_store",
    "_verify_params_across_processes",
    "_allow_inflight_collective_as_graph_input",
    "_register_work",
    "_set_allow_inflight_collective_as_graph_input",
    "_is_nvshmem_available",
    "_nvshmemx_cumodule_init",
    "_SymmetricMemory",
    "_hash_tensors",
    "_set_global_rank",
    "_dump_nccl_trace",
    "_dump_nccl_trace_json",
    "Backend",
    "BuiltinCommHookType",
    "DebugLevel",
    "FakeProcessGroup",
    "FileStore",
    "get_debug_level",
    "GradBucket",
    "HashStore",
    "Logger",
    "PrefixStore",
    "ProcessGroup",
    "Reducer",
    "ReduceOp",
    "set_debug_level",
    "set_debug_level_from_env",
    "Store",
    "TCPStore",
    "Work",
    "FakeWork",
    # Additional distributed_c10d components
    "_DistributedBackendOptions",
    "_register_process_group",
    "_resolve_process_group",
    "_unregister_all_process_groups",
    "_unregister_process_group",
    "_current_process_group",
    "_set_process_group",
    "_WorkerServer",
    "AllgatherOptions",
    "AllreduceCoalescedOptions",
    "AllreduceOptions",
    "AllToAllOptions",
    "BarrierOptions",
    "BroadcastOptions",
    "GatherOptions",
    "ReduceOptions",
    "ReduceScatterOptions",
    "ScatterOptions",
    # Process group implementations
    "ProcessGroupMPI",
    "ProcessGroupNCCL",
    "ProcessGroupGloo",
    "ProcessGroupUCC",
    "ProcessGroupXCCL",
    "_ProcessGroupWrapper",
    # Availability flags
    "_MPI_AVAILABLE",
    "_NCCL_AVAILABLE",
    "_GLOO_AVAILABLE",
    "_UCC_AVAILABLE",
    "_XCCL_AVAILABLE",
]
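
# Illustrative usage of the pattern documented at the top of this file
# (hypothetical caller code, not executed by this module):
#
#     from torch.distributed._distributed_c10d import AllreduceOptions, ReduceOp
#
#     opts = AllreduceOptions()
#     opts.reduceOp = ReduceOp.SUM  # configure the collective before dispatching it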