Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)
Revert "Always build USE_DISTRIBUTED. (#160449)"

This reverts commit a0d026688cd69583d5a4e0c6f3e5fda141a7f4a9.
This reverts commit d80297a6846f1f2c36fd4f19e22919f2abe8fcea.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162568
Approved by: https://github.com/huydhn

Committed by PyTorch MergeBot
parent 11acfed3ce
commit dda071587f
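With this revert, a build configured with USE_DISTRIBUTED=0 again reports the distributed package as unavailable, so callers are expected to guard process-group setup on torch.distributed.is_available() (see the torch/distributed/__init__.py hunk further below). A minimal sketch of that guard; the helper name and init arguments are illustrative, not taken from this commit:

import torch

def maybe_init_single_rank_gloo() -> bool:
    # Hypothetical helper, for illustration only.
    if not torch.distributed.is_available():
        # Built with USE_DISTRIBUTED=0: the c10d bindings are absent.
        return False
    import torch.distributed as dist
    dist.init_process_group(
        backend="gloo",
        rank=0,
        world_size=1,
        init_method="tcp://127.0.0.1:29500",  # illustrative endpoint
    )
    return True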
@@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

@@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to

@@ -189,8 +189,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF
@@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",

@@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF

@@ -263,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)

@@ -432,10 +431,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@@ -948,7 +948,6 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),

@@ -2034,7 +2033,6 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),
@@ -540,9 +540,11 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
endif()
endif()
@@ -566,30 +568,32 @@ if(USE_CUDA)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@@ -622,9 +626,11 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@@ -1345,10 +1351,12 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@@ -1453,40 +1461,46 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
# Pass USE_DISTRIBUTED to torch_cpu, as some code in jit/pickler.cpp and
# jit/unpickler.cpp needs to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled when USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled when USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
if(NOT INTERN_BUILD_MOBILE)
@@ -1134,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()
if(USE_TENSORPIPE)
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

@@ -192,11 +192,13 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()
@@ -3333,6 +3333,13 @@ def coverage_post_process(app, exception):
if not isinstance(app.builder, CoverageBuilder):
return
if not torch.distributed.is_available():
raise RuntimeError(
"The coverage tool cannot run with a version "
"of PyTorch that was built with USE_DISTRIBUTED=0 "
"as this module's API changes."
)
# These are all the modules that have "automodule" in an rst file
# These modules are the ones for which coverage is checked
# Here, we make sure that no module is missing from that list

@@ -1,4 +1,4 @@
if(NOT WIN32)
if(USE_DISTRIBUTED AND NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp
@ -1,41 +0,0 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
|
||||
import torch
|
||||
from torch._subclasses.fake_tensor import FakeTensorMode
|
||||
from torch.distributed.tensor import DTensor
|
||||
from torch.distributed.tensor.placement_types import Shard
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.distributed.fake_pg import FakeStore
|
||||
|
||||
|
||||
class TestFakeDTensor(TestCase):
|
||||
def test_fake_dtensor_operations(self):
|
||||
# Use FakeTensorMode to handle CUDA tensors without actual CUDA
|
||||
fake_mode = FakeTensorMode()
|
||||
world_size = 4
|
||||
|
||||
fake_store = FakeStore()
|
||||
torch.distributed.init_process_group(
|
||||
"fake", store=fake_store, rank=0, world_size=world_size
|
||||
)
|
||||
device_mesh = torch.distributed.device_mesh.init_device_mesh(
|
||||
"cuda",
|
||||
(2, world_size // 2),
|
||||
)
|
||||
|
||||
# Create fake CUDA tensor using FakeTensorMode
|
||||
with fake_mode:
|
||||
x = torch.randn(1, 1, device="cuda")
|
||||
x = DTensor.from_local(x, device_mesh, [Shard(0), Shard(1)])
|
||||
|
||||
# Test basic DTensor operations
|
||||
self.assertIsInstance(x, DTensor)
|
||||
|
||||
# Test sum operation
|
||||
r = x.sum(1)
|
||||
self.assertIsInstance(r, DTensor)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
@@ -65,7 +65,10 @@ from torch.export.passes import move_to_device_pass
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.experimental.symbolic_shapes import ShapeEnv
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_FLASH_ATTENTION,
xfailIfDistributedNotSupported,
)
from torch.testing._internal.common_utils import (
find_library_location,
IS_FBCODE,

@@ -15655,6 +15658,7 @@ class GraphModule(torch.nn.Module):
finally:
torch.distributed.destroy_process_group()
@xfailIfDistributedNotSupported
def test_distributed_all_reduce(self):
class Foo(torch.nn.Module):
def __init__(self):

@@ -15672,6 +15676,7 @@ class GraphModule(torch.nn.Module):
inp = (torch.randn(4, 4),)
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
@xfailIfDistributedNotSupported
def test_distributed_all_gather(self):
class Foo(torch.nn.Module):
def forward(self, x):

@@ -15687,6 +15692,7 @@ class GraphModule(torch.nn.Module):
torch.allclose(a, b) for a, b in zip(ep.module()(*inp), m(*inp))
)
@xfailIfDistributedNotSupported
def test_distributed_all_gather_into_tensor(self):
class Foo(torch.nn.Module):
def forward(self, x):

@@ -15700,6 +15706,7 @@ class GraphModule(torch.nn.Module):
inp = (torch.randn(2),)
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
@xfailIfDistributedNotSupported
@testing.expectedFailureCppRuntime
def test_distributed_all_to_all_single(self):
class Foo(torch.nn.Module):

@@ -15717,6 +15724,7 @@ class GraphModule(torch.nn.Module):
)
self.assertEqual(len(nodes), 1)
@xfailIfDistributedNotSupported
@testing.expectedFailureCppRuntime
def test_distributed_reduce_scatter_tensor(self):
class Foo(torch.nn.Module):
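The @xfailIfDistributedNotSupported usages above rely on the helper imported in the first hunk of this file. A rough sketch of what such a decorator can look like, given as an assumption for illustration rather than the actual helper from torch.testing._internal:

import unittest

import torch

def xfail_if_distributed_not_supported(fn):
    # On builds without distributed support, mark the test as an expected
    # failure; otherwise return it unchanged.
    if not torch.distributed.is_available():
        return unittest.expectedFailure(fn)
    return fn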
@@ -7,7 +7,7 @@ import sys
from dataclasses import dataclass
from multiprocessing.context import SpawnProcess
from typing import Any, Optional
from unittest import skipIf, skipUnless
from unittest import skipUnless
from unittest.mock import mock_open, patch
import torch

@@ -22,7 +22,7 @@ from torch.numa.binding import (
AffinityMode,
NumaOptions,
)
from torch.testing._internal.common_utils import IS_MACOS, run_tests, TestCase
from torch.testing._internal.common_utils import run_tests, TestCase

@dataclass(frozen=True)

@@ -680,7 +680,6 @@ class NumaBindingTest(TestCase):
set(range(0, 2)),
)
@skipIf(IS_MACOS, "sched_getaffinity doesn't exist")
def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None:
self._add_mock_hardware(
num_sockets=1,

@@ -88,7 +88,8 @@ def build_pytorch(
) -> None:
my_env = _create_build_env()
if (
not check_negative_env_flag("USE_CUDA")
not check_negative_env_flag("USE_DISTRIBUTED")
and not check_negative_env_flag("USE_CUDA")
and not check_negative_env_flag("USE_NCCL")
and not check_env_flag("USE_SYSTEM_NCCL")
):
@@ -276,30 +276,32 @@ add_custom_command(
WORKING_DIRECTORY
"${TORCH_ROOT}"
)
if(USE_DISTRIBUTED)
if(WIN32)
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
else()
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
endif()
# Disable certain warnings for GCC-9.X
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
endif()
# NCCL is a private dependency of libtorch, but libtorch_python includes
# some private headers of libtorch, which in turn include NCCL. As a hacky
# alternative to making NCCL a public dependency of libtorch, we make it
# a private dependency of libtorch_python as well.
if(USE_NCCL)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
endif()
# Same for MPI.
if(USE_MPI)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
endif()
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
if(WIN32)
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
else()
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
endif()
# Disable certain warnings for GCC-9.X
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
endif()
# NCCL is a private dependency of libtorch, but libtorch_python includes
# some private headers of libtorch, which in turn include NCCL. As a hacky
# alternative to making NCCL a public dependency of libtorch, we make it
# a private dependency of libtorch_python as well.
if(USE_NCCL)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
endif()
# Same for MPI.
if(USE_MPI)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
endif()
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
if(USE_NCCL AND NOT WIN32)
list(APPEND TORCH_PYTHON_SRCS

@@ -367,6 +369,10 @@ if(BUILD_LIBTORCHLESS)
target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL)
endif()
if(USE_DISTRIBUTED)
target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED)
endif()
if(USE_MPI AND USE_C10D_MPI)
target_compile_definitions(torch_python PRIVATE USE_C10D_MPI)
endif()

@@ -851,12 +851,3 @@ class ProcessGroupXCCL(Backend):
def _set_process_group(pg: ProcessGroup) -> None: ...
def _current_process_group() -> ProcessGroup: ...
def _dump_nccl_trace_json(
includeCollectives: Optional[bool] = ...,
onlyActive: Optional[bool] = ...,
) -> bytes: ...
def _dump_nccl_trace(
includeCollectives: Optional[bool] = ...,
includeStackTraces: Optional[bool] = ...,
onlyActive: Optional[bool] = ...,
) -> bytes: ...
@@ -15,7 +15,9 @@
#include <torch/csrc/utils/cpp_stacktraces.h>
#include <torch/csrc/utils/pybind.h>
#if defined(USE_DISTRIBUTED)
#include <torch/csrc/distributed/c10d/exception.h>
#endif
inline void PyErr_SetString(PyObject* type, const std::string& message) {
PyErr_SetString(type, message.c_str());

@@ -120,12 +120,14 @@
#endif
#endif
#ifdef USE_DISTRIBUTED
#ifdef USE_C10D
#include <torch/csrc/distributed/autograd/python_autograd.h>
#include <torch/csrc/distributed/c10d/c10d.h>
#include <torch/csrc/distributed/rpc/rpc.h>
#include <torch/csrc/distributed/rpc/testing/testing.h>
#endif
#endif
#if defined(USE_VALGRIND)
#include <callgrind.h>

@@ -550,7 +552,11 @@ static PyObject* THPModule_getBackcompatKeepdimWarn(
}
static PyObject* THPModule_hasDistributed(PyObject* _unused, PyObject* noargs) {
#ifdef USE_DISTRIBUTED
Py_RETURN_TRUE;
#else
Py_RETURN_FALSE;
#endif
}
static PyObject* THPModule_showConfig(PyObject* module, PyObject* noargs) {

@@ -1987,7 +1993,7 @@ PyObject* initModule() {
#ifdef USE_XPU
THPUtils_addPyMethodDefs(methods, THXPModule_methods());
#endif
#ifdef USE_C10D
#if defined(USE_DISTRIBUTED) && defined(USE_C10D)
THPUtils_addPyMethodDefs(
methods, torch::distributed::c10d::python_functions());
#ifndef _WIN32

@@ -8,7 +8,9 @@
#include <torch/csrc/autograd/python_autograd.h>
#include <torch/csrc/autograd/python_cpp_function.h>
#include <torch/csrc/autograd/python_variable.h>
#ifdef USE_DISTRIBUTED
#include <torch/csrc/distributed/autograd/functions/sendrpc_backward.h>
#endif
#include <torch/csrc/jit/python/python_tracer.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/python_numbers.h>

@@ -148,9 +150,11 @@ void THPAutograd_initFunctions() {
static PyTypeObject CopyBackwardsClass;
addClass<CopyBackwards, NoCtor>(module, CopyBackwardsClass, "CopyBackwards");
#ifdef USE_DISTRIBUTED
static PyTypeObject SendRpcBackwardClass;
addClass<torch::distributed::autograd::SendRpcBackward, NoCtor>(
module, SendRpcBackwardClass, "SendRpcBackward");
#endif
static PyTypeObject CopySlicesClass;
addClass<CopySlices, NoCtor>(module, CopySlicesClass, "CopySlices");
@@ -1,5 +1,6 @@
#include <torch/csrc/distributed/c10d/HashStore.hpp>
#include <unistd.h>
#include <cstdint>
#include <chrono>

@@ -1,5 +1,5 @@
#include <ATen/ThreadLocalState.h>
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/cuda/StreamBlock.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp>

@@ -1,5 +1,7 @@
#ifdef USE_DISTRIBUTED
#include <torch/csrc/distributed/c10d/Functional.hpp>
#endif
#include <torch/csrc/inductor/aoti_torch/c/shim_cpu.h>
#include <torch/csrc/inductor/aoti_torch/utils.h>

@@ -531,6 +533,7 @@ AOTITorchError aoti_torch_cpu__weight_int4pack_mm_cpu_tensor(
});
}
#ifdef USE_DISTRIBUTED
AOTITorchError aoti_torch_cpu__c10d_functional_all_reduce_(
AtenTensorHandle inp,
const char* reduce_op,

@@ -563,3 +566,4 @@ AOTITorchError aoti_torch_cpu__c10d_functional_wait_tensor(
*ret0 = new_tensor_handle(std::move(tmp_result));
});
}
#endif
@@ -13,8 +13,6 @@
#include <torch/csrc/Layout.h>
#include <torch/csrc/QScheme.h>
#include <torch/csrc/Stream.h>
#include <torch/csrc/distributed/rpc/py_rref.h>
#include <torch/csrc/distributed/rpc/rref_impl.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/frontend/schema_matching.h>
#include <torch/csrc/jit/frontend/tracer.h>

@@ -26,6 +24,10 @@
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/python_arg_parser.h>
#include <torch/csrc/utils/six.h>
#ifdef USE_DISTRIBUTED
#include <torch/csrc/distributed/rpc/py_rref.h>
#include <torch/csrc/distributed/rpc/rref_impl.h>
#endif
#include <ATen/core/function_schema.h>
#include <c10/core/Stream.h>

@@ -1225,7 +1225,7 @@ std::shared_ptr<SugaredValue> toSugaredValue(
} else if (obj.ptr() == py::module::import("torch").attr("_check").ptr()) {
return std::make_shared<TorchCheckValue>();
#ifdef USE_RPC
// This is not defined on WINDOWS
// RPC module is only available when build flag "USE_DISTRIBUTED" is on.
} else if (
isRpcAvailable &&
obj.ptr() ==

@@ -1238,6 +1238,7 @@ std::shared_ptr<SugaredValue> toSugaredValue(
return SpecialFormValue::create(prim::rpc_sync);
} else if (
isRpcAvailable &&
// RPC module is only available when build flag "USE_DISTRIBUTED" is on.
obj.ptr() ==
py::module::import("torch.distributed.rpc").attr("remote").ptr()) {
return SpecialFormValue::create(prim::rpc_remote);
@@ -128,8 +128,13 @@ struct InterpreterContinuation {
std::optional<at::ThreadLocalState> tls_state = std::nullopt)
: state(std::move(state_)),
stack(std::move(stack_)),
tls_state_(std::move(tls_state)),
dist_autograd_context_id_(dist_autograd_context_id) {}
tls_state_(std::move(tls_state))
#ifdef USE_DISTRIBUTED
,
dist_autograd_context_id_(dist_autograd_context_id)
#endif
{
}
void operator()();

@@ -137,10 +142,9 @@ struct InterpreterContinuation {
InterpreterState state;
Stack stack;
std::optional<at::ThreadLocalState> tls_state_ = std::nullopt;
#ifndef USE_RPC
[[maybe_unused]]
#endif
#ifdef USE_DISTRIBUTED
int64_t dist_autograd_context_id_;
#endif
};
// what is the tensors type, including state from the current execution context
@@ -79,7 +79,9 @@ class TORCH_API Pickler {
void pushTuple(const IValue& ivalue);
void pushString(const std::string& string);
void pushDevice(const IValue& ivalue);
#ifdef USE_DISTRIBUTED
void pushRRef(const IValue& ivalue);
#endif
// unmemoized version
void pushStringImpl(const std::string& string);
void pushStorageOfTensor(const at::Tensor& tensor);

@@ -140,7 +140,9 @@ class TORCH_API Unpickler {
void rebuildParameter();
void rebuildTensorFromTypeV2();
void rebuildSparseTensor();
#ifdef USE_DISTRIBUTED
void rebuildRRef();
#endif
PickleOpCode readInstruction();
PickleOpCode readOpCode() {
return static_cast<PickleOpCode>(read<uint8_t>());
@@ -30,12 +30,15 @@
#include <torch/csrc/profiler/standalone/execution_trace_observer.h>
#include <torch/csrc/profiler/util.h>
#ifdef USE_DISTRIBUTED
#include <torch/csrc/distributed/c10d/ParamCommsUtils.hpp>
#endif // USE_DISTRIBUTED
using namespace at;
// Collective property attributes
// https://github.com/pytorch/pytorch/issues/124674
#ifdef USE_DISTRIBUTED
constexpr auto kETCommsName = "collective_name";
constexpr auto kETInMsgNelems = "in_msg_nelems";
constexpr auto kETOutMsgNelems = "out_msg_nelems";

@@ -46,6 +49,7 @@ constexpr auto kETGlobalRankStride = "global_rank_stride";
constexpr auto kETGroupSize = "pg_size";
constexpr auto kETProcessGroupName = "pg_name";
constexpr auto kETProcessGroupDesc = "pg_desc";
#endif // USE_DISTRIBUTED
namespace torch::profiler::impl {

@@ -265,6 +269,7 @@ static std::ofstream openOutputFile(const std::string& name) {
return stream;
}
#ifdef USE_DISTRIBUTED
static std::string getAttrJson(
const std::string& name,
const std::string& type,

@@ -277,6 +282,7 @@ static std::string getAttrJson(
type,
value);
}
#endif
static void writeJsonNode(
std::ofstream& out,

@@ -654,6 +660,7 @@ static void handleKernelBackendInfo(
inline std::string getCommsNodeAttrs(const RecordFunction& fn) { // NOLINT
std::vector<std::string> attrs;
#ifdef USE_DISTRIBUTED
// We rely on paramcommsdebug object that is available in thread local info
auto debugInfo = dynamic_cast<ParamCommsDebugInfo*>(
c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO));

@@ -697,6 +704,8 @@ inline std::string getCommsNodeAttrs(const RecordFunction& fn) { // NOLINT
addAttr(kGroupSize, kETGroupSize, "uint64");
#endif // USE_DISTRIBUTED
// XXX consider using as string stream?
return attrs.empty() ? "" : fmt::format(", {}", fmt::join(attrs, ", "));
}
@@ -11,7 +11,9 @@
#ifdef USE_KINETO
#include <libkineto.h>
#endif
#ifdef USE_DISTRIBUTED
#include <torch/csrc/distributed/c10d/ParamCommsUtils.hpp>
#endif // USE_DISTRIBUTED
namespace torch::profiler::impl {

@@ -453,7 +455,7 @@ std::unordered_map<std::string, std::string> saveNcclMeta(
// @lint-ignore CLANGTIDY
const SaveNcclMetaConfig& config) {
std::unordered_map<std::string, std::string> map;
#if !defined(BUILD_LITE_INTERPRETER) && !defined(C10_MOBILE)
#ifdef USE_DISTRIBUTED
auto debugInfo = dynamic_cast<ParamCommsDebugInfo*>(
c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO));

@@ -563,7 +565,7 @@ std::unordered_map<std::string, std::string> saveNcclMeta(
}
}
}
#endif // !defined(BUILD_LITE_INTERPRETER) && !defined(C10_MOBILE)
#endif // USE_DISTRIBUTED
return map;
}

@@ -185,6 +185,7 @@ struct HashCombine {
}
};
#ifdef USE_DISTRIBUTED
constexpr auto kCommsName = "Collective name";
constexpr auto kDtype = "dtype";
constexpr auto kInMsgNelems = "In msg nelems";

@@ -202,5 +203,6 @@ constexpr auto kP2pSrc = "Src Rank";
constexpr auto kP2pDst = "Dst Rank";
constexpr auto kInTensorsStart = "Input Tensors start";
constexpr auto kOutTensorsStart = "Output Tensors start";
#endif // USE_DISTRIBUTED
} // namespace torch::profiler::impl

@@ -1,150 +0,0 @@
# mypy: allow-untyped-defs
"""
Python stubs for backend-specific distributed components.
Since _C._distributed_c10d always exists now, this module only provides
stubs for backend-specific functionality that may not be available in all builds
(e.g., NCCL, UCC, MPI, Gloo, etc.).
"""
from __future__ import annotations
from typing import Optional, TYPE_CHECKING
from torch._C._distributed_c10d import Store

if TYPE_CHECKING:
from datetime import timedelta
import torch

# Store classes
class HashStore(Store):
"""Stub HashStore for builds without this functionality."""
def __init__(self, *args, **kwargs):
self._data = {}
def set(self, key: str, value: str):
self._data[key] = value
def get(self, key: str) -> bytes:
return self._data.get(key, "").encode()

# Backend-specific process group stubs
class ProcessGroupMPI:
"""Stub ProcessGroupMPI for non-MPI builds."""
def __init__(self, *args, **kwargs):
pass

class ProcessGroupNCCL:
"""Stub ProcessGroupNCCL for non-NCCL builds."""
def __init__(self, *args, **kwargs):
pass

class ProcessGroupGloo:
"""Stub ProcessGroupGloo for non-Gloo builds."""
def __init__(self, *args, **kwargs):
pass

class ProcessGroupUCC:
"""Stub ProcessGroupUCC for non-UCC builds."""
def __init__(self, *args, **kwargs):
pass

class ProcessGroupXCCL:
"""Stub ProcessGroupXCCL for non-XCCL builds."""
def __init__(self, *args, **kwargs):
pass

class _ProcessGroupWrapper:
"""Stub _ProcessGroupWrapper for non-Gloo builds."""
def __init__(self, process_group, *args, **kwargs):
self._process_group = process_group
def __getattr__(self, name):
return getattr(self._process_group, name)

# NCCL-specific function stubs
_DEFAULT_PG_NCCL_TIMEOUT: Optional[timedelta] = None

def _hash_tensors(tensors):
"""Stub function to hash tensors - returns dummy hash."""
return 0

def _dump_nccl_trace_json(
includeCollectives: Optional[bool] = None, onlyActive: Optional[bool] = None
) -> bytes:
"""Stub function that returns empty JSON trace."""
return b"{}"

def _dump_nccl_trace(
includeCollectives: Optional[bool] = None,
includeStackTraces: Optional[bool] = None,
onlyActive: Optional[bool] = None,
) -> bytes:
"""Stub function that returns empty pickle trace."""
return b""

# NVSHMEM/SymmetricMemory stubs
def _is_nvshmem_available() -> bool:
"""Stub function that returns False indicating NVSHMEM is not available."""
return False

def _nvshmemx_cumodule_init(module: int) -> None:
"""Stub function for NVSHMEM CU module initialization."""

class _SymmetricMemory:
"""Stub _SymmetricMemory class for builds without this functionality."""
def __init__(self, *args, **kwargs):
pass
@classmethod
def empty_strided_p2p(cls, size, stride, dtype, device, group_name=None):
"""Stub that returns a regular tensor."""
return torch.empty(size, dtype=dtype, device=device)
@classmethod
def rendezvous(cls, tensor, group_name=None):
"""Stub that returns None."""
return None
@classmethod
def set_group_info(cls, *args, **kwargs):
"""Stub that does nothing."""
@classmethod
def set_backend(cls, name):
"""Stub that does nothing."""
@classmethod
def get_backend(cls, device):
"""Stub that returns None."""
return None
@classmethod
def has_multicast_support(cls, device_type, device_index):
"""Stub that returns False."""
return False
@@ -14,10 +14,16 @@ log = logging.getLogger(__name__)
def is_available() -> bool:
"""
Always returns ``True``. Note that even if distributed is available,
there may not necessarily be any usable backends.
Return ``True`` if the distributed package is available.
Otherwise,
``torch.distributed`` does not expose any other APIs. Currently,
``torch.distributed`` is available on Linux, MacOS and Windows. Set
``USE_DISTRIBUTED=1`` to enable it when building PyTorch from source.
Currently, the default value is ``USE_DISTRIBUTED=1`` for Linux and Windows,
``USE_DISTRIBUTED=0`` for MacOS.
"""
return True
return hasattr(torch._C, "_c10d_init")

if is_available() and not torch._C._c10d_init():
@@ -30,124 +36,132 @@ DistNetworkError = torch._C._DistNetworkError
DistStoreError = torch._C._DistStoreError
QueueEmptyError = torch._C._DistQueueEmptyError
from torch.distributed._distributed_c10d import (
_broadcast_coalesced,
_compute_bucket_assignment_by_size,
_ControlCollectives,
_DEFAULT_FIRST_BUCKET_BYTES,
_make_nccl_premul_sum,
_register_builtin_comm_hook,
_register_comm_hook,
_StoreCollectives,
_test_python_store,
_verify_params_across_processes,
Backend as _Backend,
BuiltinCommHookType,
DebugLevel,
FileStore,
get_debug_level,
GradBucket,
Logger,
PrefixStore,
ProcessGroup as ProcessGroup,
Reducer,
set_debug_level,
set_debug_level_from_env,
Store,
TCPStore,
Work as _Work,
)
if is_available():
from torch._C._distributed_c10d import (
_broadcast_coalesced,
_compute_bucket_assignment_by_size,
_ControlCollectives,
_DEFAULT_FIRST_BUCKET_BYTES,
_make_nccl_premul_sum,
_register_builtin_comm_hook,
_register_comm_hook,
_StoreCollectives,
_test_python_store,
_verify_params_across_processes,
Backend as _Backend,
BuiltinCommHookType,
DebugLevel,
FileStore,
get_debug_level,
GradBucket,
Logger,
PrefixStore,
ProcessGroup as ProcessGroup,
Reducer,
set_debug_level,
set_debug_level_from_env,
Store,
TCPStore,
Work as _Work,
)
class _DistributedPdb(pdb.Pdb):
"""
Supports using PDB from inside a multiprocessing child process.
class _DistributedPdb(pdb.Pdb):
"""
Supports using PDB from inside a multiprocessing child process.
Usage:
_DistributedPdb().set_trace()
"""
Usage:
_DistributedPdb().set_trace()
"""
def interaction(self, *args, **kwargs):
_stdin = sys.stdin
try:
sys.stdin = open("/dev/stdin")
pdb.Pdb.interaction(self, *args, **kwargs)
finally:
sys.stdin = _stdin
def interaction(self, *args, **kwargs):
_stdin = sys.stdin
try:
sys.stdin = open("/dev/stdin")
pdb.Pdb.interaction(self, *args, **kwargs)
finally:
sys.stdin = _stdin
_breakpoint_cache: dict[int, typing.Any] = {}
def breakpoint(rank: int = 0, skip: int = 0, timeout_s=3600):
"""
Set a breakpoint, but only on a single rank. All other ranks will wait for you to be
done with the breakpoint before continuing.
_breakpoint_cache: dict[int, typing.Any] = {}
Args:
rank (int): Which rank to break on. Default: ``0``
skip (int): Skip the first ``skip`` calls to this breakpoint. Default: ``0``.
"""
if skip > 0:
key = hash(str(traceback.format_exc()))
counter = _breakpoint_cache.get(key, 0) + 1
_breakpoint_cache[key] = counter
if counter <= skip:
log.warning("Skip the breakpoint, counter=%d", counter)
return
# avoid having the default timeout (if short) interrupt your debug session
if timeout_s is not None:
for group in torch.distributed.distributed_c10d._pg_map:
torch.distributed.distributed_c10d._set_pg_timeout(
timedelta(seconds=timeout_s), group
)
def breakpoint(rank: int = 0, skip: int = 0, timeout_s=3600):
"""
Set a breakpoint, but only on a single rank. All other ranks will wait for you to be
done with the breakpoint before continuing.
Args:
rank (int): Which rank to break on. Default: ``0``
skip (int): Skip the first ``skip`` calls to this breakpoint. Default: ``0``.
"""
if skip > 0:
key = hash(str(traceback.format_exc()))
counter = _breakpoint_cache.get(key, 0) + 1
_breakpoint_cache[key] = counter
if counter <= skip:
log.warning("Skip the breakpoint, counter=%d", counter)
return
# avoid having the default timeout (if short) interrupt your debug session
if timeout_s is not None:
for group in torch.distributed.distributed_c10d._pg_map:
torch.distributed.distributed_c10d._set_pg_timeout(
timedelta(seconds=timeout_s), group
if get_rank() == rank:
pdb = _DistributedPdb()
pdb.message(
"\n!!! ATTENTION !!!\n\n"
f"Type 'up' to get to the frame that called dist.breakpoint(rank={rank})\n"
)
pdb.set_trace()
# If Meta/Python keys are in the TLS, we want to make sure that we ignore them
# and hit the (default) CPU/CUDA implementation of barrier.
meta_in_tls = torch._C._meta_in_tls_dispatch_include()
guard = torch._C._DisableTorchDispatch()  # type: ignore[attr-defined]
torch._C._set_meta_in_tls_dispatch_include(False)
try:
barrier()
finally:
torch._C._set_meta_in_tls_dispatch_include(meta_in_tls)
del guard
if get_rank() == rank:
pdb = _DistributedPdb()
pdb.message(
"\n!!! ATTENTION !!!\n\n"
f"Type 'up' to get to the frame that called dist.breakpoint(rank={rank})\n"
)
pdb.set_trace()
# If Meta/Python keys are in the TLS, we want to make sure that we ignore them
# and hit the (default) CPU/CUDA implementation of barrier.
meta_in_tls = torch._C._meta_in_tls_dispatch_include()
guard = torch._C._DisableTorchDispatch()  # type: ignore[attr-defined]
torch._C._set_meta_in_tls_dispatch_include(False)
try:
barrier()
finally:
torch._C._set_meta_in_tls_dispatch_include(meta_in_tls)
del guard
if sys.platform != "win32":
from torch._C._distributed_c10d import HashStore
from .device_mesh import DeviceMesh, init_device_mesh
if sys.platform != "win32":
from torch.distributed._distributed_c10d import HashStore
# Variables prefixed with underscore are not auto imported
# See the comment in `distributed_c10d.py` above `_backend` on why we expose
# this.
from .distributed_c10d import * # noqa: F403
from .distributed_c10d import (
_all_gather_base,
_coalescing_manager,
_CoalescingManager,
_create_process_group_wrapper,
_get_process_group_name,
_rank_not_in_group,
_reduce_scatter_base,
_time_estimator,
get_node_local_rank,
)
from .remote_device import _remote_device
from .rendezvous import (
_create_store_from_options,
register_rendezvous_handler,
rendezvous,
)
from .device_mesh import DeviceMesh, init_device_mesh
set_debug_level_from_env()
# Variables prefixed with underscore are not auto imported
# See the comment in `distributed_c10d.py` above `_backend` on why we expose
# this.
from .distributed_c10d import * # noqa: F403
from .distributed_c10d import (
_all_gather_base,
_coalescing_manager,
_CoalescingManager,
_create_process_group_wrapper,
_get_process_group_name,
_rank_not_in_group,
_reduce_scatter_base,
_time_estimator,
get_node_local_rank,
)
from .remote_device import _remote_device
from .rendezvous import (
_create_store_from_options,
register_rendezvous_handler,
rendezvous,
)
else:
# This stub is sufficient to get
# python test/test_public_bindings.py -k test_correct_module_names
# working even when USE_DISTRIBUTED=0. Feel free to add more
# stubs as necessary.
# We cannot define stubs directly because they confuse pyre
class _ProcessGroupStub:
pass
set_debug_level_from_env()
sys.modules["torch.distributed"].ProcessGroup = _ProcessGroupStub  # type: ignore[attr-defined]
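A short usage sketch for the rank-gated breakpoint restored above; the wrapper function and its arguments are illustrative only:

import torch.distributed as dist

def inspect_rank_zero(step: int) -> None:
    # Only rank 0 drops into pdb; the other ranks wait in the barrier shown
    # in the diff above until the debugger session ends.
    if step == 0:
        dist.breakpoint(rank=0)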
@@ -10,7 +10,7 @@ from datetime import timedelta
from typing import Protocol, Union
import torch
from torch.distributed._distributed_c10d import (
from torch._C._distributed_c10d import (
_current_process_group,
_set_process_group,
ProcessGroup,

@@ -1,245 +0,0 @@
# mypy: disable-error-code="assignment"
# noqa: F401
"""
Centralized module for importing and re-exporting torch._C._distributed_c10d components.
IMPORTANT PATTERN:
Never access torch._C._distributed_c10d directly in code. Always import from and use
torch.distributed._distributed_c10d which is guaranteed to have all functions available.
Example:
# WRONG: torch._C._distributed_c10d._set_global_rank(rank)
# RIGHT:
from torch.distributed._distributed_c10d import _set_global_rank
_set_global_rank(rank)
"""
from typing import TYPE_CHECKING
# Import all core distributed components from the C extension
# NB: This list has to be spelled out because the _C module doesn't have __all__
from torch._C._distributed_c10d import (
_allow_inflight_collective_as_graph_input,
_broadcast_coalesced,
_compute_bucket_assignment_by_size,
_ControlCollectives,
_current_process_group,
_DEFAULT_FIRST_BUCKET_BYTES,
_DEFAULT_PG_TIMEOUT,
_DistributedBackendOptions,
_make_nccl_premul_sum,
_register_builtin_comm_hook,
_register_comm_hook,
_register_process_group,
_register_work,
_resolve_process_group,
_set_allow_inflight_collective_as_graph_input,
_set_global_rank,
_set_process_group,
_StoreCollectives,
_test_python_store,
_unregister_all_process_groups,
_unregister_process_group,
_verify_params_across_processes,
_WorkerServer,
AllgatherOptions,
AllreduceCoalescedOptions,
AllreduceOptions,
AllToAllOptions,
Backend,
BarrierOptions,
BroadcastOptions,
BuiltinCommHookType,
DebugLevel,
FakeProcessGroup,
FakeWork,
FileStore,
GatherOptions,
get_debug_level,
GradBucket,
Logger,
PrefixStore,
ProcessGroup,
ReduceOp,
ReduceOptions,
Reducer,
ReduceScatterOptions,
ScatterOptions,
set_debug_level,
set_debug_level_from_env,
Store,
TCPStore,
Work,
)

# Backend-specific components that may not be available
_MPI_AVAILABLE = False
_NCCL_AVAILABLE = False
_GLOO_AVAILABLE = False
_UCC_AVAILABLE = False
_XCCL_AVAILABLE = False

# HashStore
try:
from torch._C._distributed_c10d import HashStore
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import HashStore

# NVSHMEM/SymmetricMemory components

# There are multiple backends for SymmetricMemory, as a result,
# _SymmetricMemory should not be imported together with NVSHMEM related modules.
try:
from torch._C._distributed_c10d import _SymmetricMemory
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import _SymmetricMemory

try:
from torch._C._distributed_c10d import (
_is_nvshmem_available,
_nvshmemx_cumodule_init,
)
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import (
_is_nvshmem_available,
_nvshmemx_cumodule_init,
)

# MPI backend
try:
from torch._C._distributed_c10d import ProcessGroupMPI
_MPI_AVAILABLE = True
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import ProcessGroupMPI

# NCCL backend
try:
from torch._C._distributed_c10d import (
_DEFAULT_PG_NCCL_TIMEOUT,
_dump_nccl_trace,
_dump_nccl_trace_json,
_hash_tensors,
ProcessGroupNCCL,
)
_NCCL_AVAILABLE = True
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import (
_DEFAULT_PG_NCCL_TIMEOUT,
_dump_nccl_trace,
_dump_nccl_trace_json,
_hash_tensors,
ProcessGroupNCCL,
)

# Gloo backend
try:
from torch._C._distributed_c10d import _ProcessGroupWrapper, ProcessGroupGloo
_GLOO_AVAILABLE = True
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import _ProcessGroupWrapper, ProcessGroupGloo

# UCC backend
try:
from torch._C._distributed_c10d import ProcessGroupUCC
_UCC_AVAILABLE = True
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import ProcessGroupUCC

# XCCL backend
try:
from torch._C._distributed_c10d import ProcessGroupXCCL
_XCCL_AVAILABLE = True
except ImportError:
if not TYPE_CHECKING:
from torch.distributed._C_stubs import ProcessGroupXCCL

# Provide backwards compatibility by making all symbols available at module level
__all__ = [
# Basic components
"_broadcast_coalesced",
"_compute_bucket_assignment_by_size",
"_ControlCollectives",
"_DEFAULT_FIRST_BUCKET_BYTES",
"_DEFAULT_PG_TIMEOUT",
"_DEFAULT_PG_NCCL_TIMEOUT",
"_make_nccl_premul_sum",
"_register_builtin_comm_hook",
"_register_comm_hook",
"_StoreCollectives",
"_test_python_store",
"_verify_params_across_processes",
"_allow_inflight_collective_as_graph_input",
"_register_work",
"_set_allow_inflight_collective_as_graph_input",
"_is_nvshmem_available",
"_nvshmemx_cumodule_init",
"_SymmetricMemory",
"_hash_tensors",
"_set_global_rank",
"_dump_nccl_trace",
"_dump_nccl_trace_json",
"Backend",
"BuiltinCommHookType",
"DebugLevel",
"FakeProcessGroup",
"FileStore",
"get_debug_level",
"GradBucket",
"HashStore",
"Logger",
"PrefixStore",
"ProcessGroup",
"Reducer",
"ReduceOp",
"set_debug_level",
"set_debug_level_from_env",
"Store",
"TCPStore",
"Work",
"FakeWork",
# Additional distributed_c10d components
"_DistributedBackendOptions",
"_register_process_group",
"_resolve_process_group",
"_unregister_all_process_groups",
"_unregister_process_group",
"_current_process_group",
"_set_process_group",
"_WorkerServer",
"AllgatherOptions",
"AllreduceCoalescedOptions",
"AllreduceOptions",
"AllToAllOptions",
"BarrierOptions",
"BroadcastOptions",
"GatherOptions",
"ReduceOptions",
"ReduceScatterOptions",
"ScatterOptions",
# Process group implementations
"ProcessGroupMPI",
"ProcessGroupNCCL",
"ProcessGroupGloo",
"ProcessGroupUCC",
"ProcessGroupXCCL",
"_ProcessGroupWrapper",
# Availability flags
"_MPI_AVAILABLE",
"_NCCL_AVAILABLE",
"_GLOO_AVAILABLE",
"_UCC_AVAILABLE",
"_XCCL_AVAILABLE",
]
@@ -7,10 +7,6 @@ from typing import Any, cast, Optional, TYPE_CHECKING, Union
import torch
import torch.distributed as dist
import torch.distributed.distributed_c10d as c10d
from torch.distributed._distributed_c10d import (
_allow_inflight_collective_as_graph_input,
_set_allow_inflight_collective_as_graph_input,
)
from torch.distributed.device_mesh import DeviceMesh
from torch.fx.experimental.proxy_tensor import get_proxy_mode

@@ -862,13 +858,15 @@ def allow_inflight_collective_as_graph_input_ctx(value: bool = True):
will be registered in the work registry, and the wait_tensor() in compiled region called on
the output tensor of the collective will wait on the correct work object.
"""
previous = _allow_inflight_collective_as_graph_input()
previous = torch._C._distributed_c10d._allow_inflight_collective_as_graph_input()
try:
_set_allow_inflight_collective_as_graph_input(value)
torch._C._distributed_c10d._set_allow_inflight_collective_as_graph_input(value)
yield
finally:
_set_allow_inflight_collective_as_graph_input(previous)
torch._C._distributed_c10d._set_allow_inflight_collective_as_graph_input(
previous
)

def _make_all_gather_out_tensor(input, group_size):
||||
|
@@ -4,7 +4,7 @@ import copy
import torch
import torch.distributed as dist
import torch.distributed._shard.sharding_spec as shard_spec
from torch.distributed._distributed_c10d import ProcessGroup
from torch._C._distributed_c10d import ProcessGroup
from torch.distributed._shard.metadata import ShardMetadata
from torch.distributed._shard.sharding_spec._internals import (
    get_chunked_dim_size,

@@ -4,7 +4,7 @@ from typing import cast

import torch
import torch.distributed as dist
from torch.distributed._distributed_c10d import ReduceOp
from torch._C._distributed_c10d import ReduceOp
from torch.distributed._shard.sharded_tensor import ShardedTensor
from torch.distributed._shard.sharding_spec import ChunkShardingSpec
from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op

@@ -15,12 +15,7 @@ import torch
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch._C._autograd import DeviceType
from torch.distributed._distributed_c10d import (
    _register_work,
    _SymmetricMemory,
    ProcessGroup,
    Work as _Work,
)
from torch._C._distributed_c10d import _SymmetricMemory, Work as _Work


_group_name_to_store: dict[str, c10d.Store] = {}
@@ -1493,7 +1488,7 @@ def _low_contention_all_gather(
        src_buf = symm_mem.get_buffer(remote_rank, tensor.shape, tensor.dtype)
        chunks[remote_rank].copy_(src_buf)
    symm_mem.barrier()
    _register_work(output, Work())
    torch._C._distributed_c10d._register_work(output, Work())
    return output


@@ -1541,7 +1536,7 @@ def _low_contention_reduce_scatter_with_symm_mem_input(
        ret = ret.mean(dim=0)
    else:
        raise ValueError(f"reduce_op ({reduce_op}) is not supported")
    _register_work(ret, Work())
    torch._C._distributed_c10d._register_work(ret, Work())
    return ret


@@ -1576,7 +1571,7 @@ def _low_contention_reduce_scatter_with_workspace(
        ret = ret.mean(dim=0)
    else:
        raise ValueError(f"reduce_op ({reduce_op}) is not supported")
    _register_work(ret, Work())
    torch._C._distributed_c10d._register_work(ret, Work())
    return ret


@@ -1654,6 +1649,7 @@ from typing import overload, TYPE_CHECKING, Union


if TYPE_CHECKING:
    from torch._C._distributed_c10d import ProcessGroup
    from torch.types import _device, _dtype, _int


@@ -1731,6 +1727,8 @@ def rendezvous(
        group (Union[str, :class:`torch.distributed.ProcessGroup`]): The group identifying the
            participating processes. This can be either a group name or a process group object.
    """
    from torch._C._distributed_c10d import ProcessGroup

    if isinstance(group, str):
        group_name = group
    elif isinstance(group, ProcessGroup):
@@ -1748,7 +1746,11 @@ def is_nvshmem_available() -> bool:

    Check if NVSHMEM is available in current build and on current system.
    """
    from torch.distributed._distributed_c10d import _is_nvshmem_available
    try:
        from torch._C._distributed_c10d import _is_nvshmem_available
    except ImportError:
        # Not all builds have NVSHMEM support.
        return False

    # Check if NVSHMEM is available on current system.
    return _is_nvshmem_available()

@@ -75,7 +75,7 @@ def enable_triton(lib_dir: Optional[str] = None) -> dict[str, str]:
    """
    import triton

    from torch.distributed._distributed_c10d import _nvshmemx_cumodule_init
    from torch._C._distributed_c10d import _nvshmemx_cumodule_init

    if lib_dir is not None:
        lib_path = os.path.join(lib_dir, "libnvshmem_device.bc")

@@ -2,9 +2,7 @@ import random
from typing import Any

import torch

# Import centralized distributed components
from torch.distributed._distributed_c10d import (
from torch._C._distributed_c10d import (
    _resolve_process_group,
    FakeWork,
    ProcessGroup,

@@ -5,6 +5,10 @@ from typing import Union

import torch
import torch.distributed as dist

# The two imports below are not always available depending on the
# USE_DISTRIBUTED compile flag. Make sure they raise import error
# if we're trying to use them.
from torch.distributed import group, ProcessGroup


@@ -1,11 +1,7 @@
from datetime import timedelta
from typing import Optional

# Import from centralized fallback module - no ImportError handling needed
from torch.distributed._distributed_c10d import (
    _DEFAULT_PG_NCCL_TIMEOUT,
    _DEFAULT_PG_TIMEOUT,
)
from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT


__all__ = ["default_pg_timeout", "default_pg_nccl_timeout"]
@@ -20,4 +16,11 @@ default_pg_timeout: timedelta = _DEFAULT_PG_TIMEOUT
# Later, we could consider merging them back together at the c++ layer if we can align on a same value.
# (only if TORCH_NCCL_BLOCKING_WAIT or TORCH_NCCL_ASYNC_ERROR_HANDLING is set to 1).

default_pg_nccl_timeout: Optional[timedelta] = _DEFAULT_PG_NCCL_TIMEOUT
try:
    from torch._C._distributed_c10d import _DEFAULT_PG_NCCL_TIMEOUT

    default_pg_nccl_timeout: Optional[timedelta] = _DEFAULT_PG_NCCL_TIMEOUT
except ImportError:
    # if C++ NCCL support is not compiled, we don't have access to the default nccl value.
    # if anyone is actually trying to use nccl in this state, it should error.
    default_pg_nccl_timeout = None

@@ -11,14 +11,35 @@ from itertools import chain, zip_longest
from typing import Optional, TYPE_CHECKING, Union

import torch
from torch.distributed import is_available
from torch.utils._typing_utils import not_none


__all__ = ["init_device_mesh", "DeviceMesh"]


if True:  # just to temporarily avoid reindentation
    from torch.distributed._distributed_c10d import Backend as C10dBackend
if not is_available():
    import sys

    # We need to create the stubs when distributed is not available.
    # Otherwise, we would fail the doc tests (```./.ci/pytorch/docs-test.sh```),
    # since it would try to import ``torch.distributed.device_mesh`` or
    # ``torch.distributed.init_device_mesh`` but cannot find them.

    class _DeviceMeshStub:
        pass

    def _init_device_mesh_stub():
        pass

    sys.modules["torch.distributed.device_mesh"].DeviceMesh = _DeviceMeshStub  # type: ignore[attr-defined]
    sys.modules[
        "torch.distributed.device_mesh"
    ].init_device_mesh = _init_device_mesh_stub  # type: ignore[attr-defined]


else:
    from torch._C._distributed_c10d import Backend as C10dBackend
    from torch.distributed.distributed_c10d import (
        _get_default_group,
        _resolve_process_group,
@@ -512,16 +533,15 @@ if True:  # just to temporarily avoid reindentation
            # heuristic to set the current cuda/cuda-like device base on num of gpu devices available in each host
            # NOTE: This device selection would only work for homogeneous hardware.
            num_devices_per_host = device_handle.device_count()
            if num_devices_per_host:
                if (
                    world_size > num_devices_per_host
                    and world_size % num_devices_per_host != 0
                ):
                    raise RuntimeError(
                        f"DeviceMesh only support homogeneous hardware, but found "
                        f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!"
                    )
                device_handle.set_device(get_rank() % num_devices_per_host)
            if (
                world_size > num_devices_per_host
                and world_size % num_devices_per_host != 0
            ):
                raise RuntimeError(
                    f"DeviceMesh only support homogeneous hardware, but found "
                    f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!"
                )
            device_handle.set_device(get_rank() % num_devices_per_host)

            return _get_default_group()

@@ -19,21 +19,13 @@ from typing import Any, Callable, Optional, TYPE_CHECKING, Union
from typing_extensions import deprecated

import torch
import torch.distributed._distributed_c10d as _c10d
from torch._C import _DistStoreError as DistStoreError
from torch._utils_internal import set_pytorch_distributed_envs_from_justknobs
from torch.distributed._distributed_c10d import (  # Process group implementations; Availability flags
from torch._C._distributed_c10d import (
    _DistributedBackendOptions,
    _GLOO_AVAILABLE,
    _MPI_AVAILABLE,
    _NCCL_AVAILABLE,
    _ProcessGroupWrapper,
    _register_process_group,
    _resolve_process_group,
    _UCC_AVAILABLE,
    _unregister_all_process_groups,
    _unregister_process_group,
    _XCCL_AVAILABLE,
    AllgatherOptions,
    AllreduceCoalescedOptions,
    AllreduceOptions,
@@ -45,11 +37,6 @@ from torch.distributed._distributed_c10d import ( # Process group implementatio
    get_debug_level,
    PrefixStore,
    ProcessGroup,
    ProcessGroupGloo,
    ProcessGroupMPI,
    ProcessGroupNCCL,
    ProcessGroupUCC,
    ProcessGroupXCCL,
    ReduceOp,
    ReduceOptions,
    ReduceScatterOptions,
@@ -57,6 +44,7 @@ from torch.distributed._distributed_c10d import ( # Process group implementatio
    Store,
    Work,
)
from torch._utils_internal import set_pytorch_distributed_envs_from_justknobs
from torch.monitor import _WaitCounter
from torch.overrides import handle_torch_function, has_torch_function
from torch.utils._typing_utils import not_none
@@ -143,11 +131,17 @@ __all__ = [
    "split_group",
]

_MPI_AVAILABLE = True
_NCCL_AVAILABLE = True
_GLOO_AVAILABLE = True
_UCC_AVAILABLE = True
_XCCL_AVAILABLE = True

_pickler = pickle.Pickler
_unpickler = pickle.Unpickler


# Change __module__ of all imported types from the distributed wrapper that are public
# Change __module__ of all imported types from torch._C._distributed_c10d that are public
def _export_c_types() -> None:
    _public_types_to_change_module = [
        AllreduceCoalescedOptions,
@@ -173,26 +167,45 @@ def _export_c_types() -> None:

_export_c_types()

# Add process groups to __all__ and set their module based on availability
if _MPI_AVAILABLE:
try:
    from torch._C._distributed_c10d import ProcessGroupMPI

    ProcessGroupMPI.__module__ = "torch.distributed.distributed_c10d"
    __all__ += ["ProcessGroupMPI"]
except ImportError:
    _MPI_AVAILABLE = False

try:
    from torch._C._distributed_c10d import ProcessGroupNCCL

if _NCCL_AVAILABLE:
    ProcessGroupNCCL.__module__ = "torch.distributed.distributed_c10d"
    __all__ += ["ProcessGroupNCCL"]
except ImportError:
    _NCCL_AVAILABLE = False

try:
    from torch._C._distributed_c10d import _ProcessGroupWrapper, ProcessGroupGloo

if _GLOO_AVAILABLE:
    ProcessGroupGloo.__module__ = "torch.distributed.distributed_c10d"
    __all__ += ["ProcessGroupGloo"]
except ImportError:
    _GLOO_AVAILABLE = False

try:
    from torch._C._distributed_c10d import ProcessGroupUCC

if _UCC_AVAILABLE:
    ProcessGroupUCC.__module__ = "torch.distributed.distributed_c10d"
    __all__ += ["ProcessGroupUCC"]
except ImportError:
    _UCC_AVAILABLE = False

try:
    from torch._C._distributed_c10d import ProcessGroupXCCL

if _XCCL_AVAILABLE:
    ProcessGroupXCCL.__module__ = "torch.distributed.distributed_c10d"
    __all__ += ["ProcessGroupXCCL"]
except ImportError:
    _XCCL_AVAILABLE = False

logger = logging.getLogger(__name__)

@@ -1312,8 +1325,7 @@ def _get_default_store() -> Store:
def _update_default_pg(pg) -> None:
    _world.default_pg = pg
    rank = pg.rank() if pg is not None and pg != GroupMember.NON_GROUP_MEMBER else -1

    _c10d._set_global_rank(rank)
    torch._C._distributed_c10d._set_global_rank(rank)


def get_backend_config(group: Optional[ProcessGroup] = None) -> str:
@@ -1950,7 +1962,7 @@ def _new_process_group_helper(

    if device_id:
        pg.bound_device_id = device_id
    backend_class: _c10d.Backend
    backend_class: torch._C._distributed_c10d.Backend
    for device, backend_str in backend_config.get_device_backend_map().items():
        # Use the group name as prefix in the default store, such that
        # a single store can be reused by multiple groups.
@@ -3065,9 +3077,7 @@ def _object_to_tensor(obj, device, group):
    if get_debug_level() == DebugLevel.DETAIL and is_nccl_available():
        backend = get_backend(group)
        if backend == Backend.NCCL:
            from torch.distributed._distributed_c10d import _hash_tensors

            hash = _hash_tensors([byte_tensor])
            hash = torch._C._distributed_c10d._hash_tensors([byte_tensor])
            logger.warning(
                "_object_to_tensor size: %s hash value: %s",
                byte_tensor.numel(),
@@ -3082,9 +3092,7 @@ def _tensor_to_object(tensor, tensor_size, group):
    if get_debug_level() == DebugLevel.DETAIL and is_nccl_available():
        backend = get_backend(group)
        if backend == Backend.NCCL:
            from torch.distributed._distributed_c10d import _hash_tensors

            hash = _hash_tensors([tensor])
            hash = torch._C._distributed_c10d._hash_tensors([tensor])
            logger.warning(
                "_tensor_to_object size: %s hash value: %s", tensor.numel(), hash
            )
@@ -4961,7 +4969,7 @@ def monitored_barrier(


def _create_process_group_wrapper(
    wrapped_pg: _c10d.Backend,
    wrapped_pg: torch._C._distributed_c10d.Backend,
    store_prefix: str,
    store: Store,
    rank: int,

@@ -14,7 +14,7 @@ TORCH_WORKER_SERVER_SOCKET = "TORCH_WORKER_SERVER_SOCKET"

@contextmanager
def _worker_server(socket_path: str) -> Generator[None, None, None]:
    from torch.distributed._distributed_c10d import _WorkerServer
    from torch._C._distributed_c10d import _WorkerServer

    server = _WorkerServer(socket_path)
    try:

@@ -2,6 +2,10 @@
import torch
import torch.distributed as dist
from torch.autograd import Function

# The two imports below are not always available depending on the
# USE_DISTRIBUTED compile flag. Make sure they raise import error
# if we're trying to use them.
from torch.distributed import group, ReduceOp


@@ -37,6 +37,7 @@ if is_available():
    import numbers

    import torch.distributed.autograd as dist_autograd
    from torch._C._distributed_c10d import Store
    from torch._C._distributed_rpc import (  # noqa: F401
        _cleanup_python_rpc_handler,
        _DEFAULT_INIT_METHOD,
@@ -69,7 +70,6 @@ if is_available():
        RpcBackendOptions,
        WorkerInfo,
    )
    from torch.distributed._distributed_c10d import Store

    if _is_tensorpipe_available:
        from torch._C._distributed_rpc import (  # noqa: F401

@@ -8,10 +8,8 @@ from typing import Optional
import torch
import torch.distributed._functional_collectives as funcol
import torch.distributed.tensor._dtensor_spec as dtensor_spec
from torch._C._distributed_c10d import _resolve_process_group
from torch._logging import warning_once

# Import from centralized fallback module - no conditional imports needed
from torch.distributed._distributed_c10d import _resolve_process_group
from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
from torch.distributed.distributed_c10d import (
    _get_group_size_by_name,

@@ -1,7 +1,7 @@
# mypy: allow-untyped-defs

import torch.distributed as dist
from torch.distributed._distributed_c10d import FakeProcessGroup
from torch._C._distributed_c10d import FakeProcessGroup


class FakeStore(dist.Store):

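The recurring change across these hunks is a return to guarded imports from torch._C._distributed_c10d: each optional backend is imported inside a try/except ImportError block and a module-level availability flag is cleared when the C++ binding is missing. A minimal standalone sketch of that idiom, using only names that appear in the diff above (the torch._C._distributed_c10d bindings are only present in builds compiled with distributed support):

    # Sketch of the guarded-import idiom restored by this revert.
    # ProcessGroupNCCL exists in torch._C._distributed_c10d only when the
    # C++ distributed bindings (and NCCL) were compiled in.
    _NCCL_AVAILABLE = True

    try:
        from torch._C._distributed_c10d import ProcessGroupNCCL
    except ImportError:
        # Builds without NCCL simply mark the backend unavailable;
        # callers are expected to check the flag before using it.
        _NCCL_AVAILABLE = False

    if _NCCL_AVAILABLE:
        ProcessGroupNCCL.__module__ = "torch.distributed.distributed_c10d"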