Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)

Always build USE_DISTRIBUTED. (#160449)

With this change the distributed sources are always compiled; the USE_DISTRIBUTED flag now only controls whether the default distributed backends (such as gloo and NCCL) are enabled.

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160449
Approved by: https://github.com/wconstab, https://github.com/albanD, https://github.com/dcci

Committed by: PyTorch MergeBot
Parent: 6087ef41e5
Commit: de893e96c7
@@ -35,11 +35,10 @@ fi

 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
-  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
-  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
-  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
+  # backends (specifically the gloo backend), so test that this case works too
   USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
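As an aside on the new comment above: after this change a wheel built with USE_DISTRIBUTED=0 still ships torch.distributed, and only the default backends are compiled out. A minimal sanity check along these lines (illustrative only, not part of this PR) could be:

    import torch.distributed as dist

    # The module is always built now, so this is expected to hold even for a
    # USE_DISTRIBUTED=0 build such as the default macOS wheel.
    assert dist.is_available()

    # What USE_DISTRIBUTED=0 actually removes are the default backends;
    # these helpers report which ones were compiled in.
    print(dist.is_gloo_available(), dist.is_nccl_available(), dist.is_mpi_available())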
@@ -16,6 +16,8 @@ popd
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1

+python -mpip install --no-input -r requirements.txt
+
 setup_test_python() {
   # The CircleCI worker hostname doesn't resolve to an address.
   # This environment variable makes ProcessGroupGloo default to
@@ -213,7 +213,8 @@ pip install requests ninja typing-extensions
 retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
 retry brew install libomp

-# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
+# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
+# is build as part of tensorpipe submodule
 export USE_DISTRIBUTED=1

 export USE_MKLDNN=OFF
@@ -22,7 +22,6 @@ COMMON_COPTS = [
     "-DHAVE_SHM_UNLINK=1",
     "-D_FILE_OFFSET_BITS=64",
     "-DUSE_FBGEMM",
-    "-DUSE_DISTRIBUTED",
     "-DAT_PER_OPERATOR_HEADERS",
     "-DATEN_THREADING=NATIVE",
     "-DNO_CUDNN_DESTROY_HANDLE",
@@ -181,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
   set(CPU_POWER ON)
 endif()

-# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
-# tested and likely won't work without additional changes.
+# For non-supported platforms, turn USE_DISTRIBUTED off by default.
+# NB: USE_DISTRIBUTED simply disables the backend; distributed code
+# still gets built
 if(NOT LINUX AND NOT WIN32)
   set(USE_DISTRIBUTED
       OFF
@@ -261,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
 option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
 cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
-option(USE_DISTRIBUTED "Use distributed" ON)
+option(USE_DISTRIBUTED "Enable default distributed backends" ON)
 cmake_dependent_option(USE_NCCL "Use NCCL" ON
     "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(USE_XCCL "Use XCCL" ON
-    "USE_XPU;UNIX;NOT APPLE" OFF)
+    "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
 cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
 cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@@ -430,11 +431,10 @@ if(WIN32)
     PATH_SUFFIXES lib
     NO_DEFAULT_PATH)
   if(NOT libuv_tmp_LIBRARY)
-    set(USE_DISTRIBUTED OFF)
     set(USE_GLOO OFF)
     message(
       WARNING
-      "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
+      "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
       "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
     )
   else()
@@ -948,6 +948,7 @@ def define_buck_targets(
         [
             ("torch/csrc/api/include", "torch/**/*.h"),
             ("", "torch/csrc/**/*.h"),
+            ("", "torch/csrc/**/*.hpp"),
             ("", "torch/nativert/**/*.h"),
             ("", "torch/headeronly/**/*.h"),
             ("", "torch/script.h"),
@@ -2033,6 +2034,7 @@ def define_buck_targets(
             ("", "caffe2/utils/*.h"),
             ("", "caffe2/core/*.h"),
             ("", "torch/csrc/*.h"),
+            ("", "torch/csrc/*.hpp"),
             ("", "torch/csrc/api/include/torch/*.h"),
             ("", "torch/csrc/autograd/*.h"),
             ("", "torch/csrc/autograd/*/*.h"),
@@ -540,12 +540,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
     ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
   )

-  if(USE_DISTRIBUTED)
   append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
   if(NOT WIN32)
     append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
   endif()
-  endif()
 endif()

 if(USE_CUDA OR USE_ROCM)
@@ -568,7 +566,6 @@ if(USE_CUDA)
     list(APPEND Caffe2_GPU_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
-  if(USE_DISTRIBUTED)
   append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
   if(NOT WIN32)
     append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
@@ -594,7 +591,6 @@ if(USE_CUDA)
     if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
       set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
     endif()
-  endif()
   set_source_files_properties(
     ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
     PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
@@ -626,12 +622,10 @@ if(USE_ROCM)
     list(APPEND Caffe2_HIP_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
-  if(USE_DISTRIBUTED)
   append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
   if(NOT WIN32)
     append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
   endif()
-  endif()
   # caffe2_nvrtc's stubs to driver APIs are useful for HIP.
   # See NOTE [ ATen NVRTC Stub and HIP ]
   add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
@@ -1351,13 +1345,11 @@ if(BUILD_TEST)
   add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
   add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
   add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
-  if(USE_DISTRIBUTED)
   add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
   if(NOT WIN32)
     add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
     add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
   endif()
-  endif()
   if(NOT NO_API)
     add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
   endif()
@@ -1461,46 +1453,40 @@ if(BUILD_LITE_INTERPRETER)
   endif()
 endif()

-# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
-# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
-if(USE_DISTRIBUTED)
-  target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
-  if(USE_GLOO AND USE_C10D_GLOO)
+if(USE_GLOO AND USE_C10D_GLOO)
   target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
 endif()
 if(USE_UCC AND USE_C10D_UCC)
   target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
   if(USE_CUDA)
     target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
   endif()
 endif()
 if(USE_NCCL AND USE_C10D_NCCL)
   if(USE_ROCM)
     target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
   else()
     target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
   endif()
 endif()
 if(USE_MPI AND USE_C10D_MPI)
   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     set_source_files_properties(
       "${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
       PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
   endif()
   target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
 endif()
 # Pass USE_RPC in order to reduce use of
 # #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
 # need to be removed when RPC is supported
 if(NOT WIN32)
   target_compile_definitions(torch_cpu PUBLIC USE_RPC)
 endif()
 # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
 # can only be compiled with USE_TENSORPIPE is set.
 if(USE_TENSORPIPE)
   target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
-endif()
 endif()

 if(NOT INTERN_BUILD_MOBILE)
@@ -1126,7 +1126,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
   include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
 endif()

-if(USE_DISTRIBUTED AND USE_TENSORPIPE)
+if(USE_TENSORPIPE)
   if(MSVC)
     message(WARNING "Tensorpipe cannot be used on Windows.")
   else()
@@ -191,13 +191,11 @@ function(caffe2_print_configuration_summary)
   message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
   message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
   message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
-  if(${USE_DISTRIBUTED})
   message(STATUS " USE_MPI : ${USE_MPI}")
   message(STATUS " USE_GLOO : ${USE_GLOO}")
   message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
   message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
   message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
-  endif()
   if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
     message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
   endif()
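Tangentially (an aside, not part of the diff): the switches summarized by this CMake function can also be inspected from an installed build. torch.__config__.show() prints the compiled-in build options, though exactly which distributed-related flags appear varies by version and build:

    import torch

    # Dumps the build summary (compiler, BLAS, USE_* feature flags, ...).
    print(torch.__config__.show())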
@@ -3333,13 +3333,6 @@ def coverage_post_process(app, exception):
     if not isinstance(app.builder, CoverageBuilder):
         return

-    if not torch.distributed.is_available():
-        raise RuntimeError(
-            "The coverage tool cannot run with a version "
-            "of PyTorch that was built with USE_DISTRIBUTED=0 "
-            "as this module's API changes."
-        )
-
     # These are all the modules that have "automodule" in an rst file
     # These modules are the ones for which coverage is checked
     # Here, we make sure that no module is missing from that list
@@ -1,4 +1,4 @@
-if(USE_DISTRIBUTED AND NOT WIN32)
+if(NOT WIN32)
   set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
   set(DIST_AUTOGRAD_TEST_SOURCES
     ${TORCH_ROOT}/test/cpp/common/main.cpp
@@ -65,10 +65,7 @@ from torch.export.passes import move_to_device_pass
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 from torch.testing import FileCheck
-from torch.testing._internal.common_cuda import (
-    PLATFORM_SUPPORTS_FLASH_ATTENTION,
-    xfailIfDistributedNotSupported,
-)
+from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.common_utils import (
     find_library_location,
     IS_FBCODE,
@@ -15555,7 +15552,6 @@ class GraphModule(torch.nn.Module):
         finally:
             torch.distributed.destroy_process_group()

-    @xfailIfDistributedNotSupported
     def test_distributed_all_reduce(self):
         class Foo(torch.nn.Module):
             def __init__(self):
@@ -15573,7 +15569,6 @@ class GraphModule(torch.nn.Module):
         inp = (torch.randn(4, 4),)
         self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))

-    @xfailIfDistributedNotSupported
     def test_distributed_all_gather(self):
         class Foo(torch.nn.Module):
             def forward(self, x):
@@ -15589,7 +15584,6 @@ class GraphModule(torch.nn.Module):
                 torch.allclose(a, b) for a, b in zip(ep.module()(*inp), m(*inp))
             )

-    @xfailIfDistributedNotSupported
     def test_distributed_all_gather_into_tensor(self):
         class Foo(torch.nn.Module):
             def forward(self, x):
@@ -15603,7 +15597,6 @@ class GraphModule(torch.nn.Module):
         inp = (torch.randn(2),)
         self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))

-    @xfailIfDistributedNotSupported
     @testing.expectedFailureCppRuntime
     def test_distributed_all_to_all_single(self):
         class Foo(torch.nn.Module):
@@ -15621,7 +15614,6 @@ class GraphModule(torch.nn.Module):
         )
         self.assertEqual(len(nodes), 1)

-    @xfailIfDistributedNotSupported
     @testing.expectedFailureCppRuntime
     def test_distributed_reduce_scatter_tensor(self):
         class Foo(torch.nn.Module):
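For orientation (illustrative, not necessarily what this test harness does): with the xfail guard gone, these export tests only require that some process group can be created. A self-contained single-process analogue over the gloo backend, assuming gloo was compiled in and using an arbitrary temporary file for the store, looks roughly like:

    import torch
    import torch.distributed as dist

    # Single-process "world" over gloo; the FileStore path is arbitrary and
    # avoids having to pick a free TCP port.
    store = dist.FileStore("/tmp/pg_store_example", 1)
    dist.init_process_group(backend="gloo", store=store, rank=0, world_size=1)

    t = torch.ones(4)
    dist.all_reduce(t)  # with world_size=1 this leaves t unchanged
    dist.destroy_process_group()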
@@ -88,8 +88,7 @@ def build_pytorch(
 ) -> None:
     my_env = _create_build_env()
     if (
-        not check_negative_env_flag("USE_DISTRIBUTED")
-        and not check_negative_env_flag("USE_CUDA")
+        not check_negative_env_flag("USE_CUDA")
         and not check_negative_env_flag("USE_NCCL")
         and not check_env_flag("USE_SYSTEM_NCCL")
     ):
@@ -273,32 +273,30 @@ add_custom_command(
     WORKING_DIRECTORY
     "${TORCH_ROOT}"
 )
-if(USE_DISTRIBUTED)
   if(WIN32)
     append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
   else()
     append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
   endif()
   # Disable certain warnings for GCC-9.X
   if(CMAKE_COMPILER_IS_GNUCXX)
     set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
     set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
     set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
-  endif()
-  # NCCL is a private dependency of libtorch, but libtorch_python includes
-  # some private headers of libtorch, which in turn include NCCL. As a hacky
-  # alternative to making NCCL a public dependency of libtorch, we make it
-  # a private dependency of libtorch_python as well.
-  if(USE_NCCL)
-    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
-  endif()
-  # Same for MPI.
-  if(USE_MPI)
-    list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
-  endif()
-  list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)

 endif()
+# NCCL is a private dependency of libtorch, but libtorch_python includes
+# some private headers of libtorch, which in turn include NCCL. As a hacky
+# alternative to making NCCL a public dependency of libtorch, we make it
+# a private dependency of libtorch_python as well.
+if(USE_NCCL)
+  list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+endif()
+# Same for MPI.
+if(USE_MPI)
+  list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
+endif()
+list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)

 if(USE_NCCL AND NOT WIN32)
   list(APPEND TORCH_PYTHON_SRCS
@@ -366,10 +364,6 @@ if(BUILD_LIBTORCHLESS)
     target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL)
   endif()

-  if(USE_DISTRIBUTED)
-    target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED)
-  endif()
-
   if(USE_MPI AND USE_C10D_MPI)
     target_compile_definitions(torch_python PRIVATE USE_C10D_MPI)
   endif()
@@ -15,9 +15,7 @@
 #include <torch/csrc/utils/cpp_stacktraces.h>
 #include <torch/csrc/utils/pybind.h>

-#if defined(USE_DISTRIBUTED)
 #include <torch/csrc/distributed/c10d/exception.h>
-#endif

 inline void PyErr_SetString(PyObject* type, const std::string& message) {
   PyErr_SetString(type, message.c_str());
@@ -120,14 +120,12 @@
 #endif
 #endif

-#ifdef USE_DISTRIBUTED
 #ifdef USE_C10D
 #include <torch/csrc/distributed/autograd/python_autograd.h>
 #include <torch/csrc/distributed/c10d/c10d.h>
 #include <torch/csrc/distributed/rpc/rpc.h>
 #include <torch/csrc/distributed/rpc/testing/testing.h>
 #endif
-#endif

 #if defined(USE_VALGRIND)
 #include <callgrind.h>
@@ -552,11 +550,7 @@ static PyObject* THPModule_getBackcompatKeepdimWarn(
 }

 static PyObject* THPModule_hasDistributed(PyObject* _unused, PyObject* noargs) {
-#ifdef USE_DISTRIBUTED
   Py_RETURN_TRUE;
-#else
-  Py_RETURN_FALSE;
-#endif
 }

 static PyObject* THPModule_showConfig(PyObject* module, PyObject* noargs) {
@@ -1993,7 +1987,7 @@ PyObject* initModule() {
 #ifdef USE_XPU
   THPUtils_addPyMethodDefs(methods, THXPModule_methods());
 #endif
-#if defined(USE_DISTRIBUTED) && defined(USE_C10D)
+#ifdef USE_C10D
   THPUtils_addPyMethodDefs(
       methods, torch::distributed::c10d::python_functions());
 #ifndef _WIN32
@@ -8,9 +8,7 @@
 #include <torch/csrc/autograd/python_autograd.h>
 #include <torch/csrc/autograd/python_cpp_function.h>
 #include <torch/csrc/autograd/python_variable.h>
-#ifdef USE_DISTRIBUTED
 #include <torch/csrc/distributed/autograd/functions/sendrpc_backward.h>
-#endif
 #include <torch/csrc/jit/python/python_tracer.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_numbers.h>
@@ -150,11 +148,9 @@ void THPAutograd_initFunctions() {
   static PyTypeObject CopyBackwardsClass;
   addClass<CopyBackwards, NoCtor>(module, CopyBackwardsClass, "CopyBackwards");

-#ifdef USE_DISTRIBUTED
   static PyTypeObject SendRpcBackwardClass;
   addClass<torch::distributed::autograd::SendRpcBackward, NoCtor>(
       module, SendRpcBackwardClass, "SendRpcBackward");
-#endif

   static PyTypeObject CopySlicesClass;
   addClass<CopySlices, NoCtor>(module, CopySlicesClass, "CopySlices");
@@ -1,7 +1,5 @@

-#ifdef USE_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Functional.hpp>
-#endif
 #include <torch/csrc/inductor/aoti_torch/c/shim_cpu.h>
 #include <torch/csrc/inductor/aoti_torch/utils.h>

@@ -533,7 +531,6 @@ AOTITorchError aoti_torch_cpu__weight_int4pack_mm_cpu_tensor(
   });
 }

-#ifdef USE_DISTRIBUTED
 AOTITorchError aoti_torch_cpu__c10d_functional_all_reduce_(
     AtenTensorHandle inp,
     const char* reduce_op,
@@ -566,4 +563,3 @@ AOTITorchError aoti_torch_cpu__c10d_functional_wait_tensor(
     *ret0 = new_tensor_handle(std::move(tmp_result));
   });
 }
-#endif
@@ -13,6 +13,8 @@
 #include <torch/csrc/Layout.h>
 #include <torch/csrc/QScheme.h>
 #include <torch/csrc/Stream.h>
+#include <torch/csrc/distributed/rpc/py_rref.h>
+#include <torch/csrc/distributed/rpc/rref_impl.h>
 #include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/frontend/schema_matching.h>
 #include <torch/csrc/jit/frontend/tracer.h>
@@ -24,10 +26,6 @@
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_arg_parser.h>
 #include <torch/csrc/utils/six.h>
-#ifdef USE_DISTRIBUTED
-#include <torch/csrc/distributed/rpc/py_rref.h>
-#include <torch/csrc/distributed/rpc/rref_impl.h>
-#endif

 #include <ATen/core/function_schema.h>
 #include <c10/core/Stream.h>
@@ -1225,7 +1225,7 @@ std::shared_ptr<SugaredValue> toSugaredValue(
   } else if (obj.ptr() == py::module::import("torch").attr("_check").ptr()) {
     return std::make_shared<TorchCheckValue>();
 #ifdef USE_RPC
-  // RPC module is only available when build flag "USE_DISTRIBUTED" is on.
+  // This is not defined on WINDOWS
   } else if (
       isRpcAvailable &&
       obj.ptr() ==
@@ -1238,7 +1238,6 @@ std::shared_ptr<SugaredValue> toSugaredValue(
     return SpecialFormValue::create(prim::rpc_sync);
   } else if (
       isRpcAvailable &&
-      // RPC module is only available when build flag "USE_DISTRIBUTED" is on.
       obj.ptr() ==
           py::module::import("torch.distributed.rpc").attr("remote").ptr()) {
     return SpecialFormValue::create(prim::rpc_remote);
@@ -128,13 +128,8 @@ struct InterpreterContinuation {
       std::optional<at::ThreadLocalState> tls_state = std::nullopt)
       : state(std::move(state_)),
         stack(std::move(stack_)),
-        tls_state_(std::move(tls_state))
-#ifdef USE_DISTRIBUTED
-        ,
-        dist_autograd_context_id_(dist_autograd_context_id)
-#endif
-  {
-  }
+        tls_state_(std::move(tls_state)),
+        dist_autograd_context_id_(dist_autograd_context_id) {}

   void operator()();

@@ -142,9 +137,10 @@ struct InterpreterContinuation {
   InterpreterState state;
   Stack stack;
   std::optional<at::ThreadLocalState> tls_state_ = std::nullopt;
-#ifdef USE_DISTRIBUTED
-  int64_t dist_autograd_context_id_;
+#ifndef USE_RPC
+  [[maybe_unused]]
 #endif
+  int64_t dist_autograd_context_id_;
 };

 // what is the tensors type, including state from the current execution context
@@ -79,9 +79,7 @@ class TORCH_API Pickler {
   void pushTuple(const IValue& ivalue);
   void pushString(const std::string& string);
   void pushDevice(const IValue& ivalue);
-#ifdef USE_DISTRIBUTED
   void pushRRef(const IValue& ivalue);
-#endif
   // unmemoized version
   void pushStringImpl(const std::string& string);
   void pushStorageOfTensor(const at::Tensor& tensor);
@@ -140,9 +140,7 @@ class TORCH_API Unpickler {
   void rebuildParameter();
   void rebuildTensorFromTypeV2();
   void rebuildSparseTensor();
-#ifdef USE_DISTRIBUTED
   void rebuildRRef();
-#endif
   PickleOpCode readInstruction();
   PickleOpCode readOpCode() {
     return static_cast<PickleOpCode>(read<uint8_t>());
@@ -30,15 +30,12 @@
 #include <torch/csrc/profiler/standalone/execution_trace_observer.h>
 #include <torch/csrc/profiler/util.h>

-#ifdef USE_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/ParamCommsUtils.hpp>
-#endif // USE_DISTRIBUTED

 using namespace at;

 // Collective property attributes
 // https://github.com/pytorch/pytorch/issues/124674
-#ifdef USE_DISTRIBUTED
 constexpr auto kETCommsName = "collective_name";
 constexpr auto kETInMsgNelems = "in_msg_nelems";
 constexpr auto kETOutMsgNelems = "out_msg_nelems";
@@ -49,7 +46,6 @@ constexpr auto kETGlobalRankStride = "global_rank_stride";
 constexpr auto kETGroupSize = "pg_size";
 constexpr auto kETProcessGroupName = "pg_name";
 constexpr auto kETProcessGroupDesc = "pg_desc";
-#endif // USE_DISTRIBUTED

 namespace torch::profiler::impl {

@@ -269,7 +265,6 @@ static std::ofstream openOutputFile(const std::string& name) {
   return stream;
 }

-#ifdef USE_DISTRIBUTED
 static std::string getAttrJson(
     const std::string& name,
     const std::string& type,
@@ -282,7 +277,6 @@ static std::string getAttrJson(
       type,
       value);
 }
-#endif

 static void writeJsonNode(
     std::ofstream& out,
@@ -660,7 +654,6 @@ static void handleKernelBackendInfo(
 inline std::string getCommsNodeAttrs(const RecordFunction& fn) { // NOLINT
   std::vector<std::string> attrs;

-#ifdef USE_DISTRIBUTED
   // We rely on paramcommsdebug object that is available in thread local info
   auto debugInfo = dynamic_cast<ParamCommsDebugInfo*>(
       c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO));
@@ -704,8 +697,6 @@ inline std::string getCommsNodeAttrs(const RecordFunction& fn) { // NOLINT

   addAttr(kGroupSize, kETGroupSize, "uint64");

-#endif // USE_DISTRIBUTED
-
   // XXX consider using as string stream?
   return attrs.empty() ? "" : fmt::format(", {}", fmt::join(attrs, ", "));
 }
@@ -11,9 +11,7 @@
 #ifdef USE_KINETO
 #include <libkineto.h>
 #endif
-#ifdef USE_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/ParamCommsUtils.hpp>
-#endif // USE_DISTRIBUTED

 namespace torch::profiler::impl {

@@ -455,7 +453,7 @@ std::unordered_map<std::string, std::string> saveNcclMeta(
     // @lint-ignore CLANGTIDY
     const SaveNcclMetaConfig& config) {
   std::unordered_map<std::string, std::string> map;
-#ifdef USE_DISTRIBUTED
+#if !defined(BUILD_LITE_INTERPRETER) && !defined(C10_MOBILE)
   auto debugInfo = dynamic_cast<ParamCommsDebugInfo*>(
       c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO));

@@ -565,7 +563,7 @@ std::unordered_map<std::string, std::string> saveNcclMeta(
       }
     }
   }
-#endif // USE_DISTRIBUTED
+#endif // !defined(BUILD_LITE_INTERPRETER) && !defined(C10_MOBILE)
   return map;
 }

@@ -185,7 +185,6 @@ struct HashCombine {
   }
 };

-#ifdef USE_DISTRIBUTED
 constexpr auto kCommsName = "Collective name";
 constexpr auto kDtype = "dtype";
 constexpr auto kInMsgNelems = "In msg nelems";
@@ -203,6 +202,5 @@ constexpr auto kP2pSrc = "Src Rank";
 constexpr auto kP2pDst = "Dst Rank";
 constexpr auto kInTensorsStart = "Input Tensors start";
 constexpr auto kOutTensorsStart = "Output Tensors start";
-#endif // USE_DISTRIBUTED

 } // namespace torch::profiler::impl
@@ -14,16 +14,10 @@ log = logging.getLogger(__name__)

 def is_available() -> bool:
     """
-    Return ``True`` if the distributed package is available.
-
-    Otherwise,
-    ``torch.distributed`` does not expose any other APIs. Currently,
-    ``torch.distributed`` is available on Linux, MacOS and Windows. Set
-    ``USE_DISTRIBUTED=1`` to enable it when building PyTorch from source.
-    Currently, the default value is ``USE_DISTRIBUTED=1`` for Linux and Windows,
-    ``USE_DISTRIBUTED=0`` for MacOS.
+    Always returns ``True``. Note that even if distributed is available,
+    there may not necessarily be any usable backends.
     """
-    return hasattr(torch._C, "_c10d_init")
+    return True


 if is_available() and not torch._C._c10d_init():
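Because is_available() is now unconditionally True, code that used it as a proxy for "some backend exists" needs a finer-grained check. A hedged sketch of such a guard (the helper name is made up for illustration):

    import torch.distributed as dist

    def has_usable_backend() -> bool:
        # is_available() only says the distributed module was built;
        # backend availability must now be queried separately.
        return dist.is_available() and (
            dist.is_gloo_available()
            or dist.is_nccl_available()
            or dist.is_mpi_available()
        )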
@@ -5,10 +5,6 @@ from typing import Union

 import torch
 import torch.distributed as dist

-# The two imports below are not always available depending on the
-# USE_DISTRIBUTED compile flag. Make sure they raise import error
-# if we're trying to use them.
 from torch.distributed import group, ProcessGroup

@@ -2,10 +2,6 @@
 import torch
 import torch.distributed as dist
 from torch.autograd import Function

-# The two imports below are not always available depending on the
-# USE_DISTRIBUTED compile flag. Make sure they raise import error
-# if we're trying to use them.
 from torch.distributed import group, ReduceOp
