[symm_mem] Move all symm mem code into a dedicated folder (#155573)
We have reached a point where many files relate to symmetric memory and they are scattered around on the C++ side. As a first step, move all symmetric-memory-related code into a dedicated folder (torch/csrc/distributed/c10d/symm_mem/); further refactoring can follow later if needed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155573
Approved by: https://github.com/fegin, https://github.com/d4l3k
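For code that includes these headers directly, the change is purely a path update; a minimal C++ sketch for illustration, using paths taken from the hunks below (any downstream file doing this is hypothetical):

// Before #155573: symmetric-memory headers lived directly under c10d/.
// #include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
// #include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
// #include <torch/csrc/distributed/c10d/intra_node_comm.hpp>

// After #155573: the same headers live under c10d/symm_mem/.
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>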
BUILD.bazel
@@ -582,9 +582,9 @@ cc_library(
 cu_library(
     name = "torch_cuda",
     srcs = [
-        "torch/csrc/distributed/c10d/intra_node_comm.cu",
         "torch/csrc/distributed/c10d/NanCheck.cu",
         "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
+        "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
     ],
     copts = torch_cuda_half_options,
     visibility = ["//visibility:public"],
@@ -745,15 +745,15 @@ cc_library(
     srcs = if_cuda(glob(
         libtorch_cuda_sources,
         exclude = [
-            "torch/csrc/cuda/python_nccl.cpp",
             "torch/csrc/cuda/nccl.cpp",
-            "torch/csrc/distributed/c10d/intra_node_comm.cu",
-            "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
-            "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
-            "torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
-            "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
+            "torch/csrc/cuda/python_nccl.cpp",
             "torch/csrc/distributed/c10d/NanCheck.cu",
+            "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
             "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
+            "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
+            "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
+            "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
+            "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
         ],
     )) + torch_sources,
     copts = TORCH_COPTS,
build_variables.bzl
@@ -493,12 +493,10 @@ libtorch_core_sources = sorted(

 # These files are the only ones that are supported on Windows.
 libtorch_distributed_base_sources = [
+    "torch/csrc/distributed/c10d/Backend.cpp",
     "torch/csrc/distributed/c10d/Backoff.cpp",
-    "torch/csrc/distributed/c10d/DMAConnectivity.cpp",
-    "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
-    "torch/csrc/distributed/c10d/FlightRecorder.cpp",
-    "torch/csrc/distributed/c10d/Backend.cpp",
     "torch/csrc/distributed/c10d/FileStore.cpp",
+    "torch/csrc/distributed/c10d/FlightRecorder.cpp",
     "torch/csrc/distributed/c10d/Functional.cpp",
     "torch/csrc/distributed/c10d/GlooDeviceFactory.cpp",
     "torch/csrc/distributed/c10d/GroupRegistry.cpp",
@@ -510,12 +508,15 @@ libtorch_distributed_base_sources = [
     "torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
     "torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
     "torch/csrc/distributed/c10d/Store.cpp",
-    "torch/csrc/distributed/c10d/SymmetricMemory.cpp",
     "torch/csrc/distributed/c10d/TCPStore.cpp",
     "torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
     "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
     "torch/csrc/distributed/c10d/Utils.cpp",
+    "torch/csrc/distributed/c10d/Work.cpp",
     "torch/csrc/distributed/c10d/comm.cpp",
+    "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
+    "torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
+    "torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
     "torch/csrc/distributed/c10d/debug.cpp",
     "torch/csrc/distributed/c10d/default_comm_hooks.cpp",
     "torch/csrc/distributed/c10d/logger.cpp",
@@ -524,9 +525,8 @@ libtorch_distributed_base_sources = [
     "torch/csrc/distributed/c10d/reducer.cpp",
     "torch/csrc/distributed/c10d/sequence_num.cpp",
     "torch/csrc/distributed/c10d/socket.cpp",
-    "torch/csrc/distributed/c10d/Work.cpp",
-    "torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
-    "torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp",
 ]

 # These files are only supported on Linux (and others) but not on Windows.
@@ -699,24 +699,24 @@ libtorch_cuda_distributed_base_sources = [

 # These files are only supported on Linux (and others) but not on Windows.
 libtorch_cuda_distributed_extra_sources = [
-    "torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp",
-    "torch/csrc/distributed/c10d/NCCLUtils.cpp",
     "torch/csrc/distributed/c10d/FlightRecorderCuda.cpp",
-    "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
+    "torch/csrc/distributed/c10d/NCCLUtils.cpp",
+    "torch/csrc/distributed/c10d/NanCheck.cu",
     "torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp",
+    "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
     "torch/csrc/distributed/c10d/ProcessGroupUCC.cpp",
     "torch/csrc/distributed/c10d/UCCTracing.cpp",
     "torch/csrc/distributed/c10d/UCCUtils.cpp",
-    "torch/csrc/distributed/c10d/intra_node_comm.cpp",
-    "torch/csrc/distributed/c10d/intra_node_comm.cu",
-    "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
-    "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
-    "torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
     "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
     "torch/csrc/distributed/c10d/cuda/utils.cpp",
-    "torch/csrc/distributed/c10d/NanCheck.cu",
-    "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
     "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
+    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
+    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
+    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
+    "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
 ]

 libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
caffe2/CMakeLists.txt
@@ -572,13 +572,13 @@ if(USE_CUDA)
     if(NOT WIN32)
       append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
       set_source_files_properties(
-        ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
-        ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
         ${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp
-        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
-        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu
-        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp
         ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
         PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
       )
     endif()
@@ -1004,10 +1004,10 @@ elseif(USE_CUDA)
     # which is not viable for libtorch_cuda. So we isolate the linking of
     # nvshmem in nvshmem_extension.
     add_library(nvshmem_extension SHARED
-      "${TORCH_SRC_DIR}/csrc/distributed/c10d/nvshmem_extension.cu"
-      "${TORCH_SRC_DIR}/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu"
-      "${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp"
       "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp"
+      "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu"
+      "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu"
+      "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp"
     )
     set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
     target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)
@@ -23,8 +23,8 @@
 #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
 #include <torch/csrc/distributed/c10d/logger.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>

 #include <ATen/DynamicLibrary.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -32,7 +32,7 @@
 #ifdef USE_C10D_NCCL
 #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
-#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
 #endif

 #ifdef USE_C10D_MPI
@@ -45,9 +45,9 @@

 #include <fmt/format.h>
 #include <pybind11/chrono.h>
-#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 #include <torch/csrc/distributed/c10d/comm.hpp>
 #include <torch/csrc/distributed/c10d/debug.h>
@@ -1,6 +1,6 @@
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/cuda/utils.hpp>

 #include <ATen/ceil_div.h>
@@ -1,9 +1,9 @@
 #pragma once

 #include <ATen/ATen.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 namespace c10d::symmetric_memory {
@@ -15,10 +15,9 @@
 #include <ATen/ops/empty_like.h>
 #endif

-
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/cuda/AsyncMM.cuh>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>

 #if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030)
@@ -12,9 +12,9 @@
 #include <hip/hip_runtime_api.h>
 #endif

-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>

 namespace c10d::symmetric_memory {
@@ -1,8 +1,8 @@
 #pragma once

-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 namespace c10d {
 namespace symmetric_memory {
@@ -1,5 +1,5 @@
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>

 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/driver_api.h>
@@ -1,4 +1,4 @@
-#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
 #include <utility>

 namespace {
@@ -1,8 +1,8 @@
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
-#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
+#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 #include <ATen/ceil_div.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -1,4 +1,4 @@
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 namespace {
@@ -1,7 +1,6 @@
-#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
-
-#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
 #include <torch/csrc/distributed/c10d/Utils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>

 #if defined(USE_ROCM)
 #include <rocm_smi/rocm_smi.h>
@@ -1,6 +1,6 @@
-#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>

-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>

 namespace c10d {
 namespace intra_node_comm {
@@ -3,8 +3,8 @@
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 namespace c10d::intra_node_comm {
@@ -1,10 +1,9 @@
-#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
-
 #include <c10/cuda/CUDAGuard.h>

-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 #include <cuda_awbarrier_primitives.h>
 // Use torch's cub wrapper instead of CUDA's <cub/cub.cuh>, see #55292