[symm_mem] Move all symm mem code into a dedicated folder (#155573)

We have reached a point where many files are related to symmetric memory, and they are scattered around on the C++ side. Let's first move all symmetric-memory-related code into a separate folder. We can do further refactoring later if needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155573 Approved by: https://github.com/fegin, https://github.com/d4l3k
2025-10-20 21:14:14 +08:00 · 2025-06-10 12:39:34 -07:00
parent 3e131f7779
commit ffc6cbfaf7
23 changed files with 63 additions and 66 deletions
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -582,9 +582,9 @@ cc_library(
 cu_library(
    name = "torch_cuda",
    srcs = [
        "torch/csrc/distributed/c10d/intra_node_comm.cu",
        "torch/csrc/distributed/c10d/NanCheck.cu",
        "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
        "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
    ],
    copts = torch_cuda_half_options,
    visibility = ["//visibility:public"],
@ -745,15 +745,15 @@ cc_library(
    srcs = if_cuda(glob(
        libtorch_cuda_sources,
        exclude = [
            "torch/csrc/cuda/python_nccl.cpp",
            "torch/csrc/cuda/nccl.cpp",
-            "torch/csrc/distributed/c10d/intra_node_comm.cu",
+            "torch/csrc/cuda/python_nccl.cpp",
            "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
            "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
            "torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
            "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
            "torch/csrc/distributed/c10d/NanCheck.cu",
            "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
            "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
            "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
            "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
            "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
            "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
        ],
    )) + torch_sources,
    copts = TORCH_COPTS,
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -493,12 +493,10 @@ libtorch_core_sources = sorted(
 # These files are the only ones that are supported on Windows.
 libtorch_distributed_base_sources = [
    "torch/csrc/distributed/c10d/Backend.cpp",
    "torch/csrc/distributed/c10d/Backoff.cpp",
-    "torch/csrc/distributed/c10d/DMAConnectivity.cpp",
+    "torch/csrc/distributed/c10d/Backend.cpp",
    "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
    "torch/csrc/distributed/c10d/FlightRecorder.cpp",
    "torch/csrc/distributed/c10d/FileStore.cpp",
    "torch/csrc/distributed/c10d/FlightRecorder.cpp",
    "torch/csrc/distributed/c10d/Functional.cpp",
    "torch/csrc/distributed/c10d/GlooDeviceFactory.cpp",
    "torch/csrc/distributed/c10d/GroupRegistry.cpp",
@ -510,12 +508,15 @@ libtorch_distributed_base_sources = [
    "torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
    "torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
    "torch/csrc/distributed/c10d/Store.cpp",
    "torch/csrc/distributed/c10d/SymmetricMemory.cpp",
    "torch/csrc/distributed/c10d/TCPStore.cpp",
    "torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
    "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
    "torch/csrc/distributed/c10d/Utils.cpp",
    "torch/csrc/distributed/c10d/Work.cpp",
    "torch/csrc/distributed/c10d/comm.cpp",
    "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
    "torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
    "torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
    "torch/csrc/distributed/c10d/debug.cpp",
    "torch/csrc/distributed/c10d/default_comm_hooks.cpp",
    "torch/csrc/distributed/c10d/logger.cpp",
@ -524,9 +525,8 @@ libtorch_distributed_base_sources = [
    "torch/csrc/distributed/c10d/reducer.cpp",
    "torch/csrc/distributed/c10d/sequence_num.cpp",
    "torch/csrc/distributed/c10d/socket.cpp",
-    "torch/csrc/distributed/c10d/Work.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp",
-    "torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
+    "torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp",
    "torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
 ]
 # These files are only supported on Linux (and others) but not on Windows.
@ -699,24 +699,24 @@ libtorch_cuda_distributed_base_sources = [
 # These files are only supported on Linux (and others) but not on Windows.
 libtorch_cuda_distributed_extra_sources = [
    "torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp",
    "torch/csrc/distributed/c10d/NCCLUtils.cpp",
    "torch/csrc/distributed/c10d/FlightRecorderCuda.cpp",
-    "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
+    "torch/csrc/distributed/c10d/NCCLUtils.cpp",
    "torch/csrc/distributed/c10d/NanCheck.cu",
    "torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp",
    "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
    "torch/csrc/distributed/c10d/ProcessGroupUCC.cpp",
    "torch/csrc/distributed/c10d/UCCTracing.cpp",
    "torch/csrc/distributed/c10d/UCCUtils.cpp",
    "torch/csrc/distributed/c10d/intra_node_comm.cpp",
    "torch/csrc/distributed/c10d/intra_node_comm.cu",
    "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
    "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
    "torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
    "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
    "torch/csrc/distributed/c10d/cuda/utils.cpp",
    "torch/csrc/distributed/c10d/NanCheck.cu",
    "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
    "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
    "torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp",
    "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp",
    "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
    "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
 ]
 libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -572,13 +572,13 @@ if(USE_CUDA)
    if(NOT WIN32)
      append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
      set_source_files_properties(
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
        PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
      )
    endif()
@ -1004,10 +1004,10 @@ elseif(USE_CUDA)
    # which is not viable for libtorch_cuda. So we isolate the linking of
    # nvshmem in nvshmem_extension.
    add_library(nvshmem_extension SHARED
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/nvshmem_extension.cu"
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu"
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp"
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp"
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu"
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu"
        "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp"
    )
    set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
    target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@ -23,8 +23,8 @@
 #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
 #include <torch/csrc/distributed/c10d/logger.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
 #include <ATen/DynamicLibrary.h>
 #include <ATen/cuda/CUDAContext.h>
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@ -32,7 +32,7 @@
 #ifdef USE_C10D_NCCL
 #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
-#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
 #endif
 #ifdef USE_C10D_MPI
@ -45,9 +45,9 @@
 #include <fmt/format.h>
 #include <pybind11/chrono.h>
 #include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/comm.hpp>
 #include <torch/csrc/distributed/c10d/debug.h>
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
@ -1,6 +1,6 @@
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
 #include <ATen/ceil_div.h>
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp
@ -1,9 +1,9 @@
 #pragma once
 #include <ATen/ATen.h>
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 namespace c10d::symmetric_memory {
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
@ -15,10 +15,9 @@
 #include <ATen/ops/empty_like.h>
 #endif
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/cuda/AsyncMM.cuh>
 #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
 #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
 #if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030)
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
@ -12,9 +12,9 @@
 #include <hip/hip_runtime_api.h>
 #endif
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
 namespace c10d::symmetric_memory {
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp
@ -1,8 +1,8 @@
 #pragma once
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
 #include <torch/csrc/distributed/c10d/Store.hpp>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 namespace c10d {
 namespace symmetric_memory {
--- a/torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
@ -1,5 +1,5 @@
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/driver_api.h>
--- a/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp
@ -1,4 +1,4 @@
-#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
 #include <utility>
 namespace {
--- a/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp
--- a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu
@ -1,8 +1,8 @@
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
-#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
+#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
 #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
 #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #include <ATen/ceil_div.h>
 #include <ATen/cuda/CUDAContext.h>
--- a/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp
@ -1,4 +1,4 @@
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 namespace {
--- a/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp
--- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
@ -1,7 +1,6 @@
 #include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
 #include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
 #include <torch/csrc/distributed/c10d/Utils.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
 #if defined(USE_ROCM)
 #include <rocm_smi/rocm_smi.h>
--- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu
@ -1,6 +1,6 @@
-#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
 namespace c10d {
 namespace intra_node_comm {
--- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp
+++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp
@ -3,8 +3,8 @@
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 namespace c10d::intra_node_comm {
--- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu
@ -1,10 +1,9 @@
 #include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
 #include <c10/cuda/CUDAGuard.h>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
+#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
-#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #include <cuda_awbarrier_primitives.h>
 // Use torch's cub wrapper instead of CUDA's <cub/cub.cuh>, see #55292
--- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh
+++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh
@ -1,4 +1,4 @@
-#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>

 namespace {