[symm_mem] Move all symm mem code into a dedicated folder (#155573)

We have reached a point where many files are related to symmetric memory, and they are scattered around on the C++ side. As a first step, let's put all symmetric-memory-related code into a separate folder. We can do further refactoring later if needed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155573
Approved by: https://github.com/fegin, https://github.com/d4l3k
This commit is contained in:
fduwjj
2025-06-10 12:39:34 -07:00
committed by PyTorch MergeBot
parent 3e131f7779
commit ffc6cbfaf7
23 changed files with 63 additions and 66 deletions

View File

@ -582,9 +582,9 @@ cc_library(
cu_library( cu_library(
name = "torch_cuda", name = "torch_cuda",
srcs = [ srcs = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/NanCheck.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
], ],
copts = torch_cuda_half_options, copts = torch_cuda_half_options,
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
@ -745,15 +745,15 @@ cc_library(
srcs = if_cuda(glob( srcs = if_cuda(glob(
libtorch_cuda_sources, libtorch_cuda_sources,
exclude = [ exclude = [
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp", "torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu", "torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
"torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/NanCheck.cu",
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
], ],
)) + torch_sources, )) + torch_sources,
copts = TORCH_COPTS, copts = TORCH_COPTS,

View File

@ -493,12 +493,10 @@ libtorch_core_sources = sorted(
# These files are the only ones that are supported on Windows. # These files are the only ones that are supported on Windows.
libtorch_distributed_base_sources = [ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/Backend.cpp",
"torch/csrc/distributed/c10d/Backoff.cpp", "torch/csrc/distributed/c10d/Backoff.cpp",
"torch/csrc/distributed/c10d/DMAConnectivity.cpp", "torch/csrc/distributed/c10d/Backend.cpp",
"torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
"torch/csrc/distributed/c10d/FlightRecorder.cpp",
"torch/csrc/distributed/c10d/FileStore.cpp", "torch/csrc/distributed/c10d/FileStore.cpp",
"torch/csrc/distributed/c10d/FlightRecorder.cpp",
"torch/csrc/distributed/c10d/Functional.cpp", "torch/csrc/distributed/c10d/Functional.cpp",
"torch/csrc/distributed/c10d/GlooDeviceFactory.cpp", "torch/csrc/distributed/c10d/GlooDeviceFactory.cpp",
"torch/csrc/distributed/c10d/GroupRegistry.cpp", "torch/csrc/distributed/c10d/GroupRegistry.cpp",
@ -510,12 +508,15 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/ProcessGroupMPI.cpp", "torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp", "torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
"torch/csrc/distributed/c10d/Store.cpp", "torch/csrc/distributed/c10d/Store.cpp",
"torch/csrc/distributed/c10d/SymmetricMemory.cpp",
"torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/TCPStore.cpp",
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
"torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/Utils.cpp",
"torch/csrc/distributed/c10d/Work.cpp",
"torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/comm.cpp",
"torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
"torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
"torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
"torch/csrc/distributed/c10d/debug.cpp", "torch/csrc/distributed/c10d/debug.cpp",
"torch/csrc/distributed/c10d/default_comm_hooks.cpp", "torch/csrc/distributed/c10d/default_comm_hooks.cpp",
"torch/csrc/distributed/c10d/logger.cpp", "torch/csrc/distributed/c10d/logger.cpp",
@ -524,9 +525,8 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/distributed/c10d/reducer.cpp",
"torch/csrc/distributed/c10d/sequence_num.cpp", "torch/csrc/distributed/c10d/sequence_num.cpp",
"torch/csrc/distributed/c10d/socket.cpp", "torch/csrc/distributed/c10d/socket.cpp",
"torch/csrc/distributed/c10d/Work.cpp", "torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp",
"torch/csrc/distributed/c10d/control_plane/Handlers.cpp", "torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp",
"torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
] ]
# These files are only supported on Linux (and others) but not on Windows. # These files are only supported on Linux (and others) but not on Windows.
@ -699,24 +699,24 @@ libtorch_cuda_distributed_base_sources = [
# These files are only supported on Linux (and others) but not on Windows. # These files are only supported on Linux (and others) but not on Windows.
libtorch_cuda_distributed_extra_sources = [ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp",
"torch/csrc/distributed/c10d/NCCLUtils.cpp",
"torch/csrc/distributed/c10d/FlightRecorderCuda.cpp", "torch/csrc/distributed/c10d/FlightRecorderCuda.cpp",
"torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp", "torch/csrc/distributed/c10d/NCCLUtils.cpp",
"torch/csrc/distributed/c10d/NanCheck.cu",
"torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp", "torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp",
"torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
"torch/csrc/distributed/c10d/ProcessGroupUCC.cpp", "torch/csrc/distributed/c10d/ProcessGroupUCC.cpp",
"torch/csrc/distributed/c10d/UCCTracing.cpp", "torch/csrc/distributed/c10d/UCCTracing.cpp",
"torch/csrc/distributed/c10d/UCCUtils.cpp", "torch/csrc/distributed/c10d/UCCUtils.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu", "torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
"torch/csrc/distributed/c10d/cuda/utils.cpp", "torch/csrc/distributed/c10d/cuda/utils.cpp",
"torch/csrc/distributed/c10d/NanCheck.cu",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp",
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
] ]
libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources

View File

@ -572,13 +572,13 @@ if(USE_CUDA)
if(NOT WIN32) if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS) append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties( set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1" PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
) )
endif() endif()
@ -1004,10 +1004,10 @@ elseif(USE_CUDA)
# which is not viable for libtorch_cuda. So we isolate the linking of # which is not viable for libtorch_cuda. So we isolate the linking of
# nvshmem in nvshmem_extension. # nvshmem in nvshmem_extension.
add_library(nvshmem_extension SHARED add_library(nvshmem_extension SHARED
"${TORCH_SRC_DIR}/csrc/distributed/c10d/nvshmem_extension.cu"
"${TORCH_SRC_DIR}/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu"
"${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp"
"${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp" "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp"
"${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu"
"${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu"
"${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp"
) )
set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>) target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)

View File

@ -23,8 +23,8 @@
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp> #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
#include <torch/csrc/distributed/c10d/PrefixStore.hpp> #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp> #include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
#include <torch/csrc/distributed/c10d/logger.hpp> #include <torch/csrc/distributed/c10d/logger.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
#include <ATen/DynamicLibrary.h> #include <ATen/DynamicLibrary.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>

View File

@ -32,7 +32,7 @@
#ifdef USE_C10D_NCCL #ifdef USE_C10D_NCCL
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp> #include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp> #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp> #include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
#endif #endif
#ifdef USE_C10D_MPI #ifdef USE_C10D_MPI
@ -45,9 +45,9 @@
#include <fmt/format.h> #include <fmt/format.h>
#include <pybind11/chrono.h> #include <pybind11/chrono.h>
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
#include <torch/csrc/distributed/c10d/PrefixStore.hpp> #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp> #include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
#include <torch/csrc/distributed/c10d/comm.hpp> #include <torch/csrc/distributed/c10d/comm.hpp>
#include <torch/csrc/distributed/c10d/debug.h> #include <torch/csrc/distributed/c10d/debug.h>

View File

@ -1,6 +1,6 @@
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
#include <torch/csrc/distributed/c10d/cuda/utils.hpp> #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
#include <ATen/ceil_div.h> #include <ATen/ceil_div.h>

View File

@ -1,9 +1,9 @@
#pragma once #pragma once
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp> #include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
namespace c10d::symmetric_memory { namespace c10d::symmetric_memory {

View File

@ -15,10 +15,9 @@
#include <ATen/ops/empty_like.h> #include <ATen/ops/empty_like.h>
#endif #endif
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
#include <torch/csrc/distributed/c10d/cuda/AsyncMM.cuh> #include <torch/csrc/distributed/c10d/cuda/AsyncMM.cuh>
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
#if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030) #if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030)

View File

@ -12,9 +12,9 @@
#include <hip/hip_runtime_api.h> #include <hip/hip_runtime_api.h>
#endif #endif
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp> #include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/cuda/utils.hpp> #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
namespace c10d::symmetric_memory { namespace c10d::symmetric_memory {

View File

@ -1,8 +1,8 @@
#pragma once #pragma once
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp> #include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
namespace c10d { namespace c10d {
namespace symmetric_memory { namespace symmetric_memory {

View File

@ -1,5 +1,5 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp> #include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
#include <c10/cuda/CUDAException.h> #include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h> #include <c10/cuda/driver_api.h>

View File

@ -1,4 +1,4 @@
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp> #include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
#include <utility> #include <utility>
namespace { namespace {

View File

@ -1,8 +1,8 @@
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
#include <torch/csrc/distributed/c10d/cuda/utils.hpp> #include <torch/csrc/distributed/c10d/cuda/utils.hpp>
#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh> #include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
#include <ATen/ceil_div.h> #include <ATen/ceil_div.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>

View File

@ -1,4 +1,4 @@
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp> #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
namespace { namespace {

View File

@ -1,7 +1,6 @@
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
#include <torch/csrc/distributed/c10d/Utils.hpp> #include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
#if defined(USE_ROCM) #if defined(USE_ROCM)
#include <rocm_smi/rocm_smi.h> #include <rocm_smi/rocm_smi.h>

View File

@ -1,6 +1,6 @@
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp> #include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
namespace c10d { namespace c10d {
namespace intra_node_comm { namespace intra_node_comm {

View File

@ -3,8 +3,8 @@
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <c10/cuda/CUDAStream.h> #include <c10/cuda/CUDAStream.h>
#include <torch/csrc/distributed/c10d/Store.hpp> #include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp> #include <torch/csrc/distributed/c10d/Work.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
namespace c10d::intra_node_comm { namespace c10d::intra_node_comm {

View File

@ -1,10 +1,9 @@
#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h> #include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp> #include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
#include <cuda_awbarrier_primitives.h> #include <cuda_awbarrier_primitives.h>
// Use torch's cub wrapper instead of CUDA's <cub/cub.cuh>, see #55292 // Use torch's cub wrapper instead of CUDA's <cub/cub.cuh>, see #55292