mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[symm_mem] Move all symm mem code into a dedicated folder (#155573)
We have reached a point where many files are related to symmetric memory, and they are scattered around the C++ side. As a first step, let's put all symmetric-memory-related code into a separate folder. We can do further refactoring later if needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155573 Approved by: https://github.com/fegin, https://github.com/d4l3k
This commit is contained in:
14
BUILD.bazel
14
BUILD.bazel
@ -582,9 +582,9 @@ cc_library(
|
|||||||
cu_library(
|
cu_library(
|
||||||
name = "torch_cuda",
|
name = "torch_cuda",
|
||||||
srcs = [
|
srcs = [
|
||||||
"torch/csrc/distributed/c10d/intra_node_comm.cu",
|
|
||||||
"torch/csrc/distributed/c10d/NanCheck.cu",
|
"torch/csrc/distributed/c10d/NanCheck.cu",
|
||||||
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
|
||||||
],
|
],
|
||||||
copts = torch_cuda_half_options,
|
copts = torch_cuda_half_options,
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
@ -745,15 +745,15 @@ cc_library(
|
|||||||
srcs = if_cuda(glob(
|
srcs = if_cuda(glob(
|
||||||
libtorch_cuda_sources,
|
libtorch_cuda_sources,
|
||||||
exclude = [
|
exclude = [
|
||||||
"torch/csrc/cuda/python_nccl.cpp",
|
|
||||||
"torch/csrc/cuda/nccl.cpp",
|
"torch/csrc/cuda/nccl.cpp",
|
||||||
"torch/csrc/distributed/c10d/intra_node_comm.cu",
|
"torch/csrc/cuda/python_nccl.cpp",
|
||||||
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
|
|
||||||
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
|
|
||||||
"torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
|
|
||||||
"torch/csrc/distributed/c10d/NanCheck.cu",
|
"torch/csrc/distributed/c10d/NanCheck.cu",
|
||||||
|
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
|
||||||
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
|
||||||
],
|
],
|
||||||
)) + torch_sources,
|
)) + torch_sources,
|
||||||
copts = TORCH_COPTS,
|
copts = TORCH_COPTS,
|
||||||
|
@ -493,12 +493,10 @@ libtorch_core_sources = sorted(
|
|||||||
|
|
||||||
# These files are the only ones that are supported on Windows.
|
# These files are the only ones that are supported on Windows.
|
||||||
libtorch_distributed_base_sources = [
|
libtorch_distributed_base_sources = [
|
||||||
"torch/csrc/distributed/c10d/Backend.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/Backoff.cpp",
|
"torch/csrc/distributed/c10d/Backoff.cpp",
|
||||||
"torch/csrc/distributed/c10d/DMAConnectivity.cpp",
|
"torch/csrc/distributed/c10d/Backend.cpp",
|
||||||
"torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/FlightRecorder.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/FileStore.cpp",
|
"torch/csrc/distributed/c10d/FileStore.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/FlightRecorder.cpp",
|
||||||
"torch/csrc/distributed/c10d/Functional.cpp",
|
"torch/csrc/distributed/c10d/Functional.cpp",
|
||||||
"torch/csrc/distributed/c10d/GlooDeviceFactory.cpp",
|
"torch/csrc/distributed/c10d/GlooDeviceFactory.cpp",
|
||||||
"torch/csrc/distributed/c10d/GroupRegistry.cpp",
|
"torch/csrc/distributed/c10d/GroupRegistry.cpp",
|
||||||
@ -510,12 +508,15 @@ libtorch_distributed_base_sources = [
|
|||||||
"torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
|
"torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
|
||||||
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
|
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
|
||||||
"torch/csrc/distributed/c10d/Store.cpp",
|
"torch/csrc/distributed/c10d/Store.cpp",
|
||||||
"torch/csrc/distributed/c10d/SymmetricMemory.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/TCPStore.cpp",
|
"torch/csrc/distributed/c10d/TCPStore.cpp",
|
||||||
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
|
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
|
||||||
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
|
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
|
||||||
"torch/csrc/distributed/c10d/Utils.cpp",
|
"torch/csrc/distributed/c10d/Utils.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/Work.cpp",
|
||||||
"torch/csrc/distributed/c10d/comm.cpp",
|
"torch/csrc/distributed/c10d/comm.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
|
||||||
"torch/csrc/distributed/c10d/debug.cpp",
|
"torch/csrc/distributed/c10d/debug.cpp",
|
||||||
"torch/csrc/distributed/c10d/default_comm_hooks.cpp",
|
"torch/csrc/distributed/c10d/default_comm_hooks.cpp",
|
||||||
"torch/csrc/distributed/c10d/logger.cpp",
|
"torch/csrc/distributed/c10d/logger.cpp",
|
||||||
@ -524,9 +525,8 @@ libtorch_distributed_base_sources = [
|
|||||||
"torch/csrc/distributed/c10d/reducer.cpp",
|
"torch/csrc/distributed/c10d/reducer.cpp",
|
||||||
"torch/csrc/distributed/c10d/sequence_num.cpp",
|
"torch/csrc/distributed/c10d/sequence_num.cpp",
|
||||||
"torch/csrc/distributed/c10d/socket.cpp",
|
"torch/csrc/distributed/c10d/socket.cpp",
|
||||||
"torch/csrc/distributed/c10d/Work.cpp",
|
"torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp",
|
||||||
"torch/csrc/distributed/c10d/control_plane/Handlers.cpp",
|
"torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp",
|
||||||
"torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# These files are only supported on Linux (and others) but not on Windows.
|
# These files are only supported on Linux (and others) but not on Windows.
|
||||||
@ -699,24 +699,24 @@ libtorch_cuda_distributed_base_sources = [
|
|||||||
|
|
||||||
# These files are only supported on Linux (and others) but not on Windows.
|
# These files are only supported on Linux (and others) but not on Windows.
|
||||||
libtorch_cuda_distributed_extra_sources = [
|
libtorch_cuda_distributed_extra_sources = [
|
||||||
"torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/NCCLUtils.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/FlightRecorderCuda.cpp",
|
"torch/csrc/distributed/c10d/FlightRecorderCuda.cpp",
|
||||||
"torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
|
"torch/csrc/distributed/c10d/NCCLUtils.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/NanCheck.cu",
|
||||||
"torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp",
|
"torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
|
||||||
"torch/csrc/distributed/c10d/ProcessGroupUCC.cpp",
|
"torch/csrc/distributed/c10d/ProcessGroupUCC.cpp",
|
||||||
"torch/csrc/distributed/c10d/UCCTracing.cpp",
|
"torch/csrc/distributed/c10d/UCCTracing.cpp",
|
||||||
"torch/csrc/distributed/c10d/UCCUtils.cpp",
|
"torch/csrc/distributed/c10d/UCCUtils.cpp",
|
||||||
"torch/csrc/distributed/c10d/intra_node_comm.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/intra_node_comm.cu",
|
|
||||||
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
|
|
||||||
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
|
|
||||||
"torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
|
"torch/csrc/distributed/c10d/cuda/AsyncMM.cu",
|
||||||
"torch/csrc/distributed/c10d/cuda/utils.cpp",
|
"torch/csrc/distributed/c10d/cuda/utils.cpp",
|
||||||
"torch/csrc/distributed/c10d/NanCheck.cu",
|
|
||||||
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
|
|
||||||
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu",
|
||||||
|
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
|
||||||
]
|
]
|
||||||
|
|
||||||
libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
|
libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
|
||||||
|
@ -572,13 +572,13 @@ if(USE_CUDA)
|
|||||||
if(NOT WIN32)
|
if(NOT WIN32)
|
||||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||||
set_source_files_properties(
|
set_source_files_properties(
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
|
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu
|
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp
|
|
||||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||||
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||||
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||||
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||||
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||||
|
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
@ -1004,10 +1004,10 @@ elseif(USE_CUDA)
|
|||||||
# which is not viable for libtorch_cuda. So we isolate the linking of
|
# which is not viable for libtorch_cuda. So we isolate the linking of
|
||||||
# nvshmem in nvshmem_extension.
|
# nvshmem in nvshmem_extension.
|
||||||
add_library(nvshmem_extension SHARED
|
add_library(nvshmem_extension SHARED
|
||||||
"${TORCH_SRC_DIR}/csrc/distributed/c10d/nvshmem_extension.cu"
|
|
||||||
"${TORCH_SRC_DIR}/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu"
|
|
||||||
"${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp"
|
|
||||||
"${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp"
|
"${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp"
|
||||||
|
"${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu"
|
||||||
|
"${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu"
|
||||||
|
"${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp"
|
||||||
)
|
)
|
||||||
set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||||
target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)
|
target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)
|
||||||
|
@ -23,8 +23,8 @@
|
|||||||
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
|
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/Store.hpp>
|
#include <torch/csrc/distributed/c10d/Store.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/logger.hpp>
|
#include <torch/csrc/distributed/c10d/logger.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
|
||||||
|
|
||||||
#include <ATen/DynamicLibrary.h>
|
#include <ATen/DynamicLibrary.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
@ -32,7 +32,7 @@
|
|||||||
#ifdef USE_C10D_NCCL
|
#ifdef USE_C10D_NCCL
|
||||||
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
|
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef USE_C10D_MPI
|
#ifdef USE_C10D_MPI
|
||||||
@ -45,9 +45,9 @@
|
|||||||
|
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
#include <pybind11/chrono.h>
|
#include <pybind11/chrono.h>
|
||||||
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
|
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/comm.hpp>
|
#include <torch/csrc/distributed/c10d/comm.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/debug.h>
|
#include <torch/csrc/distributed/c10d/debug.h>
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
|
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
|
||||||
|
|
||||||
#include <ATen/ceil_div.h>
|
#include <ATen/ceil_div.h>
|
@ -1,9 +1,9 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/Store.hpp>
|
#include <torch/csrc/distributed/c10d/Store.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
namespace c10d::symmetric_memory {
|
namespace c10d::symmetric_memory {
|
||||||
|
|
@ -15,10 +15,9 @@
|
|||||||
#include <ATen/ops/empty_like.h>
|
#include <ATen/ops/empty_like.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/cuda/AsyncMM.cuh>
|
#include <torch/csrc/distributed/c10d/cuda/AsyncMM.cuh>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp>
|
||||||
|
|
||||||
#if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030)
|
#if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030)
|
||||||
|
|
@ -12,9 +12,9 @@
|
|||||||
#include <hip/hip_runtime_api.h>
|
#include <hip/hip_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/Store.hpp>
|
#include <torch/csrc/distributed/c10d/Store.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
|
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
|
||||||
|
|
||||||
namespace c10d::symmetric_memory {
|
namespace c10d::symmetric_memory {
|
||||||
|
|
@ -1,8 +1,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/Store.hpp>
|
#include <torch/csrc/distributed/c10d/Store.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
namespace c10d {
|
namespace c10d {
|
||||||
namespace symmetric_memory {
|
namespace symmetric_memory {
|
@ -1,5 +1,5 @@
|
|||||||
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||||
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
|
||||||
|
|
||||||
#include <c10/cuda/CUDAException.h>
|
#include <c10/cuda/CUDAException.h>
|
||||||
#include <c10/cuda/driver_api.h>
|
#include <c10/cuda/driver_api.h>
|
@ -1,4 +1,4 @@
|
|||||||
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
@ -1,8 +1,8 @@
|
|||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
|
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
|
#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
#include <ATen/ceil_div.h>
|
#include <ATen/ceil_div.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
@ -1,4 +1,4 @@
|
|||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
@ -1,7 +1,6 @@
|
|||||||
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
|
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/DMAConnectivity.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/Utils.hpp>
|
#include <torch/csrc/distributed/c10d/Utils.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
|
||||||
|
|
||||||
#if defined(USE_ROCM)
|
#if defined(USE_ROCM)
|
||||||
#include <rocm_smi/rocm_smi.h>
|
#include <rocm_smi/rocm_smi.h>
|
@ -1,6 +1,6 @@
|
|||||||
#include <torch/csrc/distributed/c10d/intra_node_comm.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp>
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
|
||||||
|
|
||||||
namespace c10d {
|
namespace c10d {
|
||||||
namespace intra_node_comm {
|
namespace intra_node_comm {
|
@ -3,8 +3,8 @@
|
|||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <c10/cuda/CUDAStream.h>
|
#include <c10/cuda/CUDAStream.h>
|
||||||
#include <torch/csrc/distributed/c10d/Store.hpp>
|
#include <torch/csrc/distributed/c10d/Store.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
|
||||||
#include <torch/csrc/distributed/c10d/Work.hpp>
|
#include <torch/csrc/distributed/c10d/Work.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
namespace c10d::intra_node_comm {
|
namespace c10d::intra_node_comm {
|
||||||
|
|
@ -1,10 +1,9 @@
|
|||||||
#include <torch/csrc/distributed/c10d/nvshmem_extension.cuh>
|
|
||||||
|
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
|
#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
|
||||||
#include <torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h>
|
||||||
#include <torch/csrc/distributed/c10d/SymmetricMemory.hpp>
|
#include <torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
|
||||||
|
|
||||||
#include <cuda_awbarrier_primitives.h>
|
#include <cuda_awbarrier_primitives.h>
|
||||||
// Use torch's cub wrapper instead of CUDA's <cub/cub.cuh>, see #55292
|
// Use torch's cub wrapper instead of CUDA's <cub/cub.cuh>, see #55292
|
Reference in New Issue
Block a user