From 2e739f822b49e651ee8de2550b2a69b09d36e021 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 26 Mar 2020 17:43:45 -0700 Subject: [PATCH] Fix PyTorch separate compilation (#34863) Summary: Looks like there is a bug in CUDA device linker, as kernels that use `thrust::sort_by_key` cannot be linked with other kernels. Solve the problem by splitting 5 thrust-heavy .cu files into a `__torch_cuda_sp` library which is statically linked into `torch_cuda`. For default compilation workflow it should not make any difference. Test Plan: Compile with `-DCUDA_SEPARABLE_COMPILATION=YES` and observe library size difference: 310Mb before, 173Mb after if compiled for sm_75 Pull Request resolved: https://github.com/pytorch/pytorch/pull/34863 Differential Revision: D20683972 Pulled By: malfet fbshipit-source-id: bc1492aa9d1d2d21c48e8764a8a7b403feaec5da --- aten/CMakeLists.txt | 2 ++ aten/src/ATen/CMakeLists.txt | 4 ++++ aten/src/THC/CMakeLists.txt | 5 ++++- caffe2/CMakeLists.txt | 15 ++++++++++++++- caffe2/operators/CMakeLists.txt | 6 ++++++ 5 files changed, 30 insertions(+), 2 deletions(-) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index c25a2570d1bd..b2b8fa983bd8 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -22,6 +22,7 @@ set(ATen_CPU_TEST_SRCS) set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) set(ATen_CUDA_SRCS) +set(ATen_CUDA_SRCS_W_SORT_BY_KEY) set(ATen_CUDA_TEST_SRCS) set(ATen_CUDA_INCLUDE) set(ATen_NVRTC_STUB_SRCS) @@ -104,6 +105,7 @@ add_subdirectory(src/ATen) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 
3d45042bffb2..aed61ed6cfb8 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -67,7 +67,9 @@ FILE(GLOB native_h "native/*.h") FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h") FILE(GLOB native_cpu_h "native/cpu/*.h") +FILE(GLOB native_cuda_cu_sp "native/cuda/Unique.cu" "native/cuda/TensorFactories.cu") FILE(GLOB native_cuda_cu "native/cuda/*.cu") +exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp}) FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp") FILE(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") @@ -103,6 +105,7 @@ endif() IF(USE_CUDA) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu} ${native_quantized_cuda_cu}) + set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${native_cuda_cu_sp}) set(all_cuda_cpp ${native_sparse_cuda_cpp} ${native_quantized_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) SET(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${all_cuda_cpp}) IF(CAFFE2_USE_CUDNN) @@ -419,6 +422,7 @@ endif() set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 3b52ede6fc52..052a0b1e690f 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -59,10 +59,13 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu - 
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu ${extra_src} PARENT_SCOPE) +set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} + ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu + PARENT_SCOPE) + INSTALL(FILES THC.h ${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index c105f4abdc30..88d1d717f157 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -32,6 +32,7 @@ if(INTERN_BUILD_ATEN_OPS) # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) + list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) @@ -702,7 +703,19 @@ if(USE_ROCM) endif() elseif(USE_CUDA) set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) - cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS}) + if(CUDA_SEPARABLE_COMPILATION) + # Separate compilation fails when kernels using `thrust::sort_by_key` + # are linked with the rest of CUDA code. 
Workaround by linking them separately + set(_generated_name "torch_cuda_w_sort_by_key_intermediate_link${CMAKE_C_OUTPUT_EXTENSION}") + set(torch_cuda_w_sort_by_key_link_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/torch_cuda.dir/${CMAKE_CFG_INTDIR}/${_generated_name}") + cuda_wrap_srcs(torch_cuda OBJ Caffe2_GPU_W_SORT_BY_KEY_OBJ ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${torch_cuda_w_sort_by_key_link_file}" torch_cpu "${_options}" "${torch_cuda_SEPARABLE_COMPILATION_OBJECTS}") + set( torch_cuda_SEPARABLE_COMPILATION_OBJECTS ) + # Pass compiled sort-by-key object + device-linked fatbin as extra dependencies of torch_cuda + cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${torch_cuda_w_sort_by_key_link_file} ${Caffe2_GPU_W_SORT_BY_KEY_OBJ}) + else() + cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + endif() set(CUDA_LINK_LIBRARIES_KEYWORD) torch_compile_options(torch_cuda) # see cmake/public/utils.cmake if(USE_NCCL) diff --git a/caffe2/operators/CMakeLists.txt b/caffe2/operators/CMakeLists.txt index 8f37176ce735..ad8b0f48e269 100644 --- a/caffe2/operators/CMakeLists.txt +++ b/caffe2/operators/CMakeLists.txt @@ -18,7 +18,12 @@ file(GLOB tmp *.cu) # TODO: when we move to explicit file list, this would not be needed. file(GLOB tmp_cudnn *_cudnn.cu) exclude(tmp "${tmp}" ${tmp_cudnn}) +# Exclude top_k.cu and unique_ops.cu for separate compilation +file(GLOB tmp_w_sort_by_key "top_k.cu" "unique_ops.cu") +exclude(tmp "${tmp}" ${tmp_w_sort_by_key}) + set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) +set(Caffe2_GPU_SRCS_W_SORT_BY_KEY ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} ${tmp_w_sort_by_key}) # exclude test files file(GLOB tmp *_test.cc) exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) @@ -94,6 +99,7 @@ exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS} $ # ---[ Send the lists to the parent scope. 
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) +set(Caffe2_GPU_SRCS_W_SORT_BY_KEY ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)