Fix PyTorch separate compilation (#34863)

Summary:
Looks like there is a bug in the CUDA device linker: kernels that use `thrust::sort_by_key` cannot be linked with other kernels.
    Work around the problem by splitting 5 thrust-heavy .cu files into a `__torch_cuda_sp` library that is statically linked into `torch_cuda`.
    For the default compilation workflow this should not make any difference.

    Test Plan: Compile with `-DCUDA_SEPARABLE_COMPILATION=YES` and observe the library size difference: 310 MB before vs. 173 MB after, when compiled for sm_75.
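
For illustration, a minimal sketch of the same splitting idea written against modern CMake's first-class CUDA support rather than the FindCUDA macros this PR actually modifies; the target and file names (`my_cuda`, `my_cuda_sp`, `sort_heavy.cu`, ...) are hypothetical:

```cmake
cmake_minimum_required(VERSION 3.10)
project(split_device_link LANGUAGES CXX CUDA)

# Hypothetical sources: the thrust::sort_by_key users vs. everything else.
set(SORT_BY_KEY_SRCS sort_heavy.cu)
set(OTHER_GPU_SRCS kernels_a.cu kernels_b.cu)

# Give the thrust-heavy files their own static library and resolve their
# device symbols there, so they get a device link of their own.
add_library(my_cuda_sp STATIC ${SORT_BY_KEY_SRCS})
set_target_properties(my_cuda_sp PROPERTIES
  CUDA_SEPARABLE_COMPILATION ON
  CUDA_RESOLVE_DEVICE_SYMBOLS ON)

# The main CUDA library is built as usual and statically absorbs the helper.
add_library(my_cuda SHARED ${OTHER_GPU_SRCS})
set_target_properties(my_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(my_cuda PRIVATE my_cuda_sp)
```

In a default (non-separable) build the extra static library changes nothing observable, which is why the default compilation workflow is unaffected.
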
Pull Request resolved: https://github.com/pytorch/pytorch/pull/34863

Differential Revision: D20683972

Pulled By: malfet

fbshipit-source-id: bc1492aa9d1d2d21c48e8764a8a7b403feaec5da
Authored by Nikita Shulga on 2020-03-26 17:43:45 -07:00, committed by Facebook GitHub Bot
parent 2f6f1781af
commit 2e739f822b
5 changed files with 30 additions and 2 deletions


@@ -22,6 +22,7 @@ set(ATen_CPU_TEST_SRCS)
set(ATen_CPU_INCLUDE)
set(ATen_THIRD_PARTY_INCLUDE)
set(ATen_CUDA_SRCS)
set(ATen_CUDA_SRCS_W_SORT_BY_KEY)
set(ATen_CUDA_TEST_SRCS)
set(ATen_CUDA_INCLUDE)
set(ATen_NVRTC_STUB_SRCS)
@@ -104,6 +105,7 @@ add_subdirectory(src/ATen)
# Pass source, includes, and libs to parent
set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)


@@ -67,7 +67,9 @@ FILE(GLOB native_h "native/*.h")
FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h")
FILE(GLOB native_cpu_h "native/cpu/*.h")
FILE(GLOB native_cuda_cu_sp "native/cuda/Unique.cu" "native/cuda/TensorFactories.cu")
FILE(GLOB native_cuda_cu "native/cuda/*.cu")
exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp})
FILE(GLOB native_cuda_cpp "native/cuda/*.cpp")
FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp")
FILE(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu")
@@ -103,6 +105,7 @@ endif()
IF(USE_CUDA)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda)
set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu} ${native_quantized_cuda_cu})
set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${native_cuda_cu_sp})
set(all_cuda_cpp ${native_sparse_cuda_cpp} ${native_quantized_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS})
SET(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${all_cuda_cpp})
IF(CAFFE2_USE_CUDNN)
@@ -419,6 +422,7 @@ endif()
set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE)
set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE)
set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE)


@@ -59,10 +59,13 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu
${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu
${extra_src}
PARENT_SCOPE)
set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY}
${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu
PARENT_SCOPE)
INSTALL(FILES
THC.h
${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h


@@ -32,6 +32,7 @@ if(INTERN_BUILD_ATEN_OPS)
# Add source, includes, and libs to lists
list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS})
list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY})
list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS})
list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS})
list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS})
@@ -702,7 +703,19 @@ if(USE_ROCM)
endif()
elseif(USE_CUDA)
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS})
if(CUDA_SEPARABLE_COMPILATION)
# Separate compilation fails when kernels using `thrust::sort_by_key`
# are linked with the rest of the CUDA code. Work around this by linking them separately.
set(_generated_name "torch_cuda_w_sort_by_key_intermediate_link${CMAKE_C_OUTPUT_EXTENSION}")
set(torch_cuda_w_sort_by_key_link_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/torch_cuda.dir/${CMAKE_CFG_INTDIR}/${_generated_name}")
cuda_wrap_srcs(torch_cuda OBJ Caffe2_GPU_W_SORT_BY_KEY_OBJ ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${torch_cuda_w_sort_by_key_link_file}" torch_cpu "${_options}" "${torch_cuda_SEPARABLE_COMPILATION_OBJECTS}")
set( torch_cuda_SEPARABLE_COMPILATION_OBJECTS )
# Pass compiled sort-by-key object + device-linked fatbin as extra dependencies of torch_cuda
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${torch_cuda_w_sort_by_key_link_file} ${Caffe2_GPU_W_SORT_BY_KEY_OBJ})
else()
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
endif()
set(CUDA_LINK_LIBRARIES_KEYWORD)
torch_compile_options(torch_cuda) # see cmake/public/utils.cmake
if(USE_NCCL)
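
For readers less familiar with FindCUDA, a trimmed-down sketch of the pattern in the hunk above, with hypothetical target and file names; it mirrors the mechanism rather than the exact PyTorch build code:

```cmake
find_package(CUDA REQUIRED)
set(CUDA_SEPARABLE_COMPILATION ON)

set(W_SORT_BY_KEY_SRCS sort_heavy.cu)    # hypothetical thrust-heavy file
set(OTHER_GPU_SRCS kernels_a.cu)         # hypothetical remaining kernels

# Compile the sort-by-key sources to relocatable-device-code objects; FindCUDA
# records them in my_cuda_SEPARABLE_COMPILATION_OBJECTS and in W_SORT_BY_KEY_OBJ.
cuda_wrap_srcs(my_cuda OBJ W_SORT_BY_KEY_OBJ ${W_SORT_BY_KEY_SRCS})

# Device-link just those objects into an intermediate link file.
set(_dlink_file
    "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/my_cuda.dir/${CMAKE_CFG_INTDIR}/my_cuda_w_sort_by_key_intermediate_link${CMAKE_C_OUTPUT_EXTENSION}")
cuda_link_separable_compilation_objects("${_dlink_file}" my_cuda "" "${my_cuda_SEPARABLE_COMPILATION_OBJECTS}")

# Clear the list so cuda_add_library does not device-link these objects again,
# then hand the objects plus the device-link file to the main library.
set(my_cuda_SEPARABLE_COMPILATION_OBJECTS)
cuda_add_library(my_cuda ${OTHER_GPU_SRCS} ${_dlink_file} ${W_SORT_BY_KEY_OBJ})
```
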


@@ -18,7 +18,12 @@ file(GLOB tmp *.cu)
# TODO: when we move to explicit file list, this would not be needed.
file(GLOB tmp_cudnn *_cudnn.cu)
exclude(tmp "${tmp}" ${tmp_cudnn})
# Exclude top_k.cu and unique_ops.cu for separate compilation
file(GLOB tmp_w_sort_by_key "top_k.cu" "unique_ops.cu")
exclude(tmp "${tmp}" ${tmp_w_sort_by_key})
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp})
set(Caffe2_GPU_SRCS_W_SORT_BY_KEY ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} ${tmp_w_sort_by_key})
# exclude test files
file(GLOB tmp *_test.cc)
exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp})
@@ -94,6 +99,7 @@ exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS} $
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS_W_SORT_BY_KEY ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)