mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Fix PyTorch separate compilation (#34863)
Summary: Looks like there is a bug in the CUDA device linker: kernels that use `thrust::sort_by_key` cannot be linked with other kernels. Solve the problem by splitting 5 thrust-heavy .cu files into a `__torch_cuda_sp` library which is statically linked into `torch_cuda`. For the default compilation workflow it should not make any difference. Test Plan: Compile with `-DCUDA_SEPARABLE_COMPILATION=YES` and observe the library size difference: 310Mb before, 173Mb after if compiled for sm_75. Pull Request resolved: https://github.com/pytorch/pytorch/pull/34863 Differential Revision: D20683972 Pulled By: malfet fbshipit-source-id: bc1492aa9d1d2d21c48e8764a8a7b403feaec5da
This commit is contained in:
committed by
Facebook GitHub Bot
parent
2f6f1781af
commit
2e739f822b
@ -32,6 +32,7 @@ if(INTERN_BUILD_ATEN_OPS)
|
||||
# Add source, includes, and libs to lists
|
||||
list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
|
||||
list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS})
|
||||
list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY})
|
||||
list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS})
|
||||
list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS})
|
||||
list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS})
|
||||
@ -702,7 +703,19 @@ if(USE_ROCM)
|
||||
endif()
|
||||
elseif(USE_CUDA)
|
||||
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
|
||||
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS})
|
||||
if(CUDA_SEPARABLE_COMPILATION)
|
||||
# Separate compilation fails when kernels using `thrust::sort_by_key`
|
||||
# are linked with the rest of CUDA code. Work around this by linking them separately.
|
||||
set(_generated_name "torch_cuda_w_sort_by_key_intermediate_link${CMAKE_C_OUTPUT_EXTENSION}")
|
||||
set(torch_cuda_w_sort_by_key_link_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/torch_cuda.dir/${CMAKE_CFG_INTDIR}/${_generated_name}")
|
||||
cuda_wrap_srcs(torch_cuda OBJ Caffe2_GPU_W_SORT_BY_KEY_OBJ ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
|
||||
CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${torch_cuda_w_sort_by_key_link_file}" torch_cpu "${_options}" "${torch_cuda_SEPARABLE_COMPILATION_OBJECTS}")
|
||||
set( torch_cuda_SEPARABLE_COMPILATION_OBJECTS )
|
||||
# Pass compiled sort-by-key object + device-linked fatbin as extra dependencies of torch_cuda
|
||||
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${torch_cuda_w_sort_by_key_link_file} ${Caffe2_GPU_W_SORT_BY_KEY_OBJ})
|
||||
else()
|
||||
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
|
||||
endif()
|
||||
set(CUDA_LINK_LIBRARIES_KEYWORD)
|
||||
torch_compile_options(torch_cuda) # see cmake/public/utils.cmake
|
||||
if(USE_NCCL)
|
||||
|
Reference in New Issue
Block a user