[Reland2] Update NVTX to NVTX3 (#109843)

Another attempt to update NVTX to NVTX3. We now avoid changing NVTX header inclusion of existing code.  The advantage of NVTX3 over NVTX is that it is a header-only library so that linking with NVTX3 can greatly simplify our CMake and other building scripts for finding libraries in user environments. In addition, NVTX are indeed still present in the latest CUDA versions, but they're no longer a compiled library: It's now a header-only library. That's why there isn't a .lib file anymore.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109843
Approved by: https://github.com/peterbell10, https://github.com/eqy

Co-authored-by: Ivan Zaitsev <108101595+izaitsevfb@users.noreply.github.com>
This commit is contained in:
cyy
2024-08-20 16:33:26 +00:00
committed by PyTorch MergeBot
parent 33f1ee036e
commit c3d02fa390
16 changed files with 56 additions and 28 deletions

View File

@ -65,13 +65,6 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
set CUDNN_ROOT_DIR=%CUDA_PATH%
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
set CUDNN_ROOT_DIR=%CUDA_PATH%
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
:cuda_build_end

View File

@ -40,7 +40,6 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
set CUDNN_ROOT_DIR=%CUDA_PATH%
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin
set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice

View File

@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1
:: Run tests C++-side and load the exported script module.
cd build
set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
test_custom_backend.exe model.pt
if ERRORLEVEL 1 exit /b 1

View File

@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1
:: Run tests C++-side and load the exported script module.
cd build
set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
test_custom_ops.exe model.pt
if ERRORLEVEL 1 exit /b 1

View File

@ -5,7 +5,7 @@ if errorlevel 1 exit /b 1
set CWD=%cd%
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\bin
set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
set TORCH_CPP_TEST_MNIST_PATH=%CWD%\test\cpp\api\mnist
python tools\download_mnist.py --quiet -d %TORCH_CPP_TEST_MNIST_PATH%

3
.gitmodules vendored
View File

@ -124,3 +124,6 @@
path = third_party/cpp-httplib
url = https://github.com/yhirose/cpp-httplib.git
branch = v0.15.3
[submodule "third_party/NVTX"]
path = third_party/NVTX
url = https://github.com/NVIDIA/NVTX.git

View File

@ -1349,6 +1349,7 @@ if(DEFINED USE_CUSTOM_DEBINFO)
# We have to specify the scope here. We do this by specifying the targets we
# care about and caffe2/ for all test targets defined there
if(BUILD_LIBTORCHLESS)
caffe2_update_option(USE_CUDA OFF)
set(ALL_PT_TARGETS "torch_python;${C10_LIB};${TORCH_CPU_LIB};${TORCH_LIB}")
else()
# @todo test if we can remove this

View File

@ -1511,7 +1511,12 @@ if(USE_CUDA)
target_link_libraries(torch_cpu PRIVATE torch::cudart)
endif()
target_link_libraries(torch_cuda INTERFACE torch::cudart)
target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)
target_link_libraries(torch_cuda PUBLIC c10_cuda)
if(TARGET torch::nvtx3)
target_link_libraries(torch_cuda PRIVATE torch::nvtx3)
else()
target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext)
endif()
target_include_directories(
torch_cuda INTERFACE $<INSTALL_INTERFACE:include>)
@ -1598,7 +1603,10 @@ if(BUILD_SHARED_LIBS)
# not find them, because they're usually in non-standard locations)
if(USE_CUDA)
target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
target_link_libraries(torch_global_deps torch::cudart torch::nvtoolsext)
target_link_libraries(torch_global_deps torch::cudart)
if(TARGET torch::nvtoolsext)
target_link_libraries(torch_global_deps torch::nvtoolsext)
endif()
endif()
install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endif()

View File

@ -123,10 +123,14 @@ endif()
if(@USE_CUDA@)
if(MSVC)
find_library(CAFFE2_NVRTC_LIBRARY caffe2_nvrtc PATHS "${TORCH_INSTALL_PREFIX}/lib")
list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY} torch::nvtoolsext)
list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY})
else()
set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB} torch::nvtoolsext)
set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB})
endif()
if(TARGET torch::nvtoolsext)
list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext)
endif()
if(@BUILD_SHARED_LIBS@)
find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib")
list(APPEND TORCH_CUDA_LIBRARIES ${C10_CUDA_LIBRARY} ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})

View File

@ -66,10 +66,6 @@ if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_EQUAL CUDAToolkit_VERSION)
"V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'")
endif()
if(NOT TARGET CUDA::nvToolsExt)
message(FATAL_ERROR "Failed to find nvToolsExt")
endif()
message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION})
message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE})
message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR})
@ -174,10 +170,18 @@ else()
endif()
# nvToolsExt
add_library(torch::nvtoolsext INTERFACE IMPORTED)
set_property(
TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::nvToolsExt)
find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH)
find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
if(nvtx3_FOUND)
add_library(torch::nvtx3 INTERFACE IMPORTED)
target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}")
target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3)
else()
message(WARNING "Cannot find NVTX3, find old NVTX instead")
add_library(torch::nvtoolsext INTERFACE IMPORTED)
set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt)
endif()
# cublas
add_library(caffe2::cublas INTERFACE IMPORTED)

View File

@ -164,9 +164,6 @@
# NCCL_INCLUDE_DIR
# specify where nccl is installed
#
# NVTOOLSEXT_PATH (Windows only)
# specify where nvtoolsext is installed
#
# ACL_ROOT_DIR
# specify where Compute Library is installed
#

1
third_party/NVTX vendored Submodule

Submodule third_party/NVTX added at e170594ac7

View File

@ -135,8 +135,13 @@ if(USE_CUDA)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::cudnn)
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN)
endif()
list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
if(TARGET torch::nvtx3)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtx3)
else()
if(TARGET torch::nvtoolsext)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
endif()
endif()
endif()
if(USE_ROCM)

View File

@ -1,7 +1,11 @@
#ifdef _WIN32
#include <wchar.h> // _wgetenv for nvtx
#endif
#ifdef TORCH_CUDA_USE_NVTX3
#include <nvtx3/nvtx3.hpp>
#else
#include <nvToolsExt.h>
#endif
#include <torch/csrc/utils/pybind.h>
namespace torch::cuda::shared {
@ -9,7 +13,11 @@ namespace torch::cuda::shared {
void initNvtxBindings(PyObject* module) {
auto m = py::handle(module).cast<py::module>();
#ifdef TORCH_CUDA_USE_NVTX3
auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings");
#else
auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings");
#endif
nvtx.def("rangePushA", nvtxRangePushA);
nvtx.def("rangePop", nvtxRangePop);
nvtx.def("rangeStartA", nvtxRangeStartA);

View File

@ -1,6 +1,10 @@
#include <sstream>
#ifdef TORCH_CUDA_USE_NVTX3
#include <nvtx3/nvtx3.hpp>
#else
#include <nvToolsExt.h>
#endif
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/ApproximateClock.h>

View File

@ -621,6 +621,7 @@ CUDA_INCLUDE_MAP = collections.OrderedDict(
("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
("nvtx3/nvtx3.hpp", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)),
]