[Reland2] Update NVTX to NVTX3 (#109843)

Another attempt to update NVTX to NVTX3. This time we avoid changing the NVTX header inclusion in existing code. The advantage of NVTX3 over NVTX is that it is a header-only library, so building against NVTX3 greatly simplifies our CMake and other build scripts, which no longer need to find a library in the user's environment. In addition, NVTX is still present in the latest CUDA versions, but it is no longer a compiled library: it is now a header-only library, which is why there is no .lib file anymore. Pull Request resolved: https://github.com/pytorch/pytorch/pull/109843 Approved by: https://github.com/peterbell10, https://github.com/eqy Co-authored-by: Ivan Zaitsev <108101595+izaitsevfb@users.noreply.github.com>
2025-10-20 21:14:14 +08:00 · 2024-08-20 16:33:26 +00:00
parent 33f1ee036e
commit c3d02fa390
16 changed files with 56 additions and 28 deletions
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -65,13 +65,6 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
 set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
 set CUDNN_ROOT_DIR=%CUDA_PATH%
-set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
-set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
-
-set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
-set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
-set CUDNN_ROOT_DIR=%CUDA_PATH%
-set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
 set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%

 :cuda_build_end
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@ -40,7 +40,6 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
 set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
 set CUDNN_ROOT_DIR=%CUDA_PATH%
-set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
 set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
 set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin
 set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice
--- a/.ci/pytorch/win-test-helpers/test_custom_backend.bat
+++ b/.ci/pytorch/win-test-helpers/test_custom_backend.bat
@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1

 :: Run tests C++-side and load the exported script module.
 cd build
-set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
+set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
 test_custom_backend.exe model.pt
 if ERRORLEVEL 1 exit /b 1
--- a/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
+++ b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1

 :: Run tests C++-side and load the exported script module.
 cd build
-set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
+set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
 test_custom_ops.exe model.pt
 if ERRORLEVEL 1 exit /b 1
--- a/.ci/pytorch/win-test-helpers/test_libtorch.bat
+++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat
@ -5,7 +5,7 @@ if errorlevel 1 exit /b 1
 set CWD=%cd%

 set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\bin
-set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
+set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%

 set TORCH_CPP_TEST_MNIST_PATH=%CWD%\test\cpp\api\mnist
 python tools\download_mnist.py --quiet -d %TORCH_CPP_TEST_MNIST_PATH%
--- a/.gitmodules
+++ b/.gitmodules
@ -124,3 +124,6 @@
 	path = third_party/cpp-httplib
 	url = https://github.com/yhirose/cpp-httplib.git
 	branch = v0.15.3
+[submodule "third_party/NVTX"]
+	path = third_party/NVTX
+	url = https://github.com/NVIDIA/NVTX.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1349,6 +1349,7 @@ if(DEFINED USE_CUSTOM_DEBINFO)
    # We have to specify the scope here. We do this by specifying the targets we
    # care about and caffe2/ for all test targets defined there
    if(BUILD_LIBTORCHLESS)
+      caffe2_update_option(USE_CUDA OFF)
      set(ALL_PT_TARGETS "torch_python;${C10_LIB};${TORCH_CPU_LIB};${TORCH_LIB}")
    else()
      # @todo test if we can remove this
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1511,7 +1511,12 @@ if(USE_CUDA)
    target_link_libraries(torch_cpu PRIVATE torch::cudart)
  endif()
  target_link_libraries(torch_cuda INTERFACE torch::cudart)
-  target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)
+  target_link_libraries(torch_cuda PUBLIC c10_cuda)
+  if(TARGET torch::nvtx3)
+    target_link_libraries(torch_cuda PRIVATE torch::nvtx3)
+  else()
+    target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext)
+  endif()

  target_include_directories(
      torch_cuda INTERFACE $<INSTALL_INTERFACE:include>)
@ -1598,7 +1603,10 @@ if(BUILD_SHARED_LIBS)
  # not find them, because they're usually in non-standard locations)
  if(USE_CUDA)
    target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
-    target_link_libraries(torch_global_deps torch::cudart torch::nvtoolsext)
+    target_link_libraries(torch_global_deps torch::cudart)
+    if(TARGET torch::nvtoolsext)
+      target_link_libraries(torch_global_deps torch::nvtoolsext)
+    endif()
  endif()
  install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 endif()
--- a/cmake/TorchConfig.cmake.in
+++ b/cmake/TorchConfig.cmake.in
@ -123,10 +123,14 @@ endif()
 if(@USE_CUDA@)
  if(MSVC)
    find_library(CAFFE2_NVRTC_LIBRARY caffe2_nvrtc PATHS "${TORCH_INSTALL_PREFIX}/lib")
-    list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY} torch::nvtoolsext)
+    list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY})
  else()
-    set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB} torch::nvtoolsext)
+    set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB})
  endif()
+  if(TARGET torch::nvtoolsext)
+    list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext)
+  endif()
+
  if(@BUILD_SHARED_LIBS@)
    find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib")
    list(APPEND TORCH_CUDA_LIBRARIES ${C10_CUDA_LIBRARY} ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@ -66,10 +66,6 @@ if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_EQUAL CUDAToolkit_VERSION)
                      "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIRS}'")
 endif()

-if(NOT TARGET CUDA::nvToolsExt)
-  message(FATAL_ERROR "Failed to find nvToolsExt")
-endif()
-
 message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION})
 message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE})
 message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR})
@ -174,10 +170,18 @@ else()
 endif()

 # nvToolsExt
-add_library(torch::nvtoolsext INTERFACE IMPORTED)
-set_property(
-    TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES
-    CUDA::nvToolsExt)
+find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH)
+find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
+if(nvtx3_FOUND)
+  add_library(torch::nvtx3 INTERFACE IMPORTED)
+  target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}")
+  target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3)
+else()
+  message(WARNING "Cannot find NVTX3, find old NVTX instead")
+  add_library(torch::nvtoolsext INTERFACE IMPORTED)
+  set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt)
+endif()
+

 # cublas
 add_library(caffe2::cublas INTERFACE IMPORTED)
--- a/setup.py
+++ b/setup.py
@ -164,9 +164,6 @@
 #   NCCL_INCLUDE_DIR
 #     specify where nccl is installed
 #
-#   NVTOOLSEXT_PATH (Windows only)
-#     specify where nvtoolsext is installed
-#
 #   ACL_ROOT_DIR
 #     specify where Compute Library is installed
 #
--- a/third_party/NVTX
+++ b/third_party/NVTX
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@ -135,8 +135,13 @@ if(USE_CUDA)
        list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::cudnn)
        list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN)
    endif()
-
-    list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
+    if(TARGET torch::nvtx3)
+      list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtx3)
+    else()
+      if(TARGET torch::nvtoolsext)
+        list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
+      endif()
+    endif()
 endif()

 if(USE_ROCM)
--- a/torch/csrc/cuda/shared/nvtx.cpp
+++ b/torch/csrc/cuda/shared/nvtx.cpp
@ -1,7 +1,11 @@
 #ifdef _WIN32
 #include <wchar.h> // _wgetenv for nvtx
 #endif
+#ifdef TORCH_CUDA_USE_NVTX3
+#include <nvtx3/nvtx3.hpp>
+#else
 #include <nvToolsExt.h>
+#endif
 #include <torch/csrc/utils/pybind.h>

 namespace torch::cuda::shared {
@ -9,7 +13,11 @@ namespace torch::cuda::shared {
 void initNvtxBindings(PyObject* module) {
  auto m = py::handle(module).cast<py::module>();

+#ifdef TORCH_CUDA_USE_NVTX3
+  auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings");
+#else
  auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings");
+#endif
  nvtx.def("rangePushA", nvtxRangePushA);
  nvtx.def("rangePop", nvtxRangePop);
  nvtx.def("rangeStartA", nvtxRangeStartA);
--- a/torch/csrc/profiler/stubs/cuda.cpp
+++ b/torch/csrc/profiler/stubs/cuda.cpp
@ -1,6 +1,10 @@
 #include <sstream>

+#ifdef TORCH_CUDA_USE_NVTX3
+#include <nvtx3/nvtx3.hpp>
+#else
 #include <nvToolsExt.h>
+#endif

 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/ApproximateClock.h>
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@ -621,6 +621,7 @@ CUDA_INCLUDE_MAP = collections.OrderedDict(
        ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
        ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
        ("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("nvtx3/nvtx3.hpp", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
        ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
        ("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)),
    ]