From 19f851ce10b16f0ed11d18d937ca7b32746153b0 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot
Date: Tue, 24 Jun 2025 20:02:07 +0000
Subject: [PATCH] Revert "Simplify nvtx3 CMake handling, always use nvtx3
 (#153784)"

This reverts commit 099d0d6121125062ebc05771c8330cb7cd8d053a.

Reverted https://github.com/pytorch/pytorch/pull/153784 on behalf of
https://github.com/Camyll due to breaking internal tests and cuda 12.4
builds still used in CI
([comment](https://github.com/pytorch/pytorch/pull/153784#issuecomment-3001702310))
---
 .ci/manywheel/build_cuda.sh                   |  5 +++++
 .ci/pytorch/test_example_code/CMakeLists.txt  |  2 +-
 .ci/pytorch/windows/cuda126.bat               |  9 ++++++++
 .ci/pytorch/windows/cuda128.bat               |  9 ++++++++
 .ci/pytorch/windows/cuda129.bat               |  9 ++++++++
 .ci/pytorch/windows/internal/copy.bat         |  1 +
 .ci/pytorch/windows/internal/cuda_install.bat | 15 +++++++++++++
 .github/scripts/windows/build_magma.bat       |  1 +
 caffe2/CMakeLists.txt                         |  9 +++++++-
 cmake/Dependencies.cmake                      | 21 ++++++++++---------
 cmake/TorchConfig.cmake.in                    |  3 +++
 torch/CMakeLists.txt                          |  8 ++++++-
 torch/__init__.py                             | 19 ++++++++++++++++-
 torch/csrc/cuda/shared/nvtx.cpp               |  8 +++++++
 torch/csrc/profiler/stubs/cuda.cpp            |  4 ++++
 torch/utils/hipify/cuda_to_hip_mappings.py    |  1 +
 16 files changed, 110 insertions(+), 14 deletions(-)

diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh
index 81c920b04dda..ec513b80099e 100644
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@@ -150,6 +150,11 @@ if [[ $CUDA_VERSION == 12* ]]; then
             "libcufile.so.0"
             "libcufile_rdma.so.1"
         )
+        # Add libnvToolsExt only if CUDA version is not 12.9
+        if [[ $CUDA_VERSION != 12.9* ]]; then
+            DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+            DEPS_SONAME+=("libnvToolsExt.so.1")
+        fi
     else
         echo "Using nvidia libs from pypi."
         CUDA_RPATHS=(
diff --git a/.ci/pytorch/test_example_code/CMakeLists.txt b/.ci/pytorch/test_example_code/CMakeLists.txt
index 1793007d515d..e87f37ae61fb 100644
--- a/.ci/pytorch/test_example_code/CMakeLists.txt
+++ b/.ci/pytorch/test_example_code/CMakeLists.txt
@@ -16,7 +16,7 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse
 find_library(CUDNN_LIBRARY NAMES cudnn)
 target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} )
 if(MSVC)
-  file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll")
+  file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll")
   message("dlls to copy " ${TORCH_DLLS})
   add_custom_command(TARGET simple-torch-test
                      POST_BUILD
diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat
index cc45a589c94d..dd30cc25d4a6 100644
--- a/.ci/pytorch/windows/cuda126.bat
+++ b/.ci/pytorch/windows/cuda126.bat
@@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 
+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V126%"=="" (
     IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" (
         set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6"
diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat
index fdc1c6a0eed8..bbdfb4bd1bb7 100644
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 
+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V128%"=="" (
     IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" (
         set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8"
diff --git a/.ci/pytorch/windows/cuda129.bat b/.ci/pytorch/windows/cuda129.bat
index 4c580b789f39..77ef14921aa6 100644
--- a/.ci/pytorch/windows/cuda129.bat
+++ b/.ci/pytorch/windows/cuda129.bat
@@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 
+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V129%"=="" (
     IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" (
         set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9"
diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat
index 2e19a498c7f0..8042db09f462 100644
--- a/.ci/pytorch/windows/internal/copy.bat
+++ b/.ci/pytorch/windows/internal/copy.bat
@@ -9,6 +9,7 @@ copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib
+copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib
 copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
 
 :: Should be set in build_pytorch.bat
diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat
index 891fb097cc01..a0eb650f8506 100644
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@@ -119,6 +119,11 @@ goto cuda_common
 :: If you cannot find the CUDA version you want to build for here then please
 :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows
 if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" (
+    if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
+        curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z"
+        if errorlevel 1 exit /b 1
+    )
+
     if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" (
         curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
         if errorlevel 1 exit /b 1
@@ -145,6 +150,15 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_
     xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations"
     )
 
+    echo Installing NvToolsExt...
+    7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+
     echo Installing cuDNN...
     7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn"
     xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
@@ -175,3 +189,4 @@ echo Setting up environment...
set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 6a9b2069b00f..0f11fe34068e 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -17,6 +17,7 @@ if errorlevel 1 exit /b 1 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%" set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER% +set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt mkdir magma_cuda%CUVER_NODOT% cd magma_cuda%CUVER_NODOT% diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4202d83a95eb..7b5820cad2d8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1633,7 +1633,11 @@ if(USE_CUDA) endif() target_link_libraries(torch_cuda INTERFACE torch::cudart) target_link_libraries(torch_cuda PUBLIC c10_cuda) - target_link_libraries(torch_cuda PRIVATE CUDA::nvtx3) + if(TARGET torch::nvtx3) + target_link_libraries(torch_cuda PRIVATE torch::nvtx3) + else() + target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext) + endif() target_include_directories( torch_cuda INTERFACE $) @@ -1725,6 +1729,9 @@ if(BUILD_SHARED_LIBS) if(USE_CUDA) target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) target_link_libraries(torch_global_deps torch::cudart) + if(TARGET torch::nvtoolsext) + target_link_libraries(torch_global_deps torch::nvtoolsext) + endif() endif() install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d..9241e5e119c5 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -968,17 +968,18 @@ endif() # ---[ nvtx if(USE_SYSTEM_NVTX) find_path(nvtx3_dir NAMES nvtx3 PATHS ${CUDA_INCLUDE_DIRS}) - find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) - if(NOT nvtx3_FOUND) - message(WARNING "Cannot find system NVTX3, find shipped NVTX3 instead") - endif() -endif() -if(NOT TARGET CUDA::nvtx3) - add_library(CUDA::nvtx3 INTERFACE IMPORTED) -endif() -if(NOT nvtx3_dir) +else() find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) - target_include_directories(CUDA::nvtx3 INTERFACE "${nvtx3_dir}") +endif() +find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) +if(nvtx3_FOUND) + add_library(torch::nvtx3 INTERFACE IMPORTED) + target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}") + target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3) +else() + message(WARNING "Cannot find NVTX3, find old NVTX instead") + add_library(torch::nvtoolsext INTERFACE IMPORTED) + set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt) endif() diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 8a5587cad272..0b32ffa99ceb 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -132,6 +132,9 @@ if(@USE_CUDA@) else() set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB}) endif() + 
if(TARGET torch::nvtoolsext) + list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext) + endif() if(@BUILD_SHARED_LIBS@) find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib") diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 3985cf50c141..55bd03122eee 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -146,7 +146,13 @@ if(USE_CUDA) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUFILE) endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES CUDA::nvtx3) + if(TARGET torch::nvtx3) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtx3) + else() + if(TARGET torch::nvtoolsext) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext) + endif() + endif() endif() if(USE_ROCM) diff --git a/torch/__init__.py b/torch/__init__.py index 4b862e8699a4..0c54eb1a50b1 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -206,6 +206,20 @@ if sys.platform == "win32": if os.path.exists(p) ] + if not builtins.any( + os.path.exists(os.path.join(p, "nvToolsExt64_1.dll")) for p in dll_paths + ): + nvtoolsext_dll_path = os.path.join( + os.getenv( + "NVTOOLSEXT_PATH", + os.path.join(pfiles_path, "NVIDIA Corporation", "NvToolsExt"), + ), + "bin", + "x64", + ) + else: + nvtoolsext_dll_path = "" + if cuda_version and builtins.all( not glob.glob(os.path.join(p, "cudart64*.dll")) for p in dll_paths ): @@ -218,7 +232,9 @@ if sys.platform == "win32": else: cuda_path = "" - dll_paths.extend(p for p in (cuda_path,) if os.path.exists(p)) + dll_paths.extend( + p for p in (nvtoolsext_dll_path, cuda_path) if os.path.exists(p) + ) kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) with_load_library_flags = hasattr(kernel32, "AddDllDirectory") @@ -355,6 +371,7 @@ def _load_global_deps() -> None: "cusparselt": "libcusparseLt.so.*[0-9]", "cusolver": "libcusolver.so.*[0-9]", "nccl": "libnccl.so.*[0-9]", + "nvtx": "libnvToolsExt.so.*[0-9]", "nvshmem": "libnvshmem_host.so.*[0-9]", } # cufiile is only available on cuda 12+ diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp index 8faf319071c3..d28e8ae222ea 100644 --- a/torch/csrc/cuda/shared/nvtx.cpp +++ b/torch/csrc/cuda/shared/nvtx.cpp @@ -3,7 +3,11 @@ #endif #ifndef ROCM_ON_WINDOWS +#ifdef TORCH_CUDA_USE_NVTX3 #include +#else // TORCH_CUDA_USE_NVTX3 +#include +#endif // TORCH_CUDA_USE_NVTX3 #else // ROCM_ON_WINDOWS #include #endif // ROCM_ON_WINDOWS @@ -50,7 +54,11 @@ static void* device_nvtxRangeStart(const char* msg, std::intptr_t stream) { void initNvtxBindings(PyObject* module) { auto m = py::handle(module).cast(); +#ifdef TORCH_CUDA_USE_NVTX3 auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings"); +#else + auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings"); +#endif nvtx.def("rangePushA", nvtxRangePushA); nvtx.def("rangePop", nvtxRangePop); nvtx.def("rangeStartA", nvtxRangeStartA); diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index 1b4d786b0b67..e08b2a3efd0f 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -1,7 +1,11 @@ #include #ifndef ROCM_ON_WINDOWS +#ifdef TORCH_CUDA_USE_NVTX3 #include +#else +#include +#endif #else // ROCM_ON_WINDOWS #include #endif // ROCM_ON_WINDOWS diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index a5145a2f4870..8bb247cb4524 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -630,6 +630,7 @@ CUDA_INCLUDE_MAP = collections.OrderedDict( 
("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("nvtx3/nvtx3.hpp", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), + ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), ("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)), ] )