From 19f851ce10b16f0ed11d18d937ca7b32746153b0 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot
Date: Tue, 24 Jun 2025 20:02:07 +0000
Subject: [PATCH] Revert "Simplify nvtx3 CMake handling, always use nvtx3
 (#153784)"

This reverts commit 099d0d6121125062ebc05771c8330cb7cd8d053a.

Reverted https://github.com/pytorch/pytorch/pull/153784 on behalf of
https://github.com/Camyll due to breaking internal tests and cuda 12.4
builds still used in CI
([comment](https://github.com/pytorch/pytorch/pull/153784#issuecomment-3001702310))
---
 .ci/manywheel/build_cuda.sh                   |  5 +++++
 .ci/pytorch/test_example_code/CMakeLists.txt  |  2 +-
 .ci/pytorch/windows/cuda126.bat               |  9 ++++++++
 .ci/pytorch/windows/cuda128.bat               |  9 ++++++++
 .ci/pytorch/windows/cuda129.bat               |  9 ++++++++
 .ci/pytorch/windows/internal/copy.bat         |  1 +
 .ci/pytorch/windows/internal/cuda_install.bat | 15 +++++++++++++
 .github/scripts/windows/build_magma.bat       |  1 +
 caffe2/CMakeLists.txt                         |  9 +++++++-
 cmake/Dependencies.cmake                      | 21 ++++++++++---------
 cmake/TorchConfig.cmake.in                    |  3 +++
 torch/CMakeLists.txt                          |  8 ++++++-
 torch/__init__.py                             | 19 ++++++++++++++++-
 torch/csrc/cuda/shared/nvtx.cpp               |  8 +++++++
 torch/csrc/profiler/stubs/cuda.cpp            |  4 ++++
 torch/utils/hipify/cuda_to_hip_mappings.py    |  1 +
 16 files changed, 110 insertions(+), 14 deletions(-)

diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh
index 81c920b04dda..ec513b80099e 100644
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@@ -150,6 +150,11 @@ if [[ $CUDA_VERSION == 12* ]]; then
             "libcufile.so.0"
             "libcufile_rdma.so.1"
         )
+        # Add libnvToolsExt only if CUDA version is not 12.9
+        if [[ $CUDA_VERSION != 12.9* ]]; then
+            DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+            DEPS_SONAME+=("libnvToolsExt.so.1")
+        fi
     else
         echo "Using nvidia libs from pypi."
         CUDA_RPATHS=(
diff --git a/.ci/pytorch/test_example_code/CMakeLists.txt b/.ci/pytorch/test_example_code/CMakeLists.txt
index 1793007d515d..e87f37ae61fb 100644
--- a/.ci/pytorch/test_example_code/CMakeLists.txt
+++ b/.ci/pytorch/test_example_code/CMakeLists.txt
@@ -16,7 +16,7 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse
 find_library(CUDNN_LIBRARY NAMES cudnn)
 target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} )
 if(MSVC)
-  file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll")
+  file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll")
   message("dlls to copy " ${TORCH_DLLS})
   add_custom_command(TARGET simple-torch-test
                      POST_BUILD
diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat
index cc45a589c94d..dd30cc25d4a6 100644
--- a/.ci/pytorch/windows/cuda126.bat
+++ b/.ci/pytorch/windows/cuda126.bat
@@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 
+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V126%"=="" (
     IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" (
         set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6"
diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat
index fdc1c6a0eed8..bbdfb4bd1bb7 100644
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 
+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V128%"=="" (
     IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" (
         set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8"
diff --git a/.ci/pytorch/windows/cuda129.bat b/.ci/pytorch/windows/cuda129.bat
index 4c580b789f39..77ef14921aa6 100644
--- a/.ci/pytorch/windows/cuda129.bat
+++ b/.ci/pytorch/windows/cuda129.bat
@@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
 
+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V129%"=="" (
     IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" (
         set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9"
diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat
index 2e19a498c7f0..8042db09f462 100644
--- a/.ci/pytorch/windows/internal/copy.bat
+++ b/.ci/pytorch/windows/internal/copy.bat
@@ -9,6 +9,7 @@ copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib
+copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib
 copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
 
 :: Should be set in build_pytorch.bat
diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat
index 891fb097cc01..a0eb650f8506 100644
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@@ -119,6 +119,11 @@ goto cuda_common
 :: If you cannot find the CUDA version you want to build for here then please
 :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows
 if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" (
+    if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
+        curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z"
+        if errorlevel 1 exit /b 1
+    )
+
     if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" (
         curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
         if errorlevel 1 exit /b 1
@@ -145,6 +150,15 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_
     xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations"
     )
 
+    echo Installing NvToolsExt...
+    7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+
     echo Installing cuDNN...
     7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn"
     xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
@@ -175,3 +189,4 @@ echo Setting up environment...
set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 6a9b2069b00f..0f11fe34068e 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -17,6 +17,7 @@ if errorlevel 1 exit /b 1 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%" set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER% +set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt mkdir magma_cuda%CUVER_NODOT% cd magma_cuda%CUVER_NODOT% diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4202d83a95eb..7b5820cad2d8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1633,7 +1633,11 @@ if(USE_CUDA) endif() target_link_libraries(torch_cuda INTERFACE torch::cudart) target_link_libraries(torch_cuda PUBLIC c10_cuda) - target_link_libraries(torch_cuda PRIVATE CUDA::nvtx3) + if(TARGET torch::nvtx3) + target_link_libraries(torch_cuda PRIVATE torch::nvtx3) + else() + target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext) + endif() target_include_directories( torch_cuda INTERFACE $) @@ -1725,6 +1729,9 @@ if(BUILD_SHARED_LIBS) if(USE_CUDA) target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) target_link_libraries(torch_global_deps torch::cudart) + if(TARGET torch::nvtoolsext) + target_link_libraries(torch_global_deps torch::nvtoolsext) + endif() endif() install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d..9241e5e119c5 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -968,17 +968,18 @@ endif() # ---[ nvtx if(USE_SYSTEM_NVTX) find_path(nvtx3_dir NAMES nvtx3 PATHS ${CUDA_INCLUDE_DIRS}) - find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) - if(NOT nvtx3_FOUND) - message(WARNING "Cannot find system NVTX3, find shipped NVTX3 instead") - endif() -endif() -if(NOT TARGET CUDA::nvtx3) - add_library(CUDA::nvtx3 INTERFACE IMPORTED) -endif() -if(NOT nvtx3_dir) +else() find_path(nvtx3_dir NAMES nvtx3 PATHS "${PROJECT_SOURCE_DIR}/third_party/NVTX/c/include" NO_DEFAULT_PATH) - target_include_directories(CUDA::nvtx3 INTERFACE "${nvtx3_dir}") +endif() +find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) +if(nvtx3_FOUND) + add_library(torch::nvtx3 INTERFACE IMPORTED) + target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}") + target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3) +else() + message(WARNING "Cannot find NVTX3, find old NVTX instead") + add_library(torch::nvtoolsext INTERFACE IMPORTED) + set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt) endif() diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 8a5587cad272..0b32ffa99ceb 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -132,6 +132,9 @@ if(@USE_CUDA@) else() set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB}) endif() + 
if(TARGET torch::nvtoolsext) + list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext) + endif() if(@BUILD_SHARED_LIBS@) find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib") diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 3985cf50c141..55bd03122eee 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -146,7 +146,13 @@ if(USE_CUDA) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUFILE) endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES CUDA::nvtx3) + if(TARGET torch::nvtx3) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtx3) + else() + if(TARGET torch::nvtoolsext) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext) + endif() + endif() endif() if(USE_ROCM) diff --git a/torch/__init__.py b/torch/__init__.py index 4b862e8699a4..0c54eb1a50b1 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -206,6 +206,20 @@ if sys.platform == "win32": if os.path.exists(p) ] + if not builtins.any( + os.path.exists(os.path.join(p, "nvToolsExt64_1.dll")) for p in dll_paths + ): + nvtoolsext_dll_path = os.path.join( + os.getenv( + "NVTOOLSEXT_PATH", + os.path.join(pfiles_path, "NVIDIA Corporation", "NvToolsExt"), + ), + "bin", + "x64", + ) + else: + nvtoolsext_dll_path = "" + if cuda_version and builtins.all( not glob.glob(os.path.join(p, "cudart64*.dll")) for p in dll_paths ): @@ -218,7 +232,9 @@ if sys.platform == "win32": else: cuda_path = "" - dll_paths.extend(p for p in (cuda_path,) if os.path.exists(p)) + dll_paths.extend( + p for p in (nvtoolsext_dll_path, cuda_path) if os.path.exists(p) + ) kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) with_load_library_flags = hasattr(kernel32, "AddDllDirectory") @@ -355,6 +371,7 @@ def _load_global_deps() -> None: "cusparselt": "libcusparseLt.so.*[0-9]", "cusolver": "libcusolver.so.*[0-9]", "nccl": "libnccl.so.*[0-9]", + "nvtx": "libnvToolsExt.so.*[0-9]", "nvshmem": "libnvshmem_host.so.*[0-9]", } # cufiile is only available on cuda 12+ diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp index 8faf319071c3..d28e8ae222ea 100644 --- a/torch/csrc/cuda/shared/nvtx.cpp +++ b/torch/csrc/cuda/shared/nvtx.cpp @@ -3,7 +3,11 @@ #endif #ifndef ROCM_ON_WINDOWS +#ifdef TORCH_CUDA_USE_NVTX3 #include +#else // TORCH_CUDA_USE_NVTX3 +#include +#endif // TORCH_CUDA_USE_NVTX3 #else // ROCM_ON_WINDOWS #include #endif // ROCM_ON_WINDOWS @@ -50,7 +54,11 @@ static void* device_nvtxRangeStart(const char* msg, std::intptr_t stream) { void initNvtxBindings(PyObject* module) { auto m = py::handle(module).cast(); +#ifdef TORCH_CUDA_USE_NVTX3 auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings"); +#else + auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings"); +#endif nvtx.def("rangePushA", nvtxRangePushA); nvtx.def("rangePop", nvtxRangePop); nvtx.def("rangeStartA", nvtxRangeStartA); diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index 1b4d786b0b67..e08b2a3efd0f 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -1,7 +1,11 @@ #include #ifndef ROCM_ON_WINDOWS +#ifdef TORCH_CUDA_USE_NVTX3 #include +#else +#include +#endif #else // ROCM_ON_WINDOWS #include #endif // ROCM_ON_WINDOWS diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index a5145a2f4870..8bb247cb4524 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -630,6 +630,7 @@ CUDA_INCLUDE_MAP = collections.OrderedDict( 
("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("nvtx3/nvtx3.hpp", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), + ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), ("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)), ] )