We have done a lot of optimization for PyTorch on Windows and made good progress, but some models still show a performance gap between PyTorch on Windows and PyTorch on Linux. Ref: https://pytorch.org/blog/performance-boost-windows/#conclusion

From the blog's conclusion, we found that `ResNet50` is a typical case. Let's focus on `ResNet50` and collect a profiling log:

```cmd
(nightly) D:\xu_git\dnnl_cb>python test_script_resnet50.py
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
                  model_inference         3.91%     682.427ms       100.00%       17.448s       17.448s             1
                     aten::conv2d         0.18%      30.906ms        64.79%       11.305s       2.133ms          5300
                aten::convolution         0.45%      78.031ms        64.62%       11.275s       2.127ms          5300
               aten::_convolution         0.30%      51.670ms        64.17%       11.196s       2.113ms          5300
         aten::mkldnn_convolution        63.58%       11.093s        63.87%       11.145s       2.103ms          5300
                 aten::batch_norm         0.13%      23.536ms        20.10%        3.506s     661.580us          5300
     aten::_batch_norm_impl_index         0.28%      49.486ms        19.96%        3.483s     657.139us          5300
          aten::native_batch_norm        19.26%        3.360s        19.64%        3.427s     646.615us          5300
                 aten::max_pool2d         0.01%       1.038ms         5.84%        1.018s      10.181ms           100
    aten::max_pool2d_with_indices         5.83%        1.017s         5.83%        1.017s      10.171ms           100
                       aten::add_         3.38%     588.907ms         3.38%     588.907ms      85.349us          6900
                      aten::relu_         0.35%      60.358ms         1.67%     292.155ms      59.624us          4900
                 aten::clamp_min_         1.33%     231.797ms         1.33%     231.797ms      47.306us          4900
                      aten::empty         0.46%      80.195ms         0.46%      80.195ms       1.513us         53000
                     aten::linear         0.01%     927.300us         0.23%      39.353ms     393.532us           100
                      aten::addmm         0.20%      35.379ms         0.21%      37.016ms     370.155us           100
                 aten::empty_like         0.12%      20.455ms         0.17%      29.976ms       5.656us          5300
                aten::as_strided_         0.11%      18.830ms         0.11%      18.830ms       3.553us          5300
        aten::adaptive_avg_pool2d         0.00%     419.900us         0.08%      14.265ms     142.647us           100
                       aten::mean         0.01%       1.737ms         0.08%      13.845ms     138.448us           100
                        aten::sum         0.05%       8.113ms         0.05%       8.648ms      86.479us           100
                    aten::resize_         0.03%       5.182ms         0.03%       5.182ms       0.978us          5300
                       aten::div_         0.01%       1.445ms         0.02%       3.460ms      34.600us           100
                         aten::to         0.00%     337.000us         0.01%       2.015ms      20.154us           100
                   aten::_to_copy         0.01%     977.500us         0.01%       1.678ms      16.784us           100
                      aten::copy_         0.01%       1.474ms         0.01%       1.474ms       7.371us           200
                          aten::t         0.00%     775.900us         0.01%       1.410ms      14.104us           100
                    aten::flatten         0.00%     420.900us         0.01%       1.311ms      13.106us           100
                       aten::view         0.01%     889.700us         0.01%     889.700us       8.897us           100
                  aten::transpose         0.00%     410.700us         0.00%     634.500us       6.345us           100
                     aten::expand         0.00%     496.800us         0.00%     566.800us       5.668us           100
                      aten::fill_         0.00%     534.800us         0.00%     534.800us       5.348us           100
                aten::as_strided         0.00%     293.800us         0.00%     293.800us       1.469us           200
              aten::empty_strided         0.00%     241.700us         0.00%     241.700us       2.417us           100
               aten::resolve_conj         0.00%      54.800us         0.00%      54.800us       0.274us           200
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
Self CPU time total: 17.448s

Execution time: 20.02380895614624
```

The kernel consuming the most CPU time is `aten::mkldnn_convolution`, which is dispatched to `MKLDNN`. We had already optimized memory allocation by integrating mimalloc into PyTorch's c10 module. That gave PyTorch on Windows a large boost, but it does not cover the intermediate temporary memory allocated internally by `MKL` and `MKLDNN`. So we still have room to improve PyTorch Windows performance by optimizing how `MKL` and `MKLDNN` allocate their intermediate temporary memory.
So I discussed this with the Intel MKL team and got a method for registering a high-performance memory allocation API with MKL, which helps MKL improve its memory performance. Please check the online document: https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2023-0/redefining-memory-functions.html

This PR optimizes MKL memory allocation performance on Windows by registering mi_malloc with MKL.

PR changes:
1. Add a CMake option, `USE_MIMALLOC_ON_MKL`; it is a sub-option of `USE_MIMALLOC`.
2. Wrap and export the mi_malloc APIs in c10 when `USE_MIMALLOC_ON_MKL` is `ON`.
3. Add MklAllocationHelp.cpp to register the allocation APIs with MKL when `USE_MIMALLOC_ON_MKL` is `ON` (see the sketch after this message).

For `oneDNN`, this is still being tracked in this proposal: https://github.com/oneapi-src/oneDNN/issues/1898

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138419
Approved by: https://github.com/jgong5, https://github.com/ezyang
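For reference, the Intel document linked above describes the redefinition mechanism: oneMKL exposes the function pointers `i_malloc`, `i_calloc`, `i_realloc`, and `i_free` (declared in `i_malloc.h`), and repointing them to a custom allocator before MKL's first allocation routes MKL's internal temporary buffers through that allocator. Below is a minimal sketch of that mechanism calling mimalloc directly; it is illustrative only — the PR routes the calls through wrappers exported from c10, and the guard macro name here is an assumption, so this is not the exact code in MklAllocationHelp.cpp.

```cpp
// Sketch: redefine MKL's memory functions with mimalloc, following the
// "Redefining Memory Functions" section of the Intel oneMKL developer
// guide. Illustrative only; not the PR's exact implementation.
#ifdef USE_MIMALLOC_ON_MKL  // hypothetical guard mirroring the CMake option
#include <i_malloc.h>   // declares the i_malloc/i_calloc/i_realloc/i_free pointers
#include <mimalloc.h>

namespace {

// Repoint MKL's allocation hooks to mimalloc. Per the Intel guide, the
// redefinition must happen before MKL allocates memory for the first
// time, so it runs during static initialization.
bool RegisterMimallocWithMKL() {
  i_malloc  = mi_malloc;   // void* (*)(size_t)
  i_calloc  = mi_calloc;   // void* (*)(size_t, size_t)
  i_realloc = mi_realloc;  // void* (*)(void*, size_t)
  i_free    = mi_free;     // void  (*)(void*)
  return true;
}

const bool mkl_allocator_registered = RegisterMimallocWithMKL();

} // anonymous namespace
#endif // USE_MIMALLOC_ON_MKL
```

Doing the registration at static-initialization time is what satisfies the guide's constraint that the new functions take effect before MKL's first internal allocation.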
196 lines
10 KiB
CMake
# Prints accumulated Caffe2 configuration summary
function(caffe2_print_configuration_summary)
  message(STATUS "")
  message(STATUS "******** Summary ********")
  message(STATUS "General:")
  message(STATUS "  CMake version         : ${CMAKE_VERSION}")
  message(STATUS "  CMake command         : ${CMAKE_COMMAND}")
  message(STATUS "  System                : ${CMAKE_SYSTEM_NAME}")
  message(STATUS "  C++ compiler          : ${CMAKE_CXX_COMPILER}")
  message(STATUS "  C++ compiler id       : ${CMAKE_CXX_COMPILER_ID}")
  message(STATUS "  C++ compiler version  : ${CMAKE_CXX_COMPILER_VERSION}")
  message(STATUS "  Using ccache if found : ${USE_CCACHE}")
  if(USE_CCACHE)
    message(STATUS "  Found ccache          : ${CCACHE_PROGRAM}")
  endif()
  message(STATUS "  CXX flags             : ${CMAKE_CXX_FLAGS}")
  message(STATUS "  Shared LD flags       : ${CMAKE_SHARED_LINKER_FLAGS}")
  message(STATUS "  Static LD flags       : ${CMAKE_STATIC_LINKER_FLAGS}")
  message(STATUS "  Module LD flags       : ${CMAKE_MODULE_LINKER_FLAGS}")
  message(STATUS "  Build type            : ${CMAKE_BUILD_TYPE}")
  get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
  message(STATUS "  Compile definitions   : ${tmp}")
  message(STATUS "  CMAKE_PREFIX_PATH     : ${CMAKE_PREFIX_PATH}")
  message(STATUS "  CMAKE_INSTALL_PREFIX  : ${CMAKE_INSTALL_PREFIX}")
  message(STATUS "  USE_GOLD_LINKER       : ${USE_GOLD_LINKER}")
  message(STATUS "")

  message(STATUS "  TORCH_VERSION         : ${TORCH_VERSION}")
  message(STATUS "  BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
  message(STATUS "  BUILD_BINARY          : ${BUILD_BINARY}")
  message(STATUS "  BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
  if(${CAFFE2_LINK_LOCAL_PROTOBUF})
    message(STATUS "  Link local protobuf   : ${CAFFE2_LINK_LOCAL_PROTOBUF}")
  else()
    message(STATUS "  Protobuf compiler     : ${PROTOBUF_PROTOC_EXECUTABLE}")
    message(STATUS "  Protobuf includes     : ${PROTOBUF_INCLUDE_DIRS}")
    message(STATUS "  Protobuf libraries    : ${PROTOBUF_LIBRARIES}")
  endif()
  message(STATUS "  BUILD_PYTHON          : ${BUILD_PYTHON}")
  if(${BUILD_PYTHON})
    message(STATUS "    Python version      : ${Python_VERSION}")
    message(STATUS "    Python executable   : ${Python_EXECUTABLE}")
    message(STATUS "    Python library      : ${Python_LIBRARIES}")
    message(STATUS "    Python includes     : ${Python_INCLUDE_DIRS}")
    message(STATUS "    Python site-package : ${Python_SITELIB}")
  endif()
  message(STATUS "  BUILD_SHARED_LIBS     : ${BUILD_SHARED_LIBS}")
  message(STATUS "  CAFFE2_USE_MSVC_STATIC_RUNTIME : ${CAFFE2_USE_MSVC_STATIC_RUNTIME}")
  message(STATUS "  BUILD_TEST            : ${BUILD_TEST}")
  message(STATUS "  BUILD_JNI             : ${BUILD_JNI}")
  message(STATUS "  BUILD_MOBILE_AUTOGRAD : ${BUILD_MOBILE_AUTOGRAD}")
  message(STATUS "  BUILD_LITE_INTERPRETER: ${BUILD_LITE_INTERPRETER}")
  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
    message(STATUS "  CROSS_COMPILING_MACOSX : ${CROSS_COMPILING_MACOSX}")
  endif()
  message(STATUS "  INTERN_BUILD_MOBILE   : ${INTERN_BUILD_MOBILE}")
  message(STATUS "  TRACING_BASED         : ${TRACING_BASED}")

  message(STATUS "  USE_BLAS              : ${USE_BLAS}")
  if(${USE_BLAS})
    message(STATUS "    BLAS                : ${BLAS_INFO}")
    message(STATUS "    BLAS_HAS_SBGEMM     : ${BLAS_HAS_SBGEMM}")
  endif()
  message(STATUS "  USE_LAPACK            : ${USE_LAPACK}")
  if(${USE_LAPACK})
    message(STATUS "    LAPACK              : ${LAPACK_INFO}")
  endif()
  message(STATUS "  USE_ASAN              : ${USE_ASAN}")
  message(STATUS "  USE_TSAN              : ${USE_TSAN}")
  message(STATUS "  USE_CPP_CODE_COVERAGE : ${USE_CPP_CODE_COVERAGE}")
  message(STATUS "  USE_CUDA              : ${USE_CUDA}")
  if(${USE_CUDA})
    message(STATUS "    Split CUDA          : ${BUILD_SPLIT_CUDA}")
    message(STATUS "    CUDA static link    : ${CAFFE2_STATIC_LINK_CUDA}")
    message(STATUS "    USE_CUDNN           : ${USE_CUDNN}")
    message(STATUS "    USE_CUSPARSELT      : ${USE_CUSPARSELT}")
    message(STATUS "    USE_CUDSS           : ${USE_CUDSS}")
    message(STATUS "    USE_CUFILE          : ${USE_CUFILE}")
    message(STATUS "    CUDA version        : ${CUDA_VERSION}")
    message(STATUS "    USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
    message(STATUS "    USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
    if(${USE_CUDNN})
      message(STATUS "    cuDNN version       : ${CUDNN_VERSION}")
    endif()
    if(${USE_CUSPARSELT})
      message(STATUS "    cuSPARSELt version  : ${CUSPARSELT_VERSION}")
    endif()
    if(${USE_CUFILE})
      message(STATUS "    cufile library      : ${CUDA_cuFile_LIBRARY}")
    endif()
    message(STATUS "    CUDA root directory : ${CUDA_TOOLKIT_ROOT_DIR}")
    message(STATUS "    CUDA library        : ${CUDA_cuda_driver_LIBRARY}")
    message(STATUS "    cudart library      : ${CUDA_cudart_LIBRARY}")
    message(STATUS "    cublas library      : ${CUDA_cublas_LIBRARY}")
    message(STATUS "    cufft library       : ${CUDA_cufft_LIBRARY}")
    message(STATUS "    curand library      : ${CUDA_curand_LIBRARY}")
    message(STATUS "    cusparse library    : ${CUDA_cusparse_LIBRARY}")
    if(${USE_CUDNN})
      get_target_property(__tmp torch::cudnn INTERFACE_LINK_LIBRARIES)
      message(STATUS "    cuDNN library       : ${__tmp}")
    endif()
    if(${USE_CUSPARSELT})
      get_target_property(__tmp torch::cusparselt INTERFACE_LINK_LIBRARIES)
      message(STATUS "    cuSPARSELt library  : ${__tmp}")
    endif()
    if(${USE_CUDSS})
      get_target_property(__tmp torch::cudss INTERFACE_LINK_LIBRARIES)
      message(STATUS "    cuDSS library       : ${__tmp}")
    endif()
    message(STATUS "    nvrtc               : ${CUDA_nvrtc_LIBRARY}")
    message(STATUS "    CUDA include path   : ${CUDA_INCLUDE_DIRS}")
    message(STATUS "    NVCC executable     : ${CUDA_NVCC_EXECUTABLE}")
    message(STATUS "    CUDA compiler       : ${CMAKE_CUDA_COMPILER}")
    message(STATUS "    CUDA flags          : ${CMAKE_CUDA_FLAGS}")
    message(STATUS "    CUDA host compiler  : ${CMAKE_CUDA_HOST_COMPILER}")
    message(STATUS "    CUDA --device-c     : ${CUDA_SEPARABLE_COMPILATION}")
    message(STATUS "    USE_TENSORRT        : ${USE_TENSORRT}")
    if(${USE_TENSORRT})
      message(STATUS "      TensorRT runtime library : ${TENSORRT_LIBRARY}")
      message(STATUS "      TensorRT include path    : ${TENSORRT_INCLUDE_DIR}")
    endif()
  endif()
  message(STATUS "  USE_XPU               : ${USE_XPU}")
  if(${USE_XPU})
    message(STATUS "    SYCL include path   : ${SYCL_INCLUDE_DIR}")
    message(STATUS "    SYCL library        : ${SYCL_LIBRARY}")
  endif()
  message(STATUS "  USE_ROCM              : ${USE_ROCM}")
  if(${USE_ROCM})
    message(STATUS "    ROCM_VERSION        : ${ROCM_VERSION}")
    message(STATUS "    USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
    message(STATUS "    USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
  endif()
  message(STATUS "  BUILD_NVFUSER         : ${BUILD_NVFUSER}")
  message(STATUS "  USE_EIGEN_FOR_BLAS    : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
  message(STATUS "  USE_FBGEMM            : ${USE_FBGEMM}")
  message(STATUS "  USE_FAKELOWP          : ${USE_FAKELOWP}")
  message(STATUS "  USE_KINETO            : ${USE_KINETO}")
  message(STATUS "  USE_GFLAGS            : ${USE_GFLAGS}")
  message(STATUS "  USE_GLOG              : ${USE_GLOG}")
  message(STATUS "  USE_LITE_PROTO        : ${USE_LITE_PROTO}")
  message(STATUS "  USE_PYTORCH_METAL     : ${USE_PYTORCH_METAL}")
  message(STATUS "  USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
  message(STATUS "  USE_MPS               : ${USE_MPS}")
  message(STATUS "  USE_MKL               : ${CAFFE2_USE_MKL}")
  message(STATUS "  USE_MKLDNN            : ${USE_MKLDNN}")
  if(${USE_MKLDNN})
    message(STATUS "    USE_MKLDNN_ACL      : ${USE_MKLDNN_ACL}")
    message(STATUS "    USE_MKLDNN_CBLAS    : ${USE_MKLDNN_CBLAS}")
  endif()
  message(STATUS "  USE_UCC               : ${USE_UCC}")
  if(${USE_UCC})
    message(STATUS "    USE_SYSTEM_UCC      : ${USE_SYSTEM_UCC}")
  endif()
  message(STATUS "  USE_ITT               : ${USE_ITT}")
  message(STATUS "  USE_NCCL              : ${USE_NCCL}")
  if(${USE_NCCL})
    message(STATUS "    USE_SYSTEM_NCCL     : ${USE_SYSTEM_NCCL}")
  endif()
  message(STATUS "  USE_NNPACK            : ${USE_NNPACK}")
  message(STATUS "  USE_NUMPY             : ${USE_NUMPY}")
  message(STATUS "  USE_OBSERVERS         : ${USE_OBSERVERS}")
  message(STATUS "  USE_OPENCL            : ${USE_OPENCL}")
  message(STATUS "  USE_OPENMP            : ${USE_OPENMP}")
  message(STATUS "  USE_MIMALLOC          : ${USE_MIMALLOC}")
  if(${USE_MIMALLOC})
    message(STATUS "    USE_MIMALLOC_ON_MKL : ${USE_MIMALLOC_ON_MKL}")
  endif()
  message(STATUS "  USE_VULKAN            : ${USE_VULKAN}")
  if(${USE_VULKAN})
    message(STATUS "    USE_VULKAN_FP16_INFERENCE    : ${USE_VULKAN_FP16_INFERENCE}")
    message(STATUS "    USE_VULKAN_RELAXED_PRECISION : ${USE_VULKAN_RELAXED_PRECISION}")
  endif()
  message(STATUS "  USE_PROF              : ${USE_PROF}")
  message(STATUS "  USE_PYTORCH_QNNPACK   : ${USE_PYTORCH_QNNPACK}")
  message(STATUS "  USE_XNNPACK           : ${USE_XNNPACK}")
  message(STATUS "  USE_DISTRIBUTED       : ${USE_DISTRIBUTED}")
  if(${USE_DISTRIBUTED})
    message(STATUS "    USE_MPI             : ${USE_MPI}")
    message(STATUS "    USE_GLOO            : ${USE_GLOO}")
    message(STATUS "    USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
    message(STATUS "    USE_TENSORPIPE      : ${USE_TENSORPIPE}")
  endif()
  if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
    message(STATUS "  SELECTED_OP_LIST      : ${SELECTED_OP_LIST}")
  endif()
  message(STATUS "  Public Dependencies   : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}")
  message(STATUS "  Private Dependencies  : ${Caffe2_DEPENDENCY_LIBS}")
  message(STATUS "  Public CUDA Deps.     : ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}")
  message(STATUS "  Private CUDA Deps.    : ${Caffe2_CUDA_DEPENDENCY_LIBS}")
  # coreml
  message(STATUS "  USE_COREML_DELEGATE   : ${USE_COREML_DELEGATE}")
  message(STATUS "  BUILD_LAZY_TS_BACKEND : ${BUILD_LAZY_TS_BACKEND}")
  message(STATUS "  USE_ROCM_KERNEL_ASSERT : ${USE_ROCM_KERNEL_ASSERT}")
endfunction()