Files
pytorch/cmake/Summary.cmake
Xu Han 96b30dcb25 [Windows][cpu] mkl use mimalloc as allocator on Windows (#138419)
We did a lot of optimization for PyTorch Windows, and we got good progress of it. But still some models have performance gap between PyTorch Windows and PyTorch Linux. Ref: https://pytorch.org/blog/performance-boost-windows/#conclusion
From the blog conclusion, we found the `ResNet50` is typical case of it.

Let's focus on the `ResNet50`, and collect the profiling log:
```cmd
(nightly) D:\xu_git\dnnl_cb>python test_script_resnet50.py
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
                  model_inference         3.91%     682.427ms       100.00%       17.448s       17.448s             1
                     aten::conv2d         0.18%      30.906ms        64.79%       11.305s       2.133ms          5300
                aten::convolution         0.45%      78.031ms        64.62%       11.275s       2.127ms          5300
               aten::_convolution         0.30%      51.670ms        64.17%       11.196s       2.113ms          5300
         aten::mkldnn_convolution        63.58%       11.093s        63.87%       11.145s       2.103ms          5300
                 aten::batch_norm         0.13%      23.536ms        20.10%        3.506s     661.580us          5300
     aten::_batch_norm_impl_index         0.28%      49.486ms        19.96%        3.483s     657.139us          5300
          aten::native_batch_norm        19.26%        3.360s        19.64%        3.427s     646.615us          5300
                 aten::max_pool2d         0.01%       1.038ms         5.84%        1.018s      10.181ms           100
    aten::max_pool2d_with_indices         5.83%        1.017s         5.83%        1.017s      10.171ms           100
                       aten::add_         3.38%     588.907ms         3.38%     588.907ms      85.349us          6900
                      aten::relu_         0.35%      60.358ms         1.67%     292.155ms      59.624us          4900
                 aten::clamp_min_         1.33%     231.797ms         1.33%     231.797ms      47.306us          4900
                      aten::empty         0.46%      80.195ms         0.46%      80.195ms       1.513us         53000
                     aten::linear         0.01%     927.300us         0.23%      39.353ms     393.532us           100
                      aten::addmm         0.20%      35.379ms         0.21%      37.016ms     370.155us           100
                 aten::empty_like         0.12%      20.455ms         0.17%      29.976ms       5.656us          5300
                aten::as_strided_         0.11%      18.830ms         0.11%      18.830ms       3.553us          5300
        aten::adaptive_avg_pool2d         0.00%     419.900us         0.08%      14.265ms     142.647us           100
                       aten::mean         0.01%       1.737ms         0.08%      13.845ms     138.448us           100
                        aten::sum         0.05%       8.113ms         0.05%       8.648ms      86.479us           100
                    aten::resize_         0.03%       5.182ms         0.03%       5.182ms       0.978us          5300
                       aten::div_         0.01%       1.445ms         0.02%       3.460ms      34.600us           100
                         aten::to         0.00%     337.000us         0.01%       2.015ms      20.154us           100
                   aten::_to_copy         0.01%     977.500us         0.01%       1.678ms      16.784us           100
                      aten::copy_         0.01%       1.474ms         0.01%       1.474ms       7.371us           200
                          aten::t         0.00%     775.900us         0.01%       1.410ms      14.104us           100
                    aten::flatten         0.00%     420.900us         0.01%       1.311ms      13.106us           100
                       aten::view         0.01%     889.700us         0.01%     889.700us       8.897us           100
                  aten::transpose         0.00%     410.700us         0.00%     634.500us       6.345us           100
                     aten::expand         0.00%     496.800us         0.00%     566.800us       5.668us           100
                      aten::fill_         0.00%     534.800us         0.00%     534.800us       5.348us           100
                 aten::as_strided         0.00%     293.800us         0.00%     293.800us       1.469us           200
              aten::empty_strided         0.00%     241.700us         0.00%     241.700us       2.417us           100
               aten::resolve_conj         0.00%      54.800us         0.00%      54.800us       0.274us           200
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
Self CPU time total: 17.448s

Execution time: 20.02380895614624
```
We found the major kernel consume CPU resource is `aten::mkldnn_convolution`. It was dispatched to `MKLDNN`.
Acturally, we had optimized memory allocation via integrated mimalloc to pytorch C10 module. It helps PyTorch Windows boost a lot, but it does not cover `MKL` and `MKLDNN`'s intermediary temporary memory.
We still have potential to improve PyTorch Windows performance via optimize `MKL` and `MKLDNN`'s intermediary temporary memory.

So, I discussed with Intel MKL team, and get a method to register high performance memory allocation API to MKL, and it would help MKL to boost memory performance. Please check the online document: https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2023-0/redefining-memory-functions.html

This PR is optimize MKL memory alloction performance on Windows, via register mi_malloc to MKL. PR Changes:
1. Add cmake option: `USE_MIMALLOC_ON_MKL`, It is sub-option of `USE_MIMALLOC`.
2. Wrap and export mi_malloc APIs in C10, when `USE_MIMALLOC_ON_MKL` is `ON`.
3. Add MklAllocationHelp.cpp to register allocation APIs to MKL, when `USE_MIMALLOC_ON_MKL` is `ON`.

For `oneDNN`, it is still tracking in this proposal: https://github.com/oneapi-src/oneDNN/issues/1898

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138419
Approved by: https://github.com/jgong5, https://github.com/ezyang
2024-10-24 05:29:47 +00:00

196 lines
10 KiB
CMake

# Prints accumulated Caffe2 configuration summary
function(caffe2_print_configuration_summary)
message(STATUS "")
message(STATUS "******** Summary ********")
message(STATUS "General:")
message(STATUS " CMake version : ${CMAKE_VERSION}")
message(STATUS " CMake command : ${CMAKE_COMMAND}")
message(STATUS " System : ${CMAKE_SYSTEM_NAME}")
message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}")
message(STATUS " C++ compiler id : ${CMAKE_CXX_COMPILER_ID}")
message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS " Using ccache if found : ${USE_CCACHE}")
if(USE_CCACHE)
message(STATUS " Found ccache : ${CCACHE_PROGRAM}")
endif()
message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}")
message(STATUS " Shared LD flags : ${CMAKE_SHARED_LINKER_FLAGS}")
message(STATUS " Static LD flags : ${CMAKE_STATIC_LINKER_FLAGS}")
message(STATUS " Module LD flags : ${CMAKE_MODULE_LINKER_FLAGS}")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
message(STATUS " Compile definitions : ${tmp}")
message(STATUS " CMAKE_PREFIX_PATH : ${CMAKE_PREFIX_PATH}")
message(STATUS " CMAKE_INSTALL_PREFIX : ${CMAKE_INSTALL_PREFIX}")
message(STATUS " USE_GOLD_LINKER : ${USE_GOLD_LINKER}")
message(STATUS "")
message(STATUS " TORCH_VERSION : ${TORCH_VERSION}")
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
if(${CAFFE2_LINK_LOCAL_PROTOBUF})
message(STATUS " Link local protobuf : ${CAFFE2_LINK_LOCAL_PROTOBUF}")
else()
message(STATUS " Protobuf compiler : ${PROTOBUF_PROTOC_EXECUTABLE}")
message(STATUS " Protobuf includes : ${PROTOBUF_INCLUDE_DIRS}")
message(STATUS " Protobuf libraries : ${PROTOBUF_LIBRARIES}")
endif()
message(STATUS " BUILD_PYTHON : ${BUILD_PYTHON}")
if(${BUILD_PYTHON})
message(STATUS " Python version : ${Python_VERSION}")
message(STATUS " Python executable : ${Python_EXECUTABLE}")
message(STATUS " Python library : ${Python_LIBRARIES}")
message(STATUS " Python includes : ${Python_INCLUDE_DIRS}")
message(STATUS " Python site-package : ${Python_SITELIB}")
endif()
message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}")
message(STATUS " CAFFE2_USE_MSVC_STATIC_RUNTIME : ${CAFFE2_USE_MSVC_STATIC_RUNTIME}")
message(STATUS " BUILD_TEST : ${BUILD_TEST}")
message(STATUS " BUILD_JNI : ${BUILD_JNI}")
message(STATUS " BUILD_MOBILE_AUTOGRAD : ${BUILD_MOBILE_AUTOGRAD}")
message(STATUS " BUILD_LITE_INTERPRETER: ${BUILD_LITE_INTERPRETER}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
message(STATUS " CROSS_COMPILING_MACOSX : ${CROSS_COMPILING_MACOSX}")
endif()
message(STATUS " INTERN_BUILD_MOBILE : ${INTERN_BUILD_MOBILE}")
message(STATUS " TRACING_BASED : ${TRACING_BASED}")
message(STATUS " USE_BLAS : ${USE_BLAS}")
if(${USE_BLAS})
message(STATUS " BLAS : ${BLAS_INFO}")
message(STATUS " BLAS_HAS_SBGEMM : ${BLAS_HAS_SBGEMM}")
endif()
message(STATUS " USE_LAPACK : ${USE_LAPACK}")
if(${USE_LAPACK})
message(STATUS " LAPACK : ${LAPACK_INFO}")
endif()
message(STATUS " USE_ASAN : ${USE_ASAN}")
message(STATUS " USE_TSAN : ${USE_TSAN}")
message(STATUS " USE_CPP_CODE_COVERAGE : ${USE_CPP_CODE_COVERAGE}")
message(STATUS " USE_CUDA : ${USE_CUDA}")
if(${USE_CUDA})
message(STATUS " Split CUDA : ${BUILD_SPLIT_CUDA}")
message(STATUS " CUDA static link : ${CAFFE2_STATIC_LINK_CUDA}")
message(STATUS " USE_CUDNN : ${USE_CUDNN}")
message(STATUS " USE_CUSPARSELT : ${USE_CUSPARSELT}")
message(STATUS " USE_CUDSS : ${USE_CUDSS}")
message(STATUS " USE_CUFILE : ${USE_CUFILE}")
message(STATUS " CUDA version : ${CUDA_VERSION}")
message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
if(${USE_CUDNN})
message(STATUS " cuDNN version : ${CUDNN_VERSION}")
endif()
if(${USE_CUSPARSELT})
message(STATUS " cuSPARSELt version : ${CUSPARSELT_VERSION}")
endif()
if(${USE_CUFILE})
message(STATUS " cufile library : ${CUDA_cuFile_LIBRARY}")
endif()
message(STATUS " CUDA root directory : ${CUDA_TOOLKIT_ROOT_DIR}")
message(STATUS " CUDA library : ${CUDA_cuda_driver_LIBRARY}")
message(STATUS " cudart library : ${CUDA_cudart_LIBRARY}")
message(STATUS " cublas library : ${CUDA_cublas_LIBRARY}")
message(STATUS " cufft library : ${CUDA_cufft_LIBRARY}")
message(STATUS " curand library : ${CUDA_curand_LIBRARY}")
message(STATUS " cusparse library : ${CUDA_cusparse_LIBRARY}")
if(${USE_CUDNN})
get_target_property(__tmp torch::cudnn INTERFACE_LINK_LIBRARIES)
message(STATUS " cuDNN library : ${__tmp}")
endif()
if(${USE_CUSPARSELT})
get_target_property(__tmp torch::cusparselt INTERFACE_LINK_LIBRARIES)
message(STATUS " cuSPARSELt library : ${__tmp}")
endif()
if(${USE_CUDSS})
get_target_property(__tmp torch::cudss INTERFACE_LINK_LIBRARIES)
message(STATUS " cuDSS library : ${__tmp}")
endif()
message(STATUS " nvrtc : ${CUDA_nvrtc_LIBRARY}")
message(STATUS " CUDA include path : ${CUDA_INCLUDE_DIRS}")
message(STATUS " NVCC executable : ${CUDA_NVCC_EXECUTABLE}")
message(STATUS " CUDA compiler : ${CMAKE_CUDA_COMPILER}")
message(STATUS " CUDA flags : ${CMAKE_CUDA_FLAGS}")
message(STATUS " CUDA host compiler : ${CMAKE_CUDA_HOST_COMPILER}")
message(STATUS " CUDA --device-c : ${CUDA_SEPARABLE_COMPILATION}")
message(STATUS " USE_TENSORRT : ${USE_TENSORRT}")
if(${USE_TENSORRT})
message(STATUS " TensorRT runtime library: ${TENSORRT_LIBRARY}")
message(STATUS " TensorRT include path : ${TENSORRT_INCLUDE_DIR}")
endif()
endif()
message(STATUS " USE_XPU : ${USE_XPU}")
if(${USE_XPU})
message(STATUS " SYCL include path : ${SYCL_INCLUDE_DIR}")
message(STATUS " SYCL library : ${SYCL_LIBRARY}")
endif()
message(STATUS " USE_ROCM : ${USE_ROCM}")
if(${USE_ROCM})
message(STATUS " ROCM_VERSION : ${ROCM_VERSION}")
message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
endif()
message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}")
message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
message(STATUS " USE_FAKELOWP : ${USE_FAKELOWP}")
message(STATUS " USE_KINETO : ${USE_KINETO}")
message(STATUS " USE_GFLAGS : ${USE_GFLAGS}")
message(STATUS " USE_GLOG : ${USE_GLOG}")
message(STATUS " USE_LITE_PROTO : ${USE_LITE_PROTO}")
message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}")
message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
message(STATUS " USE_MPS : ${USE_MPS}")
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
message(STATUS " USE_MKLDNN : ${USE_MKLDNN}")
if(${USE_MKLDNN})
message(STATUS " USE_MKLDNN_ACL : ${USE_MKLDNN_ACL}")
message(STATUS " USE_MKLDNN_CBLAS : ${USE_MKLDNN_CBLAS}")
endif()
message(STATUS " USE_UCC : ${USE_UCC}")
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
endif()
message(STATUS " USE_ITT : ${USE_ITT}")
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
endif()
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
message(STATUS " USE_OBSERVERS : ${USE_OBSERVERS}")
message(STATUS " USE_OPENCL : ${USE_OPENCL}")
message(STATUS " USE_OPENMP : ${USE_OPENMP}")
message(STATUS " USE_MIMALLOC : ${USE_MIMALLOC}")
if(${USE_MIMALLOC})
message(STATUS " USE_MIMALLOC_ON_MKL : ${USE_MIMALLOC_ON_MKL}")
endif()
message(STATUS " USE_VULKAN : ${USE_VULKAN}")
if(${USE_VULKAN})
message(STATUS " USE_VULKAN_FP16_INFERENCE : ${USE_VULKAN_FP16_INFERENCE}")
message(STATUS " USE_VULKAN_RELAXED_PRECISION : ${USE_VULKAN_RELAXED_PRECISION}")
endif()
message(STATUS " USE_PROF : ${USE_PROF}")
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()
message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}")
message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}")
message(STATUS " Public CUDA Deps. : ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}")
message(STATUS " Private CUDA Deps. : ${Caffe2_CUDA_DEPENDENCY_LIBS}")
# coreml
message(STATUS " USE_COREML_DELEGATE : ${USE_COREML_DELEGATE}")
message(STATUS " BUILD_LAZY_TS_BACKEND : ${BUILD_LAZY_TS_BACKEND}")
message(STATUS " USE_ROCM_KERNEL_ASSERT : ${USE_ROCM_KERNEL_ASSERT}")
endfunction()