Files
pytorch/third_party/mkl-dnn.BUILD
Jiang, Yanbing f2f25a5444 Upgrade submodule oneDNN to v3.7.1 (#148293)
This PR is to upgrade submodule oneDNN to v3.7.1.

## Improvements

- Improved performance of convolution and matmul primitives on Intel Xeon processors with Intel AMX instruction set support (formerly Sapphire Rapids and Granite Rapids).
- Improved performance of int8 and fp32 forward convolution primitive on processors with Intel AVX2 instruction set support.
- Improved performance of fp8 matmul primitives with bf16 and fp16 bias data type on Intel Xeon processors with Intel AMX instruction set support (formerly Sapphire Rapids and Granite Rapids).
- Introduced initial optimizations for Intel GPUs based on Xe3 architecture.
- Added bfloat16 support for SDPA, and implemented fp16 and bf16 GEMM kernels in SDPA.
- Fixed an fp16 matmul accuracy issue, an issue where SDPA could not be dispatched to the ukernel, bf16/fp16/fp32 convolution performance regressions, a page fault triggered by the INT8 kernel, a deconvolution precision issue with complex128 and fp64, and a GEMM correctness issue with float16.
- Improved bf16 matmul performance with fp32 destination with Arm Compute Library (ACL).
- Improved bf16 to fp32 reorder performance.
- Improved bf16 reorder performance.
- Improved bf16 convolution with ACL.

Fixes https://github.com/pytorch/pytorch/issues/136348.

## Validation results on CPU

1. NLP models accuracy/inference/training
![image](https://github.com/user-attachments/assets/859279b8-1631-4268-b226-7de9ac5870d8)

![image](https://github.com/user-attachments/assets/30ec7151-41ca-482a-9d2d-0c4850e75bab)

2. Torchbench cpu userbenchmark inference & training

![image](https://github.com/user-attachments/assets/71c9807c-caf9-4385-9990-d2ab637031cd)

3. Inductor quantization

![image](https://github.com/user-attachments/assets/3d2a3bd3-82fa-4566-8050-7ea5d6b61675)

4. Dynamo benchmarks
![image](https://github.com/user-attachments/assets/554ecce3-c85c-4a0e-88f1-2e73983c5dcd)
![image](https://github.com/user-attachments/assets/148c88f8-4367-4428-bb54-ce8a4deefd1b)
![image](https://github.com/user-attachments/assets/f2e744f4-d710-4699-acf4-1f130ecfadf1)
![image](https://github.com/user-attachments/assets/97128b80-4d0e-495a-aeda-dde3e70c96fd)
![image](https://github.com/user-attachments/assets/a9afce37-684c-45c0-b938-6dd7e0383805)
![image](https://github.com/user-attachments/assets/b8714236-9681-4fbe-8d98-be93deedab88)
![image](https://github.com/user-attachments/assets/4423061f-d133-45ba-98bd-d2f739e50431)
![image](https://github.com/user-attachments/assets/7955da10-3d23-493e-99fa-658f7f40035b)

## Validation results on XPU
Accuracy is the same as the baseline. Performance is shown below.
![image](https://github.com/user-attachments/assets/7645304d-5b1d-43f9-b840-9f846ed380a0)

## Validation results on ARM
![image](https://github.com/user-attachments/assets/080f7c02-0238-436f-ad20-5a9e3f6aafbb)
![image](https://github.com/user-attachments/assets/443742aa-ca61-41de-ae80-5d4c65cd0c87)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148293
Approved by: https://github.com/mingfeima, https://github.com/atalman
2025-03-04 13:56:45 +00:00

162 lines
6.9 KiB
Plaintext

load("@rules_cc//cc:defs.bzl", "cc_library")
load("@pytorch//third_party:substitution.bzl", "template_rule")
# Substitution map used to render include/oneapi/dnnl/dnnl_config.h.in into a
# concrete dnnl_config.h for a CPU-only OpenMP build (GPU runtime = NONE, no
# SYCL / Level Zero). Keys are the exact "#cmakedefine" template lines from
# the .h.in file; values are the literal lines written into the generated
# header, so both sides must match the upstream template byte-for-byte.
_DNNL_RUNTIME_OMP = {
# Threading/runtime selection: OpenMP on CPU, no GPU runtime or vendor.
"#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP",
"#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE",
"#cmakedefine DNNL_GPU_VENDOR DNNL_VENDOR_${DNNL_GPU_VENDOR}": "/* undef DNNL_GPU_VENDOR */",
"#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "/* undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE */",
# SYCL/Level Zero/CUDA/HIP backends are all disabled for this CPU build.
"#cmakedefine DNNL_WITH_SYCL": "/* #undef DNNL_WITH_SYCL */",
"#cmakedefine DNNL_WITH_LEVEL_ZERO": "/* #undef DNNL_WITH_LEVEL_ZERO */",
"#cmakedefine DNNL_SYCL_CUDA": "/* #undef DNNL_SYCL_CUDA */",
"#cmakedefine DNNL_SYCL_HIP": "/* #undef DNNL_SYCL_HIP */",
"#cmakedefine DNNL_SYCL_GENERIC": "/* #undef DNNL_SYCL_GENERIC */",
# Experimental features are turned off in the generated header.
# NOTE(review): DNNL_EXPERIMENTAL_UKERNEL is undefined here but is also set
# via `defines` on the cc_library below — confirm which one is intended to win.
"#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER",
"#cmakedefine DNNL_EXPERIMENTAL_UKERNEL": "/* undef DNNL_EXPERIMENTAL_UKERNEL */",
"#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL",
"#cmakedefine DNNL_EXPERIMENTAL_SPARSE": "#undef DNNL_EXPERIMENTAL_SPARSE",
"#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH",
"#cmakedefine DNNL_EXPERIMENTAL_PROFILING": "#undef DNNL_EXPERIMENTAL_PROFILING",
"#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS",
# Workload scope: the *_ALL=1 switches enable everything, so the individual
# per-primitive 0/1 flags below are effectively overridden by BUILD_PRIMITIVE_ALL.
"#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1",
"#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0",
"#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1",
"#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0",
"#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0",
"#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0",
"#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0",
"#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0",
"#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0",
"#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0",
"#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0",
"#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0",
"#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0",
"#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0",
"#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0",
"#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0",
"#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0",
"#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0",
"#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0",
"#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0",
"#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0",
"#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0",
"#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0",
"#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0",
# CPU ISA scope: build all CPU ISAs (the per-ISA flags are moot with ALL=1).
"#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 1",
"#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0",
"#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0",
"#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0",
"#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0",
# GPU ISA scope: flags are still substituted even though the GPU runtime is NONE.
"#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 1",
"#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0",
"#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0",
"#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0",
"#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0",
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0",
"#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0",
"#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0",
"#cmakedefine01 BUILD_XE3": "#define BUILD_XE3 0",
# GEMM kernel selection flags, all substituted to 0 here.
"#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 0",
"#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0",
"#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0",
"#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0",
"#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0",
}
# Generate dnnl_version.h from its CMake template. The version numbers here
# must be kept in lockstep with the oneDNN submodule commit (v3.7.1).
template_rule(
name = "include_dnnl_version",
src = "include/oneapi/dnnl/dnnl_version.h.in",
out = "include/oneapi/dnnl/dnnl_version.h",
substitutions = {
"@DNNL_VERSION_MAJOR@": "3",
"@DNNL_VERSION_MINOR@": "7",
"@DNNL_VERSION_PATCH@": "1",
},
)
# Generate dnnl_config.h using the OpenMP/CPU-only substitution map above,
# replacing the CMake configure step that oneDNN normally performs.
template_rule(
name = "include_dnnl_config",
src = "include/oneapi/dnnl/dnnl_config.h.in",
out = "include/oneapi/dnnl/dnnl_config.h",
substitutions = _DNNL_RUNTIME_OMP,
)
# Generate dnnl_version_hash.h. The hash pins the exact oneDNN submodule
# commit and must be updated on every submodule bump alongside the version
# numbers above.
template_rule(
name = "include_dnnl_version_hash",
src = "include/oneapi/dnnl/dnnl_version_hash.h.in",
out = "include/oneapi/dnnl/dnnl_version_hash.h",
substitutions = {"@DNNL_VERSION_HASH@": "8d263e693366ef8db40acc569cc7d8edf644556d",}
)
# Build oneDNN (historically "mkl-dnn") from the common and x86/generic CPU
# sources. Non-x86 backends (aarch64, rv64) and the SYCL backend are excluded
# because this target configures a CPU-only OpenMP build.
cc_library(
name = "mkl-dnn",
srcs = glob([
"src/common/*.cpp",
"src/cpu/**/*.cpp",
"src/cpu/**/**/*.cpp",
], exclude=[
"src/cpu/aarch64/**/*.cpp",
"src/cpu/rv64/**/*.cpp",
"src/cpu/sycl/**/*.cpp",
]),
# Headers: public API headers plus internal CPU/common headers, and the
# three generated headers appended explicitly since glob() cannot see
# generated files.
hdrs = glob([
"include/oneapi/dnnl/*.h",
"include/oneapi/dnnl/*.hpp",
"include/*.h",
"include/*.hpp",
"src/cpu/**/*.hpp",
"src/cpu/**/*.h",
"src/cpu/**/**/*.h",
"src/common/*.hpp",
"src/common/**/**/*.h",
"src/common/ittnotify/jitprofiling.h",
], exclude=[
"src/cpu/aarch64/**/*.hpp",
"src/cpu/aarch64/**/*.h",
"src/cpu/rv64/**/*.hpp",
"src/cpu/rv64/**/*.h",
"src/cpu/sycl/**/*.hpp",
]) + [
"include/oneapi/dnnl/dnnl_config.h",
"include/oneapi/dnnl/dnnl_version.h",
"include/oneapi/dnnl/dnnl_version_hash.h",
],
copts = [
"-DDNNL_DLL",
"-DDNNL_DLL_EXPORTS",
"-DDNNL_ENABLE_CONCURRENT_EXEC",
"-D__STDC_CONSTANT_MACROS",
"-D__STDC_LIMIT_MACROS",
"-fno-strict-overflow",
"-fopenmp",
] + select({
# Under TSan the OpenMP runtime is avoided; 0/2 are the numeric
# DNNL_RUNTIME_* values. NOTE(review): presumably 2 == DNNL_RUNTIME_OMP
# and 0 == DNNL_RUNTIME_NONE per dnnl_config.h — confirm against the
# oneDNN headers when bumping the submodule.
"@pytorch//tools/config:thread_sanitizer": ["-DDNNL_CPU_RUNTIME=0"],
"//conditions:default": ["-DDNNL_CPU_RUNTIME=2"],
}),
includes = [
"include/",
"include/oneapi/",
"include/oneapi/dnnl/",
"src/",
"src/common/",
"src/cpu/",
"src/cpu/x64/xbyak/",
],
visibility = ["//visibility:public"],
# -lgomp: link GNU OpenMP to satisfy -fopenmp above.
linkopts = [
"-lgomp",
],
deps = [
"@mkl",
],
# `defines` propagate to dependents, unlike copts.
# NOTE(review): DNNL_EXPERIMENTAL_UKERNEL and ONEDNN_BUILD_GRAPH are defined
# here but undefined in the generated dnnl_config.h above — confirm the
# intended precedence.
defines = [
"DNNL_ENABLE_MAX_CPU_ISA",
"DNNL_ENABLE_CONCURRENT_EXEC",
"DNNL_ENABLE_PRIMITIVE_CACHE",
"DNNL_ENABLE_CPU_ISA_HINTS",
"DNNL_EXPERIMENTAL_UKERNEL",
"ONEDNN_BUILD_GRAPH",
],
)