Enable onednn in pytorch for ppc64le architecture (#143743)

This PR enables oneDNN on the PowerPC (ppc64le) architecture, which makes it possible to quantize models via oneDNN on PowerPC.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143743
Approved by: https://github.com/malfet
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-10-20 21:14:14 +08:00 · 2025-03-07 23:35:47 +00:00
parent 187d5c0eb1
commit 26f8d81037
3 changed files with 18 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,11 +180,14 @@ endif()

 set(CPU_AARCH64 OFF)
 set(CPU_INTEL OFF)
+set(CPU_POWER OFF)

 if(CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|x86_64)")
  set(CPU_INTEL ON)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)")
  set(CPU_AARCH64 ON)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
+  set(CPU_POWER ON)
 endif()

 # For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
@@ -315,8 +318,8 @@ cmake_dependent_option(USE_ITT "Use Intel(R) VTune Profiler ITT functionality"
 # Ensure that an MKLDNN build is the default for x86 CPUs but optional for
 # AArch64 (dependent on -DUSE_MKLDNN).
 cmake_dependent_option(
-  USE_MKLDNN "Use MKLDNN. Only available on x86, x86_64, and AArch64."
-  "${CPU_INTEL}" "CPU_INTEL OR CPU_AARCH64" OFF)
+  USE_MKLDNN "Use MKLDNN. Only available on x86, x86_64, AArch64, and ppc64le."
+  "${CPU_INTEL}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
 cmake_dependent_option(
  USE_MKLDNN_ACL "Use Compute Library for the Arm architecture." OFF
  "USE_MKLDNN AND CPU_AARCH64" OFF)
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@@ -348,6 +348,8 @@ void gemm(
   // MKLDNN also supports ARM for bf16, and the bypass is only
   // currently intended for x86/x86_64.
   const bool use_bf16_gemv_trans = false;
+#elif defined(__powerpc__)
+   const bool use_bf16_gemv_trans = false;
 #else
   const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
     !cpuinfo_has_x86_avx512bf16();
@@ -378,8 +380,12 @@ void gemm(
   // we should not bother checking for !cpuinfo_has_x86_avx512fp16() here,
   // because "onednn (mkldnn) won't use avx512fp16 to compute gemms by default
   // because the avx512fp16 fma would incur accuracy loss".
+#if defined(__powerpc__)
+   const bool fp16_gemv_trans_would_be_faster = false;
+#else
   const bool fp16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
     cpuinfo_has_x86_f16c();
+#endif
   const bool use_fp16_gemv_trans = fp16_gemv_trans_would_be_faster &&
     transa == TransposeType::Transpose &&
     transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
--- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
@@ -5,7 +5,9 @@
 #include <ATen/Tensor.h>
 #include <ATen/native/quantized/PackedParams.h>
 #include <ideep.hpp>
+#if !defined(__powerpc__)
 #include <cpuinfo.h>
+#endif

 #include <c10/util/CallOnce.h>

@@ -432,7 +434,11 @@ inline bool should_use_onednn_quant(
 #if !defined(__linux__)
  return false;
 #else
-  bool vnni_available = cpuinfo_has_x86_avx512vnni();
+#if defined(__powerpc__)
+  constexpr auto vnni_available = true;
+#else
+  const auto vnni_available = cpuinfo_has_x86_avx512vnni();
+#endif
  bool w_sym_quant =
      is_weight_symmetric_quant(weight, is_transposed_conv);
  bool opad_all_zero =