Change the quantizer to match the behavior of the FBGEMM implementation (#20892)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/20892

FBGEMM uses 64-bit intermediate values; we need to change our implementation to match.

Reviewed By: jerryzh168

Differential Revision: D15487664

fbshipit-source-id: 29cba26093c6f9aeafce14982c1ae12149e63562
Author: Zafar Takhirov (2019-05-24 00:33:02 -07:00)
Committed by: Facebook GitHub Bot
Parent: fc941d3bca
Commit: 1bb728fe14


@@ -134,13 +134,13 @@ T quantize_val(float scale, int32_t zero_point, float value) {
   // cases away from zero, and can be consistent with SIMD implementations for
   // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with
   // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode.
-  int32_t qvalue;
+  int64_t qvalue;
   constexpr int32_t qmin = std::numeric_limits<typename T::underlying>::min();
   constexpr int32_t qmax = std::numeric_limits<typename T::underlying>::max();
   checkZeroPoint<typename T::underlying>("quantize_val", zero_point);
-  qvalue = static_cast<int32_t>(std::nearbyint(value / scale + zero_point));
-  qvalue = std::max(qvalue, qmin);
-  qvalue = std::min(qvalue, qmax);
+  qvalue = static_cast<int64_t>(std::nearbyint(value / scale + zero_point));
+  qvalue = std::max<int64_t>(qvalue, qmin);
+  qvalue = std::min<int64_t>(qvalue, qmax);
   return static_cast<T>(qvalue);
 }
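
To make the overflow hazard concrete, here is a minimal standalone sketch of the same clamping pattern, specialized to an unsigned 8-bit quantized type. The name quantize_val_u8 and the main() driver are illustrative only, not the actual templated quantize_val in ATen.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>

// Illustrative standalone version of the clamping pattern from this patch,
// specialized to uint8_t (the real quantize_val is templated on the
// quantized type).
uint8_t quantize_val_u8(float scale, int32_t zero_point, float value) {
  constexpr int32_t qmin = std::numeric_limits<uint8_t>::min();  // 0
  constexpr int32_t qmax = std::numeric_limits<uint8_t>::max();  // 255
  // The 64-bit intermediate keeps large rounded results representable, so
  // the clamps below see the true value. Casting a float that is out of
  // int32_t's range is undefined behavior, so the old code could hand
  // std::max/std::min a garbage value before clamping.
  int64_t qvalue =
      static_cast<int64_t>(std::nearbyint(value / scale + zero_point));
  qvalue = std::max<int64_t>(qvalue, qmin);
  qvalue = std::min<int64_t>(qvalue, qmax);
  return static_cast<uint8_t>(qvalue);
}

int main() {
  // value / scale is about 3e10 here, well outside int32_t's range; the
  // 64-bit intermediate still clamps it cleanly to qmax.
  std::cout << static_cast<int>(quantize_val_u8(1e-10f, 0, 3.0f))  // 255
            << "\n";
}

With a 32-bit intermediate, the cast of the rounded value near 3e10 is undefined behavior and the subsequent clamps operate on whatever the cast produced; the 64-bit intermediate keeps the true value representable until it is clamped into [qmin, qmax], matching FBGEMM's behavior.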