Change the quantizer to match the behavior of the FBGEMM implementation (#20892)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/20892

FBGEMM uses 64-bit intermediate values; we need to change our implementation to match.

Reviewed By: jerryzh168

Differential Revision: D15487664

fbshipit-source-id: 29cba26093c6f9aeafce14982c1ae12149e63562
Author: Zafar Takhirov (2019-05-24 00:33:02 -07:00)
Committed by: Facebook GitHub Bot
Parent: fc941d3bca
Commit: 1bb728fe14


@@ -134,13 +134,13 @@ T quantize_val(float scale, int32_t zero_point, float value) {
   // cases away from zero, and can be consistent with SIMD implementations for
   // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with
   // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode.
-  int32_t qvalue;
+  int64_t qvalue;
   constexpr int32_t qmin = std::numeric_limits<typename T::underlying>::min();
   constexpr int32_t qmax = std::numeric_limits<typename T::underlying>::max();
   checkZeroPoint<typename T::underlying>("quantize_val", zero_point);
-  qvalue = static_cast<int32_t>(std::nearbyint(value / scale + zero_point));
-  qvalue = std::max(qvalue, qmin);
-  qvalue = std::min(qvalue, qmax);
+  qvalue = static_cast<int64_t>(std::nearbyint(value / scale + zero_point));
+  qvalue = std::max<int64_t>(qvalue, qmin);
+  qvalue = std::min<int64_t>(qvalue, qmax);
   return static_cast<T>(qvalue);
 }
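
To make the overflow hazard concrete, here is a minimal standalone sketch of the same clamping pattern, specialized to an unsigned 8-bit quantized type. The name quantize_val_u8 and the main() driver are illustrative only, not the actual templated quantize_val in ATen.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>

// Illustrative standalone version of the clamping pattern from this patch,
// specialized to uint8_t (the real quantize_val is templated on the
// quantized type).
uint8_t quantize_val_u8(float scale, int32_t zero_point, float value) {
  constexpr int32_t qmin = std::numeric_limits<uint8_t>::min();  // 0
  constexpr int32_t qmax = std::numeric_limits<uint8_t>::max();  // 255
  // The 64-bit intermediate keeps large rounded results representable, so
  // the clamps below see the true value. Casting a float that is out of
  // int32_t's range is undefined behavior, so the old code could hand
  // std::max/std::min a garbage value before clamping.
  int64_t qvalue =
      static_cast<int64_t>(std::nearbyint(value / scale + zero_point));
  qvalue = std::max<int64_t>(qvalue, qmin);
  qvalue = std::min<int64_t>(qvalue, qmax);
  return static_cast<uint8_t>(qvalue);
}

int main() {
  // value / scale is about 3e10 here, well outside int32_t's range; the
  // 64-bit intermediate still clamps it cleanly to qmax.
  std::cout << static_cast<int>(quantize_val_u8(1e-10f, 0, 3.0f))  // 255
            << "\n";
}

With a 32-bit intermediate, the cast of the rounded value near 3e10 is undefined behavior and the subsequent clamps operate on whatever the cast produced; the 64-bit intermediate keeps the true value representable until it is clamped into [qmin, qmax], matching FBGEMM's behavior.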