[Quant][CPU] fix fake_quantize_per_tensor_affine of inf values (#155109)

Fixes #154328

**Summary**
Failure reason:
The input value is infinity in float, and converting it to int64_t is undefined behavior. On x86 it is converted to the minimum value of int64_t, which is not the expected result.

Fix:
Clamp `(input * inv_scale + zero_point)` to `[quant_min, quant_max]` before converting it to int64_t.
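
To make the cast ordering concrete, here is a small, self-contained C++ sketch (not the kernel code itself; the constants and the `main` wrapper are illustrative) contrasting the old cast-then-clamp order with the fixed clamp-then-cast order:

```
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // Illustrative values: scale = 0.01, zero_point = 0, quint8 range [0, 255].
  const float inv_scale = 100.0f;
  const float sc = 0.01f;
  const int64_t quant_min = 0, quant_max = 255, z_point = 0;
  const float input = std::numeric_limits<float>::infinity();

  // Old order (buggy): casting an out-of-range float to int64_t is undefined
  // behavior; on x86 it typically yields the minimum int64_t value, so the
  // later clamp to [quant_min, quant_max] comes too late.
  // const auto qval_bad = static_cast<int64_t>(z_point + std::nearbyint(input * inv_scale));

  // Fixed order: clamp in floating point first, so the cast is always in range.
  const float qval_f = z_point + std::nearbyint(input * inv_scale);
  const auto qval = static_cast<int64_t>(std::fmin(std::fmax(qval_f, quant_min), quant_max));
  std::printf("fake-quantized value: %f\n", (qval - z_point) * sc);  // ~2.55
  return 0;
}
```

The new test added below exercises exactly this path by fake-quantizing an infinite input on CPU (and CUDA when available).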

**Test plan**
```
pytest test/quantization/core/test_workflow_ops.py -k test_fake_quantize_per_tensor_affine_inf
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155109
Approved by: https://github.com/leslie-fang-intel, https://github.com/jerryzh168
Commit e375d21bb9 (parent 1a568f4e5d)
Author: Xia, Weiwen
Date: 2025-06-14 14:12:38 +00:00
Committed by: PyTorch MergeBot
2 changed files with 23 additions and 6 deletions


@@ -2699,10 +2699,11 @@ void _fake_quantize_tensor_helper(
         bool* mask_val = (bool*)(data[1] + i * strides[1]);
         scalar_t* input_val = (scalar_t*)(data[2] + i * strides[2]);
-        const auto qval = static_cast<int64_t>(z_point + std::nearbyint(*input_val * inv_scale));
         if (fake_quant_on) {
-          *output_val = (std::fmin(std::fmax(qval, quant_min), quant_max) - z_point) * sc;
-          *mask_val = ((quant_min <= qval) && (qval <= quant_max));
+          auto qval_f = z_point + std::nearbyint(*input_val * inv_scale);
+          const auto qval = static_cast<int64_t>(std::fmin(std::fmax(qval_f, quant_min), quant_max));
+          *output_val = (qval - z_point) * sc;
+          *mask_val = ((quant_min <= qval_f) && (qval_f <= quant_max));
         } else {
           *output_val = *input_val;
           *mask_val = 1;
@@ -2718,10 +2719,11 @@ void _fake_quantize_tensor_helper(
         bool* mask_val = (bool*)(data[1] + i * strides[1]);
         scalar_t* input_val = (scalar_t*)(data[2] + i * strides[2]);
-        const auto qval = static_cast<int64_t>(z_point + std::nearbyint(*input_val * inv_scale));
         if (fake_quant_on) {
-          *output_val = (std::fmin(std::fmax(qval, quant_min), quant_max) - z_point) * sc;
-          *mask_val = ((quant_min <= qval) && (qval <= quant_max));
+          auto qval_f = z_point + std::nearbyint(*input_val * inv_scale);
+          const auto qval = static_cast<int64_t>(std::fmin(std::fmax(qval_f, quant_min), quant_max));
+          *output_val = (qval - z_point) * sc;
+          *mask_val = ((quant_min <= qval_f) && (qval_f <= quant_max));
         } else {
           *output_val = *input_val;
           *mask_val = 1;


@@ -1038,6 +1038,21 @@ class TestFakeQuantizeOps(TestCase):
             input, scale, zero_point, axis, quant_min, quant_max
         )
+    @skipIfTorchDynamo("Not a suitable test for TorchDynamo")
+    @given(dtype=st.sampled_from([torch.float, torch.float64, torch.half, torch.bfloat16]),
+           device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']))
+    def test_fake_quantize_per_tensor_affine_inf(self, dtype, device) -> None:
+        # https://github.com/pytorch/pytorch/issues/154328
+        input_tensor = torch.tensor([torch.inf], dtype=dtype).to(device)
+        scale = 0.01
+        zero_point = 0
+        quant_min = 0
+        quant_max = 255
+        result = torch.fake_quantize_per_tensor_affine(input_tensor, scale, zero_point, quant_min, quant_max)
+        ref_result = (min(quant_max, max(quant_min, torch.round(input_tensor / scale) + zero_point)) - zero_point) * scale
+        ref_result = torch.Tensor([ref_result]).to(dtype).to(device)
+        self.assertEqual(result, ref_result)

 class TestFusedObsFakeQuant(TestCase):
     @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),