[CUDA] Use accumulate type to improve accuracy of grid_sample on half precision inputs [v2] (#96586)

Fixes #96429

This PR is also a follow up for #90427. In that PR, we discussed whether the grid-index calculations in `grid_sampler_compute_source_index` should likewise be upcast to `opmath_t` https://github.com/pytorch/pytorch/pull/90427/files#r1048876708. Due to another unit test failure, we didn't upcast those calculations in that PR.

After some investigation, I found that the inaccurate results have nothing to do with the internals of `affine_grid`, even if it is calculated using `double` internally. As long as the input `grid` is passed to `grid_sample` in **half** precision, the results will be less inaccurate than with a **float** `grid`. This can be verified with a short C++ program like the one below (by setting `TYPE_T` to `__half` and to `float` in two separate compilations)

```cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include <iostream>

#ifndef TYPE_T
    #define TYPE_T float
#endif

int main() {
    using type_t = TYPE_T;
    // d holds a grid coordinate of 2/3 after rounding to half precision
    // (0.666503906...); when TYPE_T is float it stores that same
    // half-rounded value widened to float, so both builds start from
    // an identical coordinate.
    type_t d = static_cast<__half>((double)2.0 / 3.0);
    // Unnormalize the coordinate (mirrors grid_sample's source-index
    // computation for a size-3 dimension — presumably align_corners=False;
    // see the PR description). The arithmetic itself runs in float either
    // way; the difference is the final store: with TYPE_T = __half the
    // result rounds back to exactly 2.0, with TYPE_T = float it stays at
    // 1.999755859375 (see the two program outputs quoted below).
    type_t s = (((float)d + 1.f) * 3 - 1) / 2;

    printf("%.15f %.15f\n", (double)d, (double)s);
}
```

Outputs are
```
./float.out
0.666503906250000 1.999755859375000

./half.out
0.666503906250000 2.000000000000000
```

To resolve the discussion back in https://github.com/pytorch/pytorch/pull/90427/files#r1048876708, I've also increased the test tolerance in the failing unit test `issue_24823_1(torch.half)`.

For the original script in #96429, I got more accurate results with `align_corners = True`
```
align_corners = True
Expected result has mean absolute value of 0.5285 and maximum absolute value of 3.2067.
Half precision result is off by 0.0001 (0.02%) on average and 0.0010 (0.03%) at maximum.

align_corners = False
Expected result has mean absolute value of 0.5189 and maximum absolute value of 3.0101.
Half precision result is off by 0.0001 (0.02%) on average and 0.0010 (0.03%) at maximum.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96586
Approved by: https://github.com/ngimel
This commit is contained in:
Xiao Wang
2023-03-15 19:25:18 +00:00
committed by PyTorch MergeBot
parent 54cd4a67d0
commit 1716709d46
2 changed files with 15 additions and 11 deletions

View File

@ -58,8 +58,8 @@ namespace {
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
opmath_t iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
@ -194,9 +194,9 @@ namespace {
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];
scalar_t z = grid.data[grid_offset + 2 * grid_sCoor];
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];
opmath_t ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
opmath_t iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);

View File

@ -9954,10 +9954,11 @@ class TestNNDeviceType(NNTestCase):
(1, 1, 3, 3, 3))
grid[:, 1, 1, 1, 0] = float('inf')
result = torch.nn.functional.grid_sample(image, grid, padding_mode='zeros')
tol_override = {'atol': 0.005, 'rtol': 0} if dtype == torch.half else {}
self.assertEqual(result, torch.tensor([[[[[27., 26., 25.], [24., 23., 22.], [21., 20., 19.]],
[[18., 17., 16.], [15., 0., 13.], [12., 11., 10.]],
[[9., 8., 7.], [6., 5., 4.], [3., 2., 1.]]]]],
device=device, dtype=dtype))
device=device, dtype=dtype), **tol_override)
result.backward(torch.ones_like(result))
expected_grad = torch.ones_like(image)
expected_grad[0, 0, 1, 1, 1] = 0
@ -10066,20 +10067,23 @@ class TestNNDeviceType(NNTestCase):
@onlyCUDA
def test_grid_sample_half_precision(self):
def helper(shape_in, shape_out):
def helper(shape_in, shape_out, align_corners):
for mode in ('bilinear', 'nearest', 'bicubic'):
if len(shape_in) != 4 and mode == 'bicubic':
continue
data = torch.randn(shape_in, device='cuda', dtype=torch.half)
grid = torch.rand(shape_out, device='cuda', dtype=torch.half) * 2.0 - 1.0
out_half = F.grid_sample(data, grid, mode=mode, padding_mode='zeros', align_corners=False)
out_double = F.grid_sample(data.double(), grid.double(), mode=mode, padding_mode='zeros', align_corners=False)
out_half = F.grid_sample(data, grid, mode=mode, padding_mode='zeros', align_corners=align_corners)
out_double = F.grid_sample(data.double(), grid.double(), mode=mode, padding_mode='zeros',
align_corners=align_corners)
self.assertEqual(out_half, out_double.half(), msg="grid_sample with mode = {} doesn't match".format(mode))
helper((32, 64, 16, 16), (32, 8, 8, 2))
helper((32, 64, 16, 16, 16), (32, 8, 8, 8, 3))
helper((32, 64, 16, 16), (32, 8, 8, 2), True)
helper((32, 64, 16, 16, 16), (32, 8, 8, 8, 3), True)
helper((32, 64, 16, 16), (32, 8, 8, 2), False)
helper((32, 64, 16, 16, 16), (32, 8, 8, 8, 3), False)
def _test_gumbel_softmax_st_shapes(self, device, dtype, shape, dim, count_expected):
logits = torch.randn(shape, dtype=torch.float, device=device)