Add uint8 support for interpolate for CPU images (#90771)

Joint work with @vfdev-5

This PR introduces native uint8 support in `interpolate()` for the `bilinear` ~and `bicubic`~ modes on CPU images (`mode=nearest[_exact]` was already supported).

On a typical torchvision training job on ImageNet, the speed-up is ~4X when AVX2 is supported, comparing the native uint8 path (this PR) against what torchvision's `Resize()` currently does:

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size         output_size channels_last AA    mode       num_threads  speed-up float vs uint8 (this PR)
(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=1   4X    2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=1   2.1X  1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=1   3X    2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=1   4X    2.4ms vs 0.6ms

(Note: we removed bicubic support for now)
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=1   4X    2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=1   5X    3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=1   3X    2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=1   4X    2.8ms vs 0.7ms

```

There is still room for further speed-ups (see TODOs in the code).
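
For reference, here is a minimal sketch of the two paths being compared (the tensor name and sizes are illustrative; the uint8 path requires this PR):

```py
import torch
import torch.nn.functional as F

img = torch.randint(0, 256, size=(1, 3, 270, 268), dtype=torch.uint8)

# "float" baseline (what Resize() currently does): round-trip through float32.
out_float = F.interpolate(img.float(), size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)
out_float = out_float.round().clamp(0, 255).to(torch.uint8)

# "uint8" path (this PR): interpolate the uint8 tensor directly.
out_uint8 = F.interpolate(img, size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)
```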

#### More benchmark details

With AVX2 support: speed-ups typically range from 1.5X to 10X. A few edge cases are slower; it's worth investigating why.

<details>

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size         output_size channels_last AA    mode       num_threads  speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=1   5X    1.1ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=1   5X    1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=1   2.8X  0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=1   7X    1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=1   5X    1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=1   12X   2.9ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=1   3X    0.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=1   7X    1.8ms vs 0.2ms

(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=2   2.6X  0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=2   2.8X  0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=2   1.7X  0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=2   1.4X  0.3ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=2   2.7X  0.7ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=2   7X    1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=2   1.8X  0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=2   4X    1.0ms vs 0.2ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=1   4X    2.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=1   3.0X  1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=1   3X    1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=1   4X    2.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=1   4X    2.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=1   7X    4.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=1   3X    2.1ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=1   4X    2.6ms vs 0.6ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=2   2.7X  1.6ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=2   2.6X  1.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=2   2.1X  1.2ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=2   1.6X  0.9ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=2   2.8X  1.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=2   5X    2.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=2   2.3X  1.4ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=2   3X    1.9ms vs 0.6ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=1   4X    26.6ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=1   4X    23.9ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=1   2.5X  16.8ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=1   5X    33.1ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=1   4X    25.9ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=1   8X    59.6ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=1   1.9X  14.3ms vs 7.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=1   5X    35.4ms vs 7.3ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=2   2.0X  13.6ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=2   2.2X  14.8ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=2   1.3X  8.8ms vs 6.9ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=2   1.2X  8.4ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=2   1.8X  12.8ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=2   4X    32.1ms vs 7.2ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=2   1.4X  10.1ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=2   2.9X  20.9ms vs 7.3ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=1   1.4X  0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=1   0.7X  0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=1   1.3X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=1   1.4X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=1   2.1X  0.7ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=1   1.3X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=1   1.9X  0.6ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=1   1.0X  0.3ms vs 0.3ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=2   1.0X  0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=2   0.6X  0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=2   0.8X  0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=2   1.4X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=2   1.4X  0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=2   1.2X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=2   1.2X  0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=2   0.9X  0.3ms vs 0.3ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=1   4X    2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=1   2.1X  1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=1   3X    2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=1   4X    2.4ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=1   4X    2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=1   5X    3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=1   3X    2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=1   4X    2.8ms vs 0.7ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=2   1.5X  1.0ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=2   1.2X  0.8ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=2   2.3X  1.5ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=2   1.9X  1.2ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=2   1.6X  1.2ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=2   4X    2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=2   2.4X  1.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=2   2.8X  1.8ms vs 0.6ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=1   2.1X  12.8ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=1   0.6X  3.8ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=1   1.2X  7.1ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=1   1.9X  11.0ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=1   2.0X  12.6ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=1   1.0X  6.1ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=1   1.8X  11.3ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=1   0.8X  4.6ms vs 6.0ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=2   1.6X  9.3ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=2   0.3X  2.0ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=2   1.2X  7.2ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=2   0.3X  1.6ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=2   1.1X  7.1ms vs 6.5ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=2   0.6X  3.3ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=2   0.9X  5.9ms vs 6.3ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=2   0.4X  2.4ms vs 5.9ms
```

</details>

Without AVX2 support: no significant speed-up, but there are various possible improvements (see TODOs).

<details>

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size         output_size channels_last AA    mode       num_threads  speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=1   0.9X  1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=1   0.9X  1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=1   0.8X  0.9ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=1   1.5X  1.7ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=1   0.9X  1.6ms vs 1.8ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=1   2.1X  3.9ms vs 1.9ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=1   0.8X  1.1ms vs 1.4ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=1   1.7X  2.4ms vs 1.5ms

(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=2   0.9X  0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=2   0.9X  0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=2   0.9X  0.5ms vs 0.6ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=2   0.7X  0.5ms vs 0.7ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=2   0.9X  0.9ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=2   2.1X  2.0ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=2   0.8X  0.6ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=2   1.7X  1.3ms vs 0.8ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=1   1.0X  3.0ms vs 3.0ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=1   1.0X  2.8ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=1   1.0X  2.3ms vs 2.2ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=1   1.4X  3.3ms vs 2.3ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=1   1.0X  3.5ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=1   1.7X  6.1ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=1   0.9X  2.6ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=1   1.4X  4.2ms vs 2.9ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=2   1.0X  1.7ms vs 1.7ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=2   0.9X  1.6ms vs 1.8ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=2   0.9X  1.3ms vs 1.4ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=2   0.7X  1.1ms vs 1.6ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=2   1.0X  2.0ms vs 2.0ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=2   1.7X  3.2ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=2   0.8X  1.5ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=2   1.2X  2.3ms vs 1.9ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=1   1.1X  34.7ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=1   1.0X  31.2ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=1   1.0X  23.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=1   1.9X  42.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=1   0.9X  33.9ms vs 37.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=1   2.2X  84.0ms vs 37.5ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=1   1.0X  28.4ms vs 28.8ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=1   2.0X  56.7ms vs 28.8ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=2   1.1X  17.5ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=2   1.1X  17.7ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=2   0.8X  8.8ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=2   1.0X  11.1ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=2   1.1X  19.9ms vs 18.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=2   2.3X  42.5ms vs 18.7ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=2   1.0X  14.1ms vs 14.5ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=2   2.0X  28.4ms vs 14.5ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=1   1.0X  0.6ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=1   0.7X  0.3ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=1   0.9X  0.5ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=1   1.7X  0.6ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=1   1.0X  0.8ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=1   1.1X  0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=1   0.9X  0.7ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=1   0.9X  0.4ms vs 0.4ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=2   1.0X  0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=2   0.8X  0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=2   0.9X  0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=2   1.3X  0.3ms vs 0.2ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=2   1.0X  0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=2   1.3X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=2   0.9X  0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=2   1.2X  0.3ms vs 0.3ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=1   0.8X  2.1ms vs 2.5ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=1   0.7X  1.6ms vs 2.4ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=1   1.2X  2.4ms vs 2.1ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=1   1.3X  2.6ms vs 2.0ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=1   1.1X  3.4ms vs 3.0ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=1   1.7X  4.8ms vs 2.8ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=1   1.1X  2.9ms vs 2.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=1   1.4X  3.5ms vs 2.4ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=2   0.9X  1.2ms vs 1.3ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=2   1.3X  1.6ms vs 1.2ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=2   0.8X  0.9ms vs 1.1ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=2   1.3X  1.3ms vs 1.0ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=2   1.4X  2.2ms vs 1.6ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=2   1.9X  2.8ms vs 1.5ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=2   0.8X  1.1ms vs 1.4ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=2   1.7X  2.1ms vs 1.3ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=1   1.0X  10.0ms vs 9.9ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=1   0.7X  4.6ms vs 6.2ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=1   0.9X  9.1ms vs 9.8ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=1   1.7X  9.4ms vs 5.7ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=1   1.0X  15.2ms vs 14.8ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=1   1.0X  7.6ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=1   0.9X  13.3ms vs 14.4ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=1   0.8X  5.9ms vs 7.0ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=2   1.2X  6.0ms vs 5.2ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=2   0.7X  2.3ms vs 3.2ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=2   1.0X  4.8ms vs 5.0ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=2   0.7X  1.9ms vs 2.9ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=2   1.6X  12.3ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=2   1.0X  3.9ms vs 3.9ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=2   1.0X  7.0ms vs 7.3ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=2   0.9X  3.0ms vs 3.5ms

```

</details>

Benchmark code. The first script runs the `interpolate` micro-benchmarks with `operator_benchmark`; the second parses two such benchmark outputs (files named `main` and `new` by default) and prints the comparison tables shown above.
<details>

```py
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for interpolate operator."""

class InterpolateBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, input_size, output_size, channels_last=False, mode='linear', antialias=False, dtype=torch.float):

        input_image = torch.randint(0, 256, size=input_size, dtype=torch.uint8, device='cpu')

        if channels_last:
            input_image = input_image.contiguous(memory_format=torch.channels_last)

        self.inputs = {
            "input_image": input_image,
            "output_size": output_size,
            "mode": mode,
            "antialias": antialias,
            "dtype":dtype,
        }

        self.set_module_name("interpolate")

    def forward(self, input_image, output_size, mode, antialias, dtype):
        if dtype == torch.float:
            # "float" baseline: convert to float before interpolating...
            input_image = input_image.float()

        out = torch.nn.functional.interpolate(input_image, size=output_size, mode=mode, align_corners=False, antialias=antialias)
        if dtype == torch.float:
            # ...then round/clamp back to uint8, as Resize() currently does.
            out = out.round().clamp(min=0, max=255).to(torch.uint8)
        return out

def make_config():
    sizes = (
        ((224, 224), (64, 64)),
        ((270, 268), (224, 224)),
        ((256, 256), (1024, 1024)),
    )

    attrs = []
    for (HW1, HW2) in sizes:
        attrs.append([(1, 3, *HW1), HW2])  # 3 channels
        # attrs.append([(1, 1, *HW1), HW2])  # 1 channel

        attrs.append([(1, 3, *HW2), HW1])  # 3 channels
        # attrs.append([(1, 1, *HW2), HW1])  # 1 channel

    config = op_bench.config_list(
        attr_names=["input_size", "output_size"],
        attrs=attrs,
        cross_product_configs={
            'channels_last': [True, False],
            'mode': ["bilinear", "bicubic"],
            'antialias': [True, False],
            # 'dtype': [torch.float, torch.uint8]
            # 'dtype': [torch.uint8]
            'dtype': [torch.float]
        },
        tags=["short"],
    )

    return config

config = make_config()
op_bench.generate_pt_test(config, InterpolateBenchmark)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()

```

```py
import re
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("f1", nargs="?", default="main")
parser.add_argument("f2", nargs="?", default="new")
args = parser.parse_args()

with open(args.f1) as f:
    main = f.readlines()
with open(args.f2) as f:
    new = f.readlines()

out = []

for main_line, new_line in zip(main, new):
    # num_threads=1  # TODO: remove
    if main_line.startswith("num_threads="):
        num_threads = int(main_line.split("=")[-1])
    if main_line.startswith("# Input"):
        deets = f"{main_line.strip()}, {num_threads=}"
    if main_line.startswith("Forward"):
        main_time = float(main_line.split()[-1])
        new_time = float(new_line.split()[-1])
        ratio = main_time / new_time
        fmt = ".1f" if ratio < 3 else ".0f"
        improv = f"{ratio:{fmt}}X"
        time_fmt = ",.3f" if new_time < 100 else ",.1f"
        deets = deets.strip().replace("# Input: ", "")
        deets = deets.replace(": ", "=")
        deets = deets.replace("input_size=", "")
        deets = deets.replace(", output_size=", " -> ")
        deets = deets.replace("dtype=torch.", "")
        deets = deets.replace("mode=", "")
        deets = deets.replace("antialias=", "")
        deets = deets.replace("channels_last=", "")
        # deets = deets.replace("channels_last=True, ", "")
        split = deets.split(",")

        # size = ','.join(split[:-3])
        # mode, dtype, threads = split[-3:]
        # deets = f"{size:<30} {mode:<15} {dtype:<10} {threads:<15}"

        size = ','.join(split[:-5])
        channels_last, mode, antialias, dtype, threads = split[-5:]
        deets = f"{size:<33} {channels_last:<7} {antialias:<7} {mode:<10} {threads:<15}"

        l = f"{deets}  {improv:<5} {main_time / 1000:{time_fmt}}ms vs {new_time / 1000:{time_fmt}}ms"
        out.append(l)

def key(s):
    # s = ''.join(s.split()[1:]) # remove "N.nX" part
    num_threads = (int(re.findall(r"num_threads=(\d+)", s)[0]),)

    input_shape, output_shape = re.findall(r"\(.*?\)", s)
    input_shape = input_shape[1:-1]  # remove parenthesis
    input_HW = tuple(int(x) for x in input_shape.split(",")[-2:])
    input_C = (-int(input_shape.split(",")[1]),)

    output_HW = tuple(int(x) for x in output_shape[1:-1].split(","))
    is_downsample = (output_HW[0] < input_HW[0],)
    if "linear" in s:
        mode = "linear"
    elif "nearest-exact" in s:
        mode = "nearest-exact"
    else:
        # assert "nearest" in s
        mode = "nearest"
    mode = (mode,)
    return is_downsample + input_HW + output_HW + num_threads + input_C + mode

for i, l in enumerate(sorted(out, key=key)):
    if i % 8 == 0:
        print()
    # if i % 10 == 0 and i % 40 != 0:
    #     print()
    # if i % 40 == 0:
    #     print("-" * 100)
    print(l)

```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90771
Approved by: https://github.com/peterbell10, https://github.com/ngimel
Commit 544c04f2df (parent 782e4f5c02), authored by Nicolas Hug on 2023-02-10 01:43:49 +00:00 and committed by PyTorch MergeBot.
5 changed files with 1327 additions and 141 deletions.

NOTICE (38 lines changed)

@@ -416,3 +416,41 @@ derivation and reference the following license:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=======================================================================
PILLOW-SIMD Software License
=======================================================================
Code derived from implementations in PILLOW-SIMD should mention its derivation
and reference the following license:
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2022 by Alex Clark and contributors
Like PIL, Pillow is licensed under the open source HPND License:
By obtaining, using, and/or copying this software and/or its associated
documentation, you agree that you have read, understood, and will comply
with the following terms and conditions:
Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appears in all copies, and that
both that copyright notice and this permission notice appear in supporting
documentation, and that the name of Secret Labs AB or the author not be
used in advertising or publicity pertaining to distribution of the software
without specific, written prior permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.


@@ -8,6 +8,7 @@
#include <ATen/native/UpSample.h>
#include <ATen/native/cpu/utils.h>
#include <c10/util/irange.h>
#include <ATen/native/cpu/UpSampleKernelAVXAntialias.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -22,12 +23,53 @@ namespace {
using scale_t = std::vector<c10::optional<double>>;
// TODO: this file could benefit from a global renaming of its functions /
// classes and terms, as well as from adding more comments. In particular:
// - It's not obvious that despite their names (and the file name), all these
// kernels don't just do upsampling: they do general interpolation, i.e. they
// also all support downscaling.
// - the term "horizontal" or "within dims" or "contiguous dim" refers to the
// last dimension.
// It's not specific to 2D images and applies to 3D (and 1D??) inputs as well.
// Similarly "vertical" or "across dims" refers to all dims that aren't the
// last one. In other kernels these are also referred to as "zero-stride" and
// "non-zero-stride" - we should unify all this.
// - the terms "zero-stride" and "non-zero strides" refer to the weights and
// indices, not to the contiguity of input or output
// - It's not always clear which kernel is vectorized and which one isn't.
// - The functions like _use_vectorized_kernel_cond() should be renamed and
// their description updated, because they're not the only "fork" in the
// code-path where a choice is made between a vectorized kernel vs a
// non-vectorized one. See e.g. upsample_bilinear2d_kernel_impl() where we
// already make a similar check, before the one in
// _use_vectorized_kernel_cond().
// - It's not always clear which code is part of a "separable interpolation"
// code-path.
// - Some names need to be more specific. For example
// "cpu_upsample_generic_aa()" looks like a super generic name, but the function
// is instead fairly specific - we need to make that clearer.
// - Some functions have a "aa" suffix but it doesn't mean that they only
// support antialias. Some of them also support antialias=False now.
// - Various comments are outdated. Case in point: the one just below about the
// `Interpolate` struct being used for cpu_upsample_linear:
// cpu_upsample_linear doesn't exist anymore, and these structs are used for
// various modes, *not* just linear.
// - It'd be useful to document how interpolation works in general, and in particular state explicitly:
// - that the weights and indices across a given dimension are the same for
// all pixels (hence the benefit of pre-computing them)
// - that it can be "separated", i.e. we can do the horizontal pass and the
// vertical pass independently (and that some kernels are written this way,
// while some aren't.)
// - we can probably remove the template over index_t, because it's always
// hard-coded as int64_t
// Helper structs and methods for cpu_upsample_linear
//
// Interpolation methods that used below are separable, and as such we can compute the interpolation
// independently per dimension in a recursive way. Please, refer to #10482 for more context.
//
// Linear Interpolation structure to compute output value in n-dimensional case.
// Interpolation structure to compute output value in n-dimensional case.
// - recursively compute interpolated output for each dimension
// - we rely a lot on compiler's code optimization such that implemented operations
// can be automatically factorized and vectorized using SSE and AVX2
@@ -255,48 +297,129 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) {
}
}
template <typename scalar_t, typename index_t>
static inline void basic_loop_aa_single_dim_zero_strides(
template <typename scalar_t>
static inline void basic_loop_aa_vertical(
char** data,
const int64_t* strides,
int64_t n) {
int64_t n,
unsigned int weights_precision) {
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const index_t ids_stride = *(index_t*)&data[2 + 2][0];
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
for (const auto i : c10::irange(n)) {
*(scalar_t*)&dst[i * strides[0]] =
interpolate_aa_single_dim_zero_strides<scalar_t, index_t>(
interpolate_aa_single_dim_zero_strides<scalar_t, int64_t>(
src + i * strides[1], &data[2], ids_stride);
}
}
template <typename scalar_t, typename index_t>
static inline void basic_loop_aa_single_dim_nonzero_strides(
template <>
inline void basic_loop_aa_vertical<uint8_t>(
char** data,
const int64_t* strides,
int64_t n) {
int64_t n,
unsigned int weights_precision) {
// See Note [ Weights computation for uint8_t and multiplication trick ]
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
const int64_t ids_size = *(int64_t*)&data[2 + 1][0];
const int64_t ids_min = *(int64_t*)&data[2 + 0][0];
int64_t i = 0;
for (; i<n; i++) {
char* src_min = src + i * strides[1] + ids_min;
uint8_t t = *(uint8_t*)&src_min[0];
int64_t wts_idx = *(int64_t*)&data[2 + 4][0];
int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
int16_t wts = wts_ptr[0];
// Intermediate computations are using integer type
int output = 1 << (weights_precision - 1); // accounts for the +0.5 part
output += t * wts;
for (const auto j : c10::irange(1, ids_size)) {
wts = wts_ptr[j];
t = *(uint8_t*)&src_min[j * ids_stride];
output += t * wts;
}
*(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
}
}
template <typename scalar_t>
static inline void basic_loop_aa_horizontal(
char** data,
const int64_t* strides,
int64_t n,
unsigned int weights_precision) {
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const index_t ids_stride = *(index_t*)&data[2 + 2][0];
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
if (strides[1] == 0) {
for (const auto i : c10::irange(n)) {
*(scalar_t*)&dst[i * strides[0]] =
interpolate_aa_single_dim<scalar_t, index_t>(
interpolate_aa_single_dim<scalar_t, int64_t>(
src, &data[2], &strides[2], i, ids_stride);
}
} else {
for (const auto i : c10::irange(n)) {
*(scalar_t*)&dst[i * strides[0]] =
interpolate_aa_single_dim<scalar_t, index_t>(
interpolate_aa_single_dim<scalar_t, int64_t>(
src + i * strides[1], &data[2], &strides[2], i, ids_stride);
}
}
}
template <>
inline void basic_loop_aa_horizontal<uint8_t>(
char** data,
const int64_t* strides,
int64_t n,
unsigned int weights_precision) {
// See Note [ Weights computation for uint8_t and multiplication trick ]
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
int64_t i = 0;
// Here we are implementing data interpolation within the same line (vs between the lines)
// output[x, y] = input[xmin[x], y] * W[x] + input[xmin[x] + 1, y] * W[x + 1] + ... + input[xmin[x] + xsize, y] * W[x + xsize]
for (; i<n; i++) {
int64_t ids_min = *(int64_t*)&data[2 + 0][i * strides[2 + 0]];
int64_t ids_size = *(int64_t*)&data[2 + 1][i * strides[2 + 1]];
char* src_min = src + i * strides[1] + ids_min;
uint8_t t = *(uint8_t*)&src_min[0];
int64_t wts_idx = *(int64_t*)&data[2 + 4][i * strides[2 + 4]];
int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
int16_t wts = wts_ptr[0];
// Intermediate computations are using integer type
int output = 1 << (weights_precision - 1); // accounts for the +0.5 part
output += t * wts;
for (const auto j : c10::irange(1, ids_size)) {
wts = wts_ptr[j];
t = *(uint8_t*)&src_min[j * ids_stride];
output += t * wts;
}
*(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
}
}
// Generic upsampling computation method using TensorIterator for Nd case.
// Supports: nearest, linear, cubic modes with interp_size template argument: 1, 2, 4
//
@@ -621,21 +744,23 @@ struct HelperInterpBase {
template <typename scalar_t, typename aa_filter_fn_t>
static inline void _compute_weights_aa(
const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support,
scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn,
int64_t& xmin, int64_t& xsize
scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn,
int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta
) {
scalar_t center = scale * (i + 0.5);
// align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false
// is 0.0 otherwise
scalar_t center = scale * (i + 0.5 - align_corners_delta);
scalar_t total_w = 0.0;
scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0;
xmin = std::max(
static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0));
xsize = std::min(static_cast<int64_t>(center + support + 0.5), input_size) -
xmin;
static_cast<int64_t>(center - support + 0.5 + align_corners_delta), static_cast<int64_t>(0));
xsize = std::min(
static_cast<int64_t>(center + support + 0.5 + align_corners_delta), input_size) - xmin;
int64_t j = 0;
for (; j < xsize; j++) {
scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale);
scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale);
wt_ptr[j] = w;
total_w += w;
}
@@ -644,23 +769,39 @@ struct HelperInterpBase {
wt_ptr[j] /= total_w;
}
}
for (; j < interp_size; j++) {
for (; j < max_interp_size; j++) {
wt_ptr[j] = static_cast<scalar_t>(0.0);
}
}
template <typename scalar_t, typename aa_filter_fn_t>
static inline std::vector<Tensor> _compute_indices_weights_aa(
// Note [ Support for antialias=False as a subcase of antialias=True ]
// This function was originally written with the hard assumption that
// antialias=True (hence the aa in the name). It was later extended to support
// antialias=False. The only difference between aa and no-aa is in how the
// weights and indices are computed (and their number). In aa their number is
// variable but with no-aa, they're fixed to interp_size. The same "filters"
// can be used otherwise. HOWEVER, support for antialias=False here may not be
// optimally optimized: the code assumes an arbitrary number of weights and
// indices, but this can be optimized further when aa=False since we know
// their actual dimensions.
template <typename scalar_t, typename aa_filter_fn_t, int weight_index_stride=sizeof(scalar_t)>
static inline std::tuple<std::vector<Tensor>, int> _compute_indices_weights_aa(
int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
int64_t reshape_dim, scalar_t scale,
int interp_size, aa_filter_fn_t aa_filter_fn
int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta
) {
std::vector<Tensor> output;
scalar_t support =
(scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
interp_size = (int)ceilf(support) * 2 + 1;
scalar_t support;
int max_interp_size;
if (antialias) {
support = (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
max_interp_size = (int) std::ceil(support) * 2 + 1;
} else {
support = interp_size * 0.5;
max_interp_size = interp_size;
}
auto new_shape = std::vector<int64_t>(ndims, 1);
new_shape[reshape_dim] = output_size;
@@ -675,7 +816,7 @@ struct HelperInterpBase {
{
// Weights
new_shape[reshape_dim] = output_size * interp_size;
new_shape[reshape_dim] = output_size * max_interp_size;
auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType<scalar_t>()));
auto strides = wts.strides().vec();
strides[reshape_dim] = 0;
@@ -701,20 +842,130 @@ struct HelperInterpBase {
input_size,
scale,
support,
wt_ptr + i * interp_size,
interp_size,
wt_ptr + i * max_interp_size,
max_interp_size,
aa_filter_fn,
xmin,
xmax);
xmax,
antialias,
align_corners_delta);
idx_ptr_xmin[i] = xmin * stride;
idx_ptr_size[i] = xmax;
idx_ptr_stride[i] = stride;
wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t);
wt_idx_ptr[i] = i * max_interp_size * weight_index_stride;
}
return output;
return {output, max_interp_size};
}
/*
NOTE [ Weights computation for uint8_t and multiplication trick ]
When the input/output dtype is uint8_t, we still compute the interpolation
weights as double, but then convert them to int16 via some conversion logic
detailed below. This allows us to compute all interpolation operation (sum of
multiplications) as ints instead of floats. The result is converted back into
uint8 in basic_loop_aa_horizontal<uint8_t> (and vertical)
In essence the idea is to avoid a multiplication between a float (the
weight) and an int (the pixel value) and instead run a multiplication between
2 ints:
```py
COEF_PREC = 16
def mul(a:float, b:int) -> Tuple[float, int]:
# return a * b, round(a * b)
actual = a * b
assert a > 0 # I'm lazy
int_a = floor(0.5 + a * (1 << COEF_PREC))
with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC
return actual, with_trick # round(actual) == with_trick!!
```
Here's how it works:
N == COEFF_PREC
1 << N == 2**N
floor(0.5 + x) == round(x)
So the operation is something like
int_a = round(a * 2**N) -- let's just say it's `a * 2**N` for simplicity
res = ((int_a * b) + (1 << (N - 1))) >> N
= ((a * 2**N * b + 2**(N - 1)) / 2**N
= a * b + 0.5
= round(a * b)
= what we wanted
*/
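
The trick in the note above can be checked with a small standalone Python sketch (not part of the diff; `COEF_PREC = 16` and the helper mirror the note's pseudocode, and exact .5 ties aside, the fixed-point result matches `round(a * b)`):

```py
import math

COEF_PREC = 16  # example precision, as in the note's pseudocode

def mul_with_trick(a: float, b: int):
    # Float multiplication we want to emulate.
    actual = a * b
    # Convert the weight once to fixed point: round(a * 2**COEF_PREC).
    int_a = math.floor(0.5 + a * (1 << COEF_PREC))
    # Integer-only multiply, add 2**(COEF_PREC - 1) for the "+0.5", shift back.
    with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC
    return actual, with_trick

for a, b in [(0.3, 200), (0.7, 17), (0.125, 255)]:
    actual, approx = mul_with_trick(a, b)
    assert approx == round(actual)
    print(f"{a} * {b} = {actual} ~ {approx}")
```

In the kernel itself the precision is not fixed at 16: `_compute_indices_int16_weights_aa` below picks the largest precision such that the rescaled weights still fit in int16.
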
template <typename aa_filter_fn_t>
static inline std::tuple<std::vector<Tensor>, int, unsigned int> _compute_indices_int16_weights_aa(
int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale,
int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false
) {
double scale = area_pixel_compute_scale<double>(
input_size, output_size, align_corners, opt_scale);
std::vector<Tensor> indices_weights;
auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0;
std::tie(indices_weights, interp_size) = HelperInterpBase::_compute_indices_weights_aa<double, aa_filter_fn_t, sizeof(int16_t)>(
input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta);
// Rescale float weights to int16 and compute weights precision
auto weights_f64 = indices_weights[3];
double * data_f64 = weights_f64.data_ptr<double>();
int64_t weights_f64_size = output_size * interp_size;
// can't use weights_f64.max() here as tensor is restrided
double w_max = data_f64[0];
for (const auto i : c10::irange(weights_f64_size)) {
double v = data_f64[i];
if (w_max < v) {
w_max = v;
}
}
unsigned int weights_precision = 0;
for (weights_precision = 0; weights_precision < 22; weights_precision += 1) {
int next_value = (int) (0.5 + w_max * (1 << (weights_precision + 1)));
if (next_value >= (1 << 15))
break;
}
// Rescale float values to int16
int16_t * data_i16 = (int16_t *) data_f64;
auto aligned_interp_size = interp_size;
if (align_i32) {
// We should respect int32 alignment as
// we will load data as int32 with AVX2
// See ImagingResampleHorizontalConvolution8u4x, mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
// compute aligned_interp_size = nearest pair value to interp_size
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}
for (const auto j : c10::irange(output_size)) {
for (const auto k : c10::irange(interp_size)) {
double v = data_f64[j * interp_size + k];
if (v < 0) {
data_i16[j * aligned_interp_size + k] = (int) (-0.5 + v * (1 << weights_precision));
} else {
data_i16[j * aligned_interp_size + k] = (int) (0.5 + v * (1 << weights_precision));
}
}
}
return {indices_weights, aligned_interp_size, weights_precision};
}
};
struct HelperInterpNearest : public HelperInterpBase {
@@ -923,8 +1174,9 @@ struct HelperInterpLinear : public HelperInterpBase {
input_size, output_size, align_corners, opt_scale);
auto interp_size = HelperInterpLinear::interp_size;
int unused;
indices_weights = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
std::tie(indices_weights, unused) = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
input_size,
output_size,
stride,
@@ -932,11 +1184,32 @@ struct HelperInterpLinear : public HelperInterpBase {
reshape_dim,
scale,
interp_size,
&HelperInterpLinear::aa_filter<scalar_t>);
&HelperInterpLinear::aa_filter<scalar_t>,
/*antialias=*/true,
/*align_corners_delta=*/0.0);
}
);
return indices_weights;
}
static inline std::tuple<std::vector<Tensor>, int, unsigned int> compute_indices_int16_weights_aa(
int64_t input_size,
int64_t output_size,
int64_t stride,
int64_t ndims,
int64_t reshape_dim,
bool align_corners,
const c10::optional<double> opt_scale,
bool antialias,
bool align_i32=false
) {
auto interp_size = HelperInterpLinear::interp_size;
auto fn = HelperInterpLinear::aa_filter<double>;
return HelperInterpLinear::_compute_indices_int16_weights_aa(
input_size, output_size, stride, ndims, reshape_dim,
align_corners, opt_scale, interp_size, fn, antialias, align_i32);
}
};
struct HelperInterpCubic : public HelperInterpBase {
@@ -1033,8 +1306,9 @@ struct HelperInterpCubic : public HelperInterpBase {
input_size, output_size, align_corners, opt_scale);
auto interp_size = HelperInterpCubic::interp_size;
int unused;
indices_weights = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
std::tie(indices_weights, unused) = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
input_size,
output_size,
stride,
@@ -1042,11 +1316,14 @@ struct HelperInterpCubic : public HelperInterpBase {
reshape_dim,
scale,
interp_size,
&HelperInterpCubic::aa_filter<scalar_t>);
&HelperInterpCubic::aa_filter<scalar_t>,
/*antialias=*/true,
/*align_corners_delta*/0.0);
}
);
return indices_weights;
}
};
// Generic upsampling interpolation kernel for N-d case.
@@ -1133,31 +1410,50 @@ void upsample_generic_Nd_kernel_impl(
}
}
template <typename scalar_t>
void cpu_upsample_generic_aa(at::TensorIterator& iter) {
template <typename scalar_t, bool is_horizontal>
void cpu_upsample_generic_aa(at::TensorIterator& iter, unsigned int weights_precision) {
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
is_zero_stride<3 + 2>(&strides[2])) {
basic_loop_aa_single_dim_zero_strides<scalar_t, int64_t>(
data, strides, n);
if (is_horizontal) {
// Strides are : X 0 | 8 8 8 0 8 (Channels first)
// Strides are : X X | 0 0 0 0 0 (Channels last)
// upsampling data within a contiguous dimension (aka horizontal resampling)
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
is_zero_stride<3 + 2>(&strides[2])) {
// channels last case
basic_loop_aa_horizontal<scalar_t>(
data, strides, n, weights_precision);
} else {
basic_loop_aa_horizontal<scalar_t>(
data, strides, n, weights_precision);
}
} else {
basic_loop_aa_single_dim_nonzero_strides<scalar_t, int64_t>(
data, strides, n);
// Strides are : X Y | 0 0 0 0 0 (Channels first)
// Strides are : X X | 0 0 0 0 0 (Channels last)
// upsampling data between contiguous dimensions (aka vertical resampling)
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
is_zero_stride<3 + 2>(&strides[2])) {
basic_loop_aa_vertical<scalar_t>(
data, strides, n, weights_precision);
} else {
basic_loop_aa_vertical<scalar_t>(
data, strides, n, weights_precision);
}
}
};
iter.for_each(loop);
}
// Generic separable upsampling interpolation kernels for N-d case with anti-aliasing
template <int out_ndims, typename scale_type, class F>
template <int out_ndims, typename scale_type, class F, bool is_horizontal>
void _separable_upsample_generic_Nd_kernel_impl_single_dim(
const Tensor& output,
const Tensor& input,
int interp_dim,
bool align_corners,
const scale_type& scales) {
const scale_type& scales,
bool antialias) {
// input can be NCHW, NCL or NCKHW
auto shape = input.sizes().vec();
@@ -1174,21 +1470,29 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
strides[interp_dim] = 0;
auto restrided_input = input.as_strided(shape, strides);
std::vector<std::vector<Tensor>> indices_weights;
int interp_size = F::interp_size;
auto input_scalar_type = input.scalar_type();
if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) {
// nearest also supports uint8 tensor, but we have to use float
// with compute_indices_weights
input_scalar_type = at::ScalarType::Float;
}
indices_weights.emplace_back(
std::vector<Tensor> indices_weights;
unsigned int weights_precision = 0;
int unused;
if (input_scalar_type == at::kByte) {
std::tie(indices_weights, unused, weights_precision) =
// TODO: change that to F:: once / if bicubic mode supports uint8 after all
HelperInterpLinear::compute_indices_int16_weights_aa(
input.size(interp_dim), oshape[interp_dim],
input.stride(interp_dim) * input.element_size(),
input.dim(), interp_dim, align_corners, scales[interp_dim - 2],
antialias);
TORCH_INTERNAL_ASSERT(weights_precision > 0);
} else {
TORCH_INTERNAL_ASSERT(antialias);
indices_weights =
F::compute_indices_weights_aa(
input_scalar_type, input.size(interp_dim), oshape[interp_dim],
input.stride(interp_dim) * input.element_size(),
input.dim(), interp_dim, align_corners, scales[interp_dim - 2]));
input.dim(), interp_dim, align_corners, scales[interp_dim - 2]);
}
TensorIteratorConfig config;
config.check_all_same_dtype(false)
@@ -1196,51 +1500,95 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
.add_output(output)
.add_input(restrided_input);
for (auto& idx_weight : indices_weights) {
for (auto& tensor : idx_weight) {
config.add_input(tensor);
}
for (auto& tensor : indices_weights) {
config.add_input(tensor);
}
auto iter = config.build();
if (interp_size > 1) {
// Nearest also supports uint8 tensor, so need to handle it separately
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] {
cpu_upsample_generic_aa<scalar_t>(iter);
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND(
at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
cpu_upsample_generic_aa<scalar_t>(iter);
});
}
AT_DISPATCH_FLOATING_TYPES_AND(
at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
cpu_upsample_generic_aa<scalar_t, is_horizontal>(iter, weights_precision);
});
}
// Generic separable upsampling interpolation kernel for N-d case with anti-aliasing.
// It also supports antialias=False iff
// (dtype == uint8 and mode in ("bilinear", "bicubic")): this is used as
// fallback in these settings when AVX isn't supported.
template <int out_ndims, typename scale_type, class F>
void separable_upsample_generic_Nd_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
const scale_type& scales) {
const scale_type& scales,
bool antialias) {
auto output_shape = output.sizes();
auto input_shape = input.sizes();
auto temp_oshape = input_shape.vec();
if (output_shape == input_shape) {
output.copy_(input);
return;
}
auto temp_oshape = input.sizes().vec();
at::Tensor temp_output, temp_input = input;
for (const auto i : c10::irange(out_ndims - 1)) {
int interp_dim = 2 + out_ndims - 1 - i;
temp_oshape[interp_dim] = output.sizes()[interp_dim];
temp_output = at::empty(temp_oshape, input.options().memory_format(input.suggest_memory_format()));
int interp_dim = 0;
// Precompute the number of single dim resize method invocations
// to avoid copying temporary buffer to output
int num_single_dim_ops = 0;
for (const auto i : c10::irange(out_ndims)) {
interp_dim = 2 + out_ndims - 1 - i;
if (output_shape[interp_dim] != input_shape[interp_dim]) {
num_single_dim_ops += 1;
}
}
// upsampling data within the contiguous dimension (aka horizontal resampling)
interp_dim = 2 + out_ndims - 1;
if (output_shape[interp_dim] != input_shape[interp_dim]) {
num_single_dim_ops -= 1;
if (num_single_dim_ops > 0) {
temp_oshape[interp_dim] = output_shape[interp_dim];
temp_output = at::empty(temp_oshape, input.options());
} else {
temp_output = output;
}
_separable_upsample_generic_Nd_kernel_impl_single_dim<
out_ndims,
scale_t,
F>(
temp_output, temp_input, interp_dim, align_corners, scales);
F,
true>(
temp_output, temp_input, interp_dim, align_corners, scales, antialias);
temp_input = temp_output;
}
_separable_upsample_generic_Nd_kernel_impl_single_dim<
out_ndims,
scale_t,
F>(output, temp_input, 2, align_corners, scales);
// upsampling data between contiguous dimensions (aka vertical resampling)
for (const auto i : c10::irange(1, out_ndims)) {
interp_dim = 2 + out_ndims - 1 - i;
if (output_shape[interp_dim] != input_shape[interp_dim]) {
num_single_dim_ops -= 1;
if (num_single_dim_ops > 0) {
temp_oshape[interp_dim] = output_shape[interp_dim];
temp_output = at::empty(temp_oshape, input.options());
} else {
temp_output = output;
}
_separable_upsample_generic_Nd_kernel_impl_single_dim<
out_ndims,
scale_t,
F,
false>(
temp_output, temp_input, interp_dim, align_corners, scales, antialias);
temp_input = temp_output;
}
}
}
void upsample_nearest1d_kernel_impl(
@@ -1356,7 +1704,8 @@ void upsample_linear1d_kernel_impl(
output, input, align_corners, {scales_w});
}
void upsample_bilinear2d_kernel_impl(
void upsample_bilinear2d_kernel_impl_float(
const Tensor& output,
const Tensor& input,
bool align_corners,
@@ -1378,15 +1727,56 @@ void upsample_bilinear2d_kernel_impl(
}
}
void upsample_bilinear2d_aa_kernel_impl(
void upsample_bilinear2d_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {
if (input.dtype() == at::kByte){
#ifdef CPU_CAPABILITY_AVX2
if (input.size(1) <= 4) {
upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(input,
output, align_corners, {scales_h, scales_w},
/*antialias=*/false);
} else {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/false);
}
#else // CPU_CAPABILITY_AVX2
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/false);
#endif // CPU_CAPABILITY_AVX2
} else {
upsample_bilinear2d_kernel_impl_float(output, input, align_corners, scales_h, scales_w);
}
}
void upsample_bilinear2d_aa_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {
#ifdef CPU_CAPABILITY_AVX2
if (input.dtype() == at::kByte && input.size(1) <= 4) {
upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(
input, output, align_corners, {scales_h, scales_w},
/*antialias=*/true);
} else {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
}
#else // CPU_CAPABILITY_AVX2
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w});
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
#endif // CPU_CAPABILITY_AVX2
}
void upsample_trilinear3d_kernel_impl(
@@ -1424,7 +1814,8 @@ void upsample_bicubic2d_aa_kernel_impl(
c10::optional<double> scales_w) {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>(
output, input, align_corners, {scales_h, scales_w});
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
}
template <
@@ -1500,7 +1891,9 @@ void cpu_upsample_genNd_backward_aa(
interp_height,
filter_fn,
ymin,
ysize);
ysize,
/*antialias=*/true,
/*align_corners_delta=*/0.0);
for (const auto ow : c10::irange(output_width)) {
F::_compute_weights_aa(
@@ -1512,7 +1905,9 @@ void cpu_upsample_genNd_backward_aa(
interp_width,
filter_fn,
xmin,
xsize);
xsize,
/*antialias=*/true,
/*align_corners_delta=*/0.0);
for (const auto c : c10::irange(begin, end)) {
scalar_t grad_output_value =


@@ -0,0 +1,719 @@
/*
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2022 by Alex Clark and contributors
Like PIL, Pillow is licensed under the open source HPND License
*/
// This code is heavily inspired from PILLOW-SIMD's implementation:
// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c
#pragma once
#ifdef CPU_CAPABILITY_AVX2
// TODO: This file only supports AVX2. We could split the AVX kernels into
// smaller logical blocks in order to port them into the Vec.h logic. This would
// allow us to support other vectorization architectures and perhaps also support
// the non-vectorized fallback (we'd need to make sure it's not slower than the
// current fallback).
#include <ATen/core/Tensor.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/irange.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif
namespace {
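// Load one 32-bit (RGBA) pixel and zero-extend its 4 uint8 values into the 4
// int32 lanes of an __m128i.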
static __m128i inline mm_cvtepu8_epi32(const uint32_t* C10_RESTRICT ptr) {
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
}
// TODO: We may want to hard-code an unrolled version for the case where
// num_channels=3 to hint the compiler to vectorize this (look at the original
// PIL-SIMD code).
at::Tensor unpack_rgb(const at::Tensor& packed_tensor) {
// Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into
// RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded
// as 32 bits. This generalizes to num_channels <= 4 and also works for
// non-channels_last tensors.
const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr<uint8_t>();
auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
auto num_channels = packed_tensor.size(0);
constexpr int rgba_size = 4;
auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte));
uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr<uint8_t>();
auto stride_i = packed_tensor.stride(2);
auto stride_j = packed_tensor.stride(0);
for (const auto i : c10::irange(num_pixels)) {
for (const auto j : c10::irange(rgba_size)) {
unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0;
}
}
return unpacked_tensor;
}
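For reference, a rough Python equivalent of this unpacking step (illustrative only; `unpack_rgb_ref` is a made-up name, and the real kernel writes directly into the raw byte buffer):

```python
import torch

def unpack_rgb_ref(packed: torch.Tensor) -> torch.Tensor:
    # packed: (C, H, W) uint8 with C <= 4, any strides.
    C, H, W = packed.shape
    unpacked = torch.zeros(H * W, 4, dtype=torch.uint8)  # one 32-bit "pixel" per location
    unpacked[:, :C] = packed.reshape(C, -1).t()           # missing channels stay 0
    return unpacked  # the AVX kernels then treat this buffer as H*W uint32 values
```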
void pack_rgb(
const at::Tensor& unpacked_tensor, // IN
const at::Tensor& packed_tensor // OUT
) {
constexpr int rgba_size = 4;
uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr<uint8_t>();
uint8_t* packed = (uint8_t*)packed_tensor.data_ptr<uint8_t>();
auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
auto num_channels = packed_tensor.size(0);
auto packed_increment = packed_tensor.stride(2);
auto packed_stride = packed_tensor.stride(0);
for (const auto i C10_UNUSED : c10::irange(num_pixels)) {
for (const auto j : c10::irange(num_channels)) {
packed[j * packed_stride] = unpacked[j];
}
unpacked += rgba_size;
packed += packed_increment;
}
}
void ImagingResampleHorizontalConvolution8u4x(
uint32_t* C10_RESTRICT lineOut0,
uint32_t* C10_RESTRICT lineOut1,
uint32_t* C10_RESTRICT lineOut2,
uint32_t* C10_RESTRICT lineOut3,
const uint32_t* C10_RESTRICT lineIn0,
const uint32_t* C10_RESTRICT lineIn1,
const uint32_t* C10_RESTRICT lineIn2,
const uint32_t* C10_RESTRICT lineIn3,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision);
void ImagingResampleHorizontalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT lineIn,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision);
void ImagingResampleVerticalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT imIn,
int xmin,
int xmax,
int16_t* k,
int coefs_precision,
int xin);
void ImagingResampleHorizontal(
const at::Tensor & unpacked_output,
const at::Tensor & unpacked_input,
int ksize,
const std::vector<at::Tensor>& horiz_indices_weights,
unsigned int horiz_weights_precision) {
// TODO: we may want to merge this into the fallback code (currently called
// basic_loop_aa_horizontal<uint8_t>). Although this may not be needed if/when
// we port all of this code to use Vec.h, since that would potentially give us
// another fallback implementation.
int yy;
int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr<double>());
auto xout = unpacked_output.size(2);
auto yout = unpacked_output.size(1);
auto xin = unpacked_input.size(2);
std::vector<int> bounds_vec(2 * xout, 0);
int* bounds = bounds_vec.data();
int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr<int64_t>();
int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr<int64_t>();
for (int i = 0; i < xout; i++) {
bounds[2 * i + 0] = idx_ptr_xmin[i];
bounds[2 * i + 1] = idx_ptr_size[i];
}
uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
yy = 0;
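// Process output rows 4 at a time in the main loop; the tail loop below
// handles the remaining rows one by one.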
for (; yy < yout - 3; yy += 4) {
ImagingResampleHorizontalConvolution8u4x(
unpacked_output_p + yy * xout,
unpacked_output_p + (yy + 1) * xout,
unpacked_output_p + (yy + 2) * xout,
unpacked_output_p + (yy + 3) * xout,
unpacked_input_p + yy * xin,
unpacked_input_p + (yy + 1) * xin,
unpacked_input_p + (yy + 2) * xin,
unpacked_input_p + (yy + 3) * xin,
xout,
bounds,
kk,
ksize,
(int)horiz_weights_precision);
}
for (; yy < yout; yy++) {
ImagingResampleHorizontalConvolution8u(
unpacked_output_p + yy * xout,
unpacked_input_p + yy * xin,
xout,
bounds,
kk,
ksize,
(int)horiz_weights_precision);
}
}
void ImagingResampleVertical(
const at::Tensor & unpacked_output,
const at::Tensor & unpacked_input,
int ksize,
const std::vector<at::Tensor>& vert_indices_weights,
unsigned int vert_weights_precision) {
// TODO: we may want to merge this into the fallback code (currently called
// basic_loop_aa_vertical<uint8_t>). Although this may not be needed if/when
// we port all of this code to use Vec.h, since that would potentially give us
// another fallback implementation.
int ymin, ymax;
int16_t* k = nullptr;
int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr<double>());
int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr<int64_t>();
int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr<int64_t>();
uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
auto xout = unpacked_output.size(2);
auto yout = unpacked_output.size(1);
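// Each output row is a weighted combination of `ymax` input rows starting at
// `ymin`, computed column-wise by the vertical convolution kernel.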
for (const auto yy : c10::irange(yout)) {
k = &kk[yy * ksize];
ymin = idx_ptr_xmin[yy];
ymax = idx_ptr_size[yy];
ImagingResampleVerticalConvolution8u(
unpacked_output_p + yy * xout,
unpacked_input_p,
ymin,
ymax,
k,
(int)vert_weights_precision,
xout);
}
}
// This is the only public entry point in this file. It supports bilinear
// mode for uint8 dtype when C <= 4, with or without antialias. The
// implementation is based on PIL-SIMD.
// Its equivalent (fallback) implementation for when AVX isn't supported or
// when C > 4 is separable_upsample_generic_Nd_kernel_impl(). There are a bunch
// of future improvements that can be done: look for the TODOs in this file.
// For details on how the weights are computed and how the multiplications are
// run on int (instead of float) weights, see
// [ Weights computation for uint8_t and multiplication trick ]
// For details on how the AVX kernels are implemented, see
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
// See also [ Support for antialias=False as a subcase of antialias=True ] to
// learn more about how the antialias=False case is computed. The same holds
// here: all these kernels are general enough to handle an arbitrary number of
// weights, but when aa=False they could be optimized further.
template <typename scale_type, class F>
void upsample_avx_bilinear_uint8(
const at::Tensor& input,
const at::Tensor& output,
bool align_corners,
const scale_type& scales,
bool antialias) {
auto batch_size = input.size(0);
auto num_channels = input.size(1);
auto xin = input.size(3);
auto yin = input.size(2);
auto xout = output.size(3);
auto yout = output.size(2);
if (xin == xout && yin == yout) {
output.copy_(input);
return;
}
auto need_horizontal = xout != xin;
auto need_vertical = yout != yin;
int ksize_horiz, ksize_vert;
std::vector<at::Tensor> horiz_indices_weights, vert_indices_weights;
unsigned int horiz_weights_precision, vert_weights_precision;
if (need_horizontal) {
int interp_dim = 3;
std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
F::compute_indices_int16_weights_aa(
/*input_size=*/xin,
/*output_size=*/xout,
/*stride=*/1,
/*ndims=*/4,
/*reshape_dim=*/interp_dim,
/*align_corners=*/align_corners,
/*opt_scale=*/scales[interp_dim - 2],
/*antialias=*/antialias,
/*align_i32=*/true);
}
if (need_vertical) {
int interp_dim = 2;
std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
F::compute_indices_int16_weights_aa(
/*input_size=*/yin,
/*output_size=*/yout,
/*stride=*/1,
/*ndims=*/4,
/*reshape_dim=*/interp_dim,
/*align_corners=*/align_corners,
/*opt_scale=*/scales[interp_dim - 2],
/*antialias=*/antialias,
/*align_i32=*/true);
}
bool is_rgba = num_channels == 4 && input.is_contiguous(at::MemoryFormat::ChannelsLast);
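// When the input is already a 4-channel channels_last tensor, its memory
// layout already matches the unpacked RGBA format, so the unpack/pack copies
// below can be skipped and the kernels can read/write the tensors directly.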
at::Tensor buffer_horiz, buffer_vert;
if (need_horizontal && !(is_rgba && !need_vertical)) {
buffer_horiz = at::empty({4, yin, xout}, input.options());
}
if (need_vertical && !is_rgba) {
buffer_vert = at::empty({4, yout, xout}, input.options());
}
// TODO: The unpack / pack operations create a copy of the original input and
// output tensor. There should be a way to avoid these copies by instead
// modifying the low-level kernels. Or maybe at least avoid copying the entire
// tensors and just copy part of them (line by line).
for (const auto i : c10::irange(batch_size)) {
at::Tensor unpacked_input = (is_rgba) ? input[i] : unpack_rgb(input[i]);
at::Tensor unpacked_output;
if (need_horizontal) {
at::Tensor unpacked_output_temp = (is_rgba && !need_vertical) ? output[i] : buffer_horiz;
ImagingResampleHorizontal(
unpacked_output_temp,
unpacked_input,
ksize_horiz,
horiz_indices_weights,
horiz_weights_precision);
unpacked_output = unpacked_input = unpacked_output_temp;
}
if (need_vertical) {
unpacked_output = (is_rgba) ? output[i] : buffer_vert;
ImagingResampleVertical(
unpacked_output,
unpacked_input,
ksize_vert,
vert_indices_weights,
vert_weights_precision);
}
TORCH_INTERNAL_ASSERT(unpacked_output.defined());
if (!is_rgba) {
pack_rgb(unpacked_output, output[i]);
}
}
}
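To make the weights note referenced above concrete, here is a rough Python sketch of the int16 fixed-point trick (illustrative only, not the kernel code; `precision` plays the role of the weights precision returned by compute_indices_int16_weights_aa):

```python
import torch

def fixed_point_weighted_sum(pixels, weights, precision=8):
    # pixels: 1D uint8 taps; weights: 1D float weights (summing to ~1).
    int_w = torch.round(weights * (1 << precision)).to(torch.int32)  # int16-sized in the kernels
    acc = 1 << (precision - 1)  # rounding term, mirrors the `initial` vectors above
    acc += int((pixels.to(torch.int32) * int_w).sum())
    return max(0, min(255, acc >> precision))  # shift + clamp, like srai + packus

# Two bilinear taps weighted 0.25 / 0.75:
print(fixed_point_weighted_sum(torch.tensor([10, 200], dtype=torch.uint8),
                               torch.tensor([0.25, 0.75])))  # 153 (float result: 152.5)
```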
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u4x(
uint32_t* C10_RESTRICT lineOut0,
uint32_t* C10_RESTRICT lineOut1,
uint32_t* C10_RESTRICT lineOut2,
uint32_t* C10_RESTRICT lineOut3,
const uint32_t* C10_RESTRICT lineIn0,
const uint32_t* C10_RESTRICT lineIn1,
const uint32_t* C10_RESTRICT lineIn2,
const uint32_t* C10_RESTRICT lineIn3,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision) {
int xmin, xmax, x;
int16_t* k;
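// For each output pixel xx, accumulate the weighted sum of the input pixels in
// [xmin, xmin + xmax) for 4 rows at once, using 32-bit accumulators, then shift
// right by coefs_precision and saturate back down to uint8.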
for (const auto xx : c10::irange(xsize)) {
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;
__m256i sss0, sss1;
__m256i zero = _mm256_setzero_si256();
__m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
sss0 = initial;
sss1 = initial;
for (; x < xmax - 3; x += 4) {
__m256i pix, mmk0, mmk1, source;
mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
mmk1 = _mm256_set1_epi32(*(int32_t*)&k[x + 2]);
source = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn0[x + xmin])),
_mm_loadu_si128((__m128i*)&lineIn1[x + xmin]),
1);
// clang-format off
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));
source = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn2[x + xmin])),
_mm_loadu_si128((__m128i*)&lineIn3[x + xmin]),
1);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
}
for (; x < xmax - 1; x += 2) {
__m256i pix, mmk;
mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn0[x + xmin])),
_mm_loadl_epi64((__m128i*)&lineIn1[x + xmin]),
1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn2[x + xmin])),
_mm_loadl_epi64((__m128i*)&lineIn3[x + xmin]),
1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
// clang-format on
}
for (; x < xmax; x++) {
__m256i pix, mmk;
// [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
mmk = _mm256_set1_epi32(k[x]);
// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn0[x + xmin])),
mm_cvtepu8_epi32(&lineIn1[x + xmin]),
1);
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn2[x + xmin])),
mm_cvtepu8_epi32(&lineIn3[x + xmin]),
1);
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
}
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss0 = _mm256_packs_epi32(sss0, zero);
sss1 = _mm256_packs_epi32(sss1, zero);
sss0 = _mm256_packus_epi16(sss0, zero);
sss1 = _mm256_packus_epi16(sss1, zero);
lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
}
}
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT lineIn,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision) {
int xmin, xmax, x;
int16_t* k;
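// Same idea as the 4-row kernel above, but for a single row: the taps are
// consumed in blocks of 8, then 4, then 2, then 1.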
for (const auto xx : c10::irange(xsize)) {
__m128i sss;
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;
if (xmax < 8) {
sss = _mm_set1_epi32(1 << (coefs_precision - 1));
} else {
// Lower part will be added to higher, use only half of the error
__m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2));
for (; x < xmax - 7; x += 8) {
__m256i pix, mmk, source;
__m128i tmp = _mm_loadu_si128((__m128i*)&k[x]);
__m256i ksource =
_mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
// clang-format off
source = _mm256_loadu_si256((__m256i*)&lineIn[x + xmin]);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
// clang-format on
}
for (; x < xmax - 3; x += 4) {
__m256i pix, mmk, source;
__m128i tmp = _mm_loadl_epi64((__m128i*)&k[x]);
__m256i ksource =
_mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
tmp = _mm_loadu_si128((__m128i*)&lineIn[x + xmin]);
source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
// clang-format off
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
// clang-format on
}
sss = _mm_add_epi32(
_mm256_extracti128_si256(sss256, 0),
_mm256_extracti128_si256(sss256, 1));
}
for (; x < xmax - 1; x += 2) {
__m128i mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
__m128i source = _mm_loadl_epi64((__m128i*)&lineIn[x + xmin]);
__m128i pix = _mm_shuffle_epi8(
source,
_mm_set_epi8(-1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0));
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x++) {
__m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]);
__m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
sss = _mm_srai_epi32(sss, coefs_precision);
sss = _mm_packs_epi32(sss, sss);
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
}
}
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleVerticalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT imIn,
int xmin,
int xmax,
int16_t* k,
int coefs_precision,
int xin) {
int x;
int xx = 0;
int xsize = xin;
__m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1));
__m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));
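// Walk along the output row: 8 pixels (32 bytes) at a time with AVX2, then 2
// pixels, then 1 at a time. Each output pixel is the weighted sum of the same
// column across `xmax` input rows starting at row `xmin`.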
for (; xx < xsize - 7; xx += 8) {
__m256i sss0 = initial_256;
__m256i sss1 = initial_256;
__m256i sss2 = initial_256;
__m256i sss3 = initial_256;
x = 0;
for (; x < xmax - 1; x += 2) {
__m256i source, source1, source2;
__m256i pix, mmk;
// Load two coefficients at once
mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);
// Load 2 lines
// (__m256i *) &imIn->image32[x + xmin][xx]
source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
// (__m256i *) &imIn->image32[x + 1 + xmin][xx]
source2 =
_mm256_loadu_si256((__m256i*)(imIn + (x + 1 + xmin) * xin + xx));
source = _mm256_unpacklo_epi8(source1, source2);
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
source = _mm256_unpackhi_epi8(source1, source2);
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
}
for (; x < xmax; x += 1) {
__m256i source, source1, pix, mmk;
mmk = _mm256_set1_epi32(k[x]);
// (__m256i *) &imIn->image32[x + xmin][xx])
source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
}
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss2 = _mm256_srai_epi32(sss2, coefs_precision);
sss3 = _mm256_srai_epi32(sss3, coefs_precision);
sss0 = _mm256_packs_epi32(sss0, sss1);
sss2 = _mm256_packs_epi32(sss2, sss3);
sss0 = _mm256_packus_epi16(sss0, sss2);
_mm256_storeu_si256((__m256i*)&lineOut[xx], sss0);
}
for (; xx < xsize - 1; xx += 2) {
__m128i sss0 = initial; // left row
__m128i sss1 = initial; // right row
x = 0;
for (; x < xmax - 1; x += 2) {
__m128i source, source1, source2;
__m128i pix, mmk;
// Load two coefficients at once
mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
// Load 2 lines
// (__m128i *) &imIn->image32[x + xmin][xx])
source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
// (__m128i *) &imIn->image32[x + 1 + xmin][xx]
source2 = _mm_loadl_epi64((__m128i*)(imIn + (x + 1 + xmin) * xin + xx));
source = _mm_unpacklo_epi8(source1, source2);
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x += 1) {
__m128i source, source1, pix, mmk;
mmk = _mm_set1_epi32(k[x]);
// (__m128i *) &imIn->image32[x + xmin][xx]);
source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
}
sss0 = _mm_srai_epi32(sss0, coefs_precision);
sss1 = _mm_srai_epi32(sss1, coefs_precision);
sss0 = _mm_packs_epi32(sss0, sss1);
sss0 = _mm_packus_epi16(sss0, sss0);
_mm_storel_epi64((__m128i*)&lineOut[xx], sss0);
}
for (; xx < xsize; xx++) {
__m128i sss = initial;
x = 0;
for (; x < xmax - 1; x += 2) {
__m128i source, source1, source2;
__m128i pix, mmk;
// Load two coefficients at once
mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
// Load 2 lines
// *(int *) &imIn->image32[x + xmin][xx]
source1 = _mm_cvtsi32_si128(*(int*)(imIn + (x + xmin) * xin + xx));
// *(int *) &imIn->image32[x + 1 + xmin][xx]
source2 = _mm_cvtsi32_si128(*(int*)(imIn + (x + 1 + xmin) * xin + xx));
source = _mm_unpacklo_epi8(source1, source2);
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x++) {
// &imIn->image32[x + xmin][xx]
__m128i pix = mm_cvtepu8_epi32(imIn + (x + xmin) * xin + xx);
__m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
sss = _mm_srai_epi32(sss, coefs_precision);
sss = _mm_packs_epi32(sss, sss);
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
}
}
} // anonymous namespace
#endif // CPU_CAPABILITY_AVX2


@ -9367,67 +9367,67 @@ class TestNNDeviceType(NNTestCase):
@parametrize_test("antialias", [True, False])
@parametrize_test("align_corners", [True, False])
@parametrize_test("mode", ["bilinear", "bicubic"])
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@onlyNativeDeviceTypes
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode):
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format):
# Forward AD does not support XLA because XLA tensors don't have storage
check_forward_ad = torch.device(device).type != 'xla'
kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias)
for memory_format in [torch.contiguous_format, torch.channels_last]:
# test float scale factor up & downsampling
for scale_factor in [0.5, 1.5, 2]:
in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
out_size = int(math.floor(in_t.shape[-1] * scale_factor))
with warnings.catch_warnings(record=True) as w:
out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
expected_out = torch.ones(2, 3, out_size, out_size, device=device)
self.assertEqual(expected_out, out_t)
# Assert that memory format is carried through to the output
self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
out_t.backward(torch.randn_like(out_t))
self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))
if torch.device(device).type == 'cuda':
# Bilinear backward is nondeterministic because of atomicAdd usage
nondet_tol = 1e-5
else:
nondet_tol = 0.0
input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
gradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
)
gradgradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
)
# Assert that cpu and cuda give same results
if torch.device(device).type == 'cuda':
for shapes in [
(2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
]:
a_cuda = torch.randn(
*shapes, device=device
).contiguous(memory_format=memory_format).requires_grad_()
a_cpu = a_cuda.detach().cpu().requires_grad_()
with warnings.catch_warnings(record=True):
out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)
self.assertEqual(out_cpu, out_cuda.cpu())
g_cuda = torch.randn_like(out_cuda)
g_cpu = g_cuda.cpu()
out_cuda.backward(g_cuda)
out_cpu.backward(g_cpu)
self.assertEqual(a_cuda.grad, a_cpu.grad)
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
@ -9445,6 +9445,40 @@ class TestNNDeviceType(NNTestCase):
t_out = F.interpolate(t_in, size=(2, 2), mode="bilinear", align_corners=False, antialias=True)
self.assertEqual(expected_out, t_out)
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@parametrize_test("antialias", [True, False])
@parametrize_test("align_corners", [True, False])
@parametrize_test("num_channels", [3, 5])
@parametrize_test("output_size", [32, 600])
def test_upsamplingBiLinear2d_consistency(self, device, memory_format, antialias, align_corners, num_channels, output_size):
if torch.device(device).type == "cuda":
raise SkipTest("CUDA implementation is not yet supporting uint8")
mode = "bilinear"
# Check that the resized uint8 output stays close to the resized float output:
# small mean abs error, max abs error below ~1, and at most a few outlier pixels.
input_ui8 = torch.randint(0, 256, size=(1, num_channels, 400, 400), dtype=torch.uint8, device=device)
input_ui8 = input_ui8.contiguous(memory_format=memory_format)
input_f32 = input_ui8.float()
output_f32 = F.interpolate(
input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
)
output_ui8 = F.interpolate(
input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
)
mae_tol = 0.5
max_abs_err_tol = 1.0
num_wrong_pixels_tol = 5
abs_diff = torch.abs(output_f32.round() - output_ui8.float())
mae = torch.mean(abs_diff)
max_abs_err = torch.max(abs_diff)
num_wrong_pixels = (abs_diff > max_abs_err_tol).sum()
self.assertTrue(mae < mae_tol, msg=f"mae={mae}")
self.assertTrue(max_abs_err < max_abs_err_tol + 1e-5, msg=f"max ae={max_abs_err}")
self.assertTrue(num_wrong_pixels < num_wrong_pixels_tol, msg=f"num_wrong_pixels={num_wrong_pixels}")
def test_upsamplingBicubic2d_correctness(self, device):
# test output against known input: align_corners=False result must match opencv
in_t = torch.arange(8., device=device).view(1, 2, 2, 2)


@ -12118,7 +12118,7 @@ op_db: List[OpInfo] = [
supports_fwgrad_bwgrad=True,
supports_autograd=True,
supports_forward_ad=True,
dtypes=floating_types_and(torch.bfloat16),
dtypes=floating_types_and(torch.uint8, torch.bfloat16),
dtypesIfCUDA=floating_types_and(torch.half),
gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'),
@ -12184,7 +12184,7 @@ op_db: List[OpInfo] = [
supports_autograd=True,
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
dtypes=floating_types_and(torch.bfloat16),
dtypes=floating_types_and(torch.uint8, torch.bfloat16),
dtypesIfCUDA=floating_types_and(torch.half),
gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
sample_inputs_func=partial(sample_inputs_upsample, 'bilinear'),