From 544c04f2dfb0e0a1eeba128e0f012ec31aab549c Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 10 Feb 2023 01:43:49 +0000
Subject: [PATCH] Add uint8 support for interpolate for CPU images (#90771)

Joint work with @vfdev-5

This PR introduces native uint8 support for `interpolate()`, for the `bilinear` ~and `bicubic`~ modes, for CPU images (`mode=nearest[_exact]` was already supported). On a typical torchvision training job on ImageNet, the speedup is ~4X when AVX2 is supported, comparing the native uint8 path (this PR) against torchvision's current `Resize()`:

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size -> output_size           channels_last  AA     mode      num_threads    speed-up  float vs uint8 (this PR)

(1, 3, 270, 268) -> (224, 224)      True   True   bilinear  num_threads=1  4X     2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      True   False  bilinear  num_threads=1  2.1X   1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)      False  True   bilinear  num_threads=1  3X     2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bilinear  num_threads=1  4X     2.4ms vs 0.6ms

(Note: we removed bicubic support for now)
(1, 3, 270, 268) -> (224, 224)      True   True   bicubic   num_threads=1  4X     2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      True   False  bicubic   num_threads=1  5X     3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  True   bicubic   num_threads=1  3X     2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bicubic   num_threads=1  4X     2.8ms vs 0.7ms
```

There is still room for further speed-ups (see TODOs in the code).

#### More benchmark details

With AVX2 support: speedups typically range from 1.5X to 10X. A few edge cases are slower; it is worth investigating why.
```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size -> output_size           channels_last  AA     mode      num_threads    speed-up  float vs uint8 (this PR)

(1, 3, 64, 64) -> (224, 224)        True   True   bilinear  num_threads=1  5X     1.1ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        True   False  bilinear  num_threads=1  5X     1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  True   bilinear  num_threads=1  2.8X   0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  False  bilinear  num_threads=1  7X     1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        True   True   bicubic   num_threads=1  5X     1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        True   False  bicubic   num_threads=1  12X    2.9ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  True   bicubic   num_threads=1  3X     0.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  False  bicubic   num_threads=1  7X     1.8ms vs 0.2ms

(1, 3, 64, 64) -> (224, 224)        True   True   bilinear  num_threads=2  2.6X   0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        True   False  bilinear  num_threads=2  2.8X   0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  True   bilinear  num_threads=2  1.7X   0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  False  bilinear  num_threads=2  1.4X   0.3ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        True   True   bicubic   num_threads=2  2.7X   0.7ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        True   False  bicubic   num_threads=2  7X     1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  True   bicubic   num_threads=2  1.8X   0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)        False  False  bicubic   num_threads=2  4X     1.0ms vs 0.2ms

(1, 3, 224, 224) -> (270, 268)      True   True   bilinear  num_threads=1  4X     2.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      True   False  bilinear  num_threads=1  3.0X   1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  True   bilinear  num_threads=1  3X     1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  False  bilinear  num_threads=1  4X     2.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      True   True   bicubic   num_threads=1  4X     2.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      True   False  bicubic   num_threads=1  7X     4.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  True   bicubic   num_threads=1  3X     2.1ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  False  bicubic   num_threads=1  4X     2.6ms vs 0.6ms

(1, 3, 224, 224) -> (270, 268)      True   True   bilinear  num_threads=2  2.7X   1.6ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      True   False  bilinear  num_threads=2  2.6X   1.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  True   bilinear  num_threads=2  2.1X   1.2ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  False  bilinear  num_threads=2  1.6X   0.9ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      True   True   bicubic   num_threads=2  2.8X   1.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      True   False  bicubic   num_threads=2  5X     2.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  True   bicubic   num_threads=2  2.3X   1.4ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)      False  False  bicubic   num_threads=2  3X     1.9ms vs 0.6ms

(1, 3, 256, 256) -> (1024, 1024)    True   True   bilinear  num_threads=1  4X     26.6ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bilinear  num_threads=1  4X     23.9ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bilinear  num_threads=1  2.5X   16.8ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bilinear  num_threads=1  5X     33.1ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)    True   True   bicubic   num_threads=1  4X     25.9ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bicubic   num_threads=1  8X     59.6ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bicubic   num_threads=1  1.9X   14.3ms vs 7.4ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bicubic   num_threads=1  5X     35.4ms vs 7.3ms

(1, 3, 256, 256) -> (1024, 1024)    True   True   bilinear  num_threads=2  2.0X   13.6ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bilinear  num_threads=2  2.2X   14.8ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bilinear  num_threads=2  1.3X   8.8ms vs 6.9ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bilinear  num_threads=2  1.2X   8.4ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)    True   True   bicubic   num_threads=2  1.8X   12.8ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bicubic   num_threads=2  4X     32.1ms vs 7.2ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bicubic   num_threads=2  1.4X   10.1ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bicubic   num_threads=2  2.9X   20.9ms vs 7.3ms

(1, 3, 224, 224) -> (64, 64)        True   True   bilinear  num_threads=1  1.4X   0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        True   False  bilinear  num_threads=1  0.7X   0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  True   bilinear  num_threads=1  1.3X   0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  False  bilinear  num_threads=1  1.4X   0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        True   True   bicubic   num_threads=1  2.1X   0.7ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        True   False  bicubic   num_threads=1  1.3X   0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  True   bicubic   num_threads=1  1.9X   0.6ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  False  bicubic   num_threads=1  1.0X   0.3ms vs 0.3ms

(1, 3, 224, 224) -> (64, 64)        True   True   bilinear  num_threads=2  1.0X   0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        True   False  bilinear  num_threads=2  0.6X   0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  True   bilinear  num_threads=2  0.8X   0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  False  bilinear  num_threads=2  1.4X   0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        True   True   bicubic   num_threads=2  1.4X   0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        True   False  bicubic   num_threads=2  1.2X   0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  True   bicubic   num_threads=2  1.2X   0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)        False  False  bicubic   num_threads=2  0.9X   0.3ms vs 0.3ms

(1, 3, 270, 268) -> (224, 224)      True   True   bilinear  num_threads=1  4X     2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      True   False  bilinear  num_threads=1  2.1X   1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)      False  True   bilinear  num_threads=1  3X     2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bilinear  num_threads=1  4X     2.4ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)      True   True   bicubic   num_threads=1  4X     2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      True   False  bicubic   num_threads=1  5X     3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  True   bicubic   num_threads=1  3X     2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bicubic   num_threads=1  4X     2.8ms vs 0.7ms

(1, 3, 270, 268) -> (224, 224)      True   True   bilinear  num_threads=2  1.5X   1.0ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      True   False  bilinear  num_threads=2  1.2X   0.8ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)      False  True   bilinear  num_threads=2  2.3X   1.5ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bilinear  num_threads=2  1.9X   1.2ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)      True   True   bicubic   num_threads=2  1.6X   1.2ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      True   False  bicubic   num_threads=2  4X     2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  True   bicubic   num_threads=2  2.4X   1.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bicubic   num_threads=2  2.8X   1.8ms vs 0.6ms

(1, 3, 1024, 1024) -> (256, 256)    True   True   bilinear  num_threads=1  2.1X   12.8ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bilinear  num_threads=1  0.6X   3.8ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bilinear  num_threads=1  1.2X   7.1ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bilinear  num_threads=1  1.9X   11.0ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)    True   True   bicubic   num_threads=1  2.0X   12.6ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bicubic   num_threads=1  1.0X   6.1ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bicubic   num_threads=1  1.8X   11.3ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bicubic   num_threads=1  0.8X   4.6ms vs 6.0ms

(1, 3, 1024, 1024) -> (256, 256)    True   True   bilinear  num_threads=2  1.6X   9.3ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bilinear  num_threads=2  0.3X   2.0ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bilinear  num_threads=2  1.2X   7.2ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bilinear  num_threads=2  0.3X   1.6ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256)    True   True   bicubic   num_threads=2  1.1X   7.1ms vs 6.5ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bicubic   num_threads=2  0.6X   3.3ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bicubic   num_threads=2  0.9X   5.9ms vs 6.3ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bicubic   num_threads=2  0.4X   2.4ms vs 5.9ms
```
Without AVX2 support: no significant speed-up, but there are various possible improvements (see TODOs in the code).
```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size -> output_size           channels_last  AA     mode      num_threads    speed-up  float vs uint8 (this PR)

(1, 3, 64, 64) -> (224, 224)        True   True   bilinear  num_threads=1  0.9X   1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224)        True   False  bilinear  num_threads=1  0.9X   1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224)        False  True   bilinear  num_threads=1  0.8X   0.9ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224)        False  False  bilinear  num_threads=1  1.5X   1.7ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224)        True   True   bicubic   num_threads=1  0.9X   1.6ms vs 1.8ms
(1, 3, 64, 64) -> (224, 224)        True   False  bicubic   num_threads=1  2.1X   3.9ms vs 1.9ms
(1, 3, 64, 64) -> (224, 224)        False  True   bicubic   num_threads=1  0.8X   1.1ms vs 1.4ms
(1, 3, 64, 64) -> (224, 224)        False  False  bicubic   num_threads=1  1.7X   2.4ms vs 1.5ms

(1, 3, 64, 64) -> (224, 224)        True   True   bilinear  num_threads=2  0.9X   0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)        True   False  bilinear  num_threads=2  0.9X   0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)        False  True   bilinear  num_threads=2  0.9X   0.5ms vs 0.6ms
(1, 3, 64, 64) -> (224, 224)        False  False  bilinear  num_threads=2  0.7X   0.5ms vs 0.7ms
(1, 3, 64, 64) -> (224, 224)        True   True   bicubic   num_threads=2  0.9X   0.9ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224)        True   False  bicubic   num_threads=2  2.1X   2.0ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224)        False  True   bicubic   num_threads=2  0.8X   0.6ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)        False  False  bicubic   num_threads=2  1.7X   1.3ms vs 0.8ms

(1, 3, 224, 224) -> (270, 268)      True   True   bilinear  num_threads=1  1.0X   3.0ms vs 3.0ms
(1, 3, 224, 224) -> (270, 268)      True   False  bilinear  num_threads=1  1.0X   2.8ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268)      False  True   bilinear  num_threads=1  1.0X   2.3ms vs 2.2ms
(1, 3, 224, 224) -> (270, 268)      False  False  bilinear  num_threads=1  1.4X   3.3ms vs 2.3ms
(1, 3, 224, 224) -> (270, 268)      True   True   bicubic   num_threads=1  1.0X   3.5ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268)      True   False  bicubic   num_threads=1  1.7X   6.1ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268)      False  True   bicubic   num_threads=1  0.9X   2.6ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268)      False  False  bicubic   num_threads=1  1.4X   4.2ms vs 2.9ms

(1, 3, 224, 224) -> (270, 268)      True   True   bilinear  num_threads=2  1.0X   1.7ms vs 1.7ms
(1, 3, 224, 224) -> (270, 268)      True   False  bilinear  num_threads=2  0.9X   1.6ms vs 1.8ms
(1, 3, 224, 224) -> (270, 268)      False  True   bilinear  num_threads=2  0.9X   1.3ms vs 1.4ms
(1, 3, 224, 224) -> (270, 268)      False  False  bilinear  num_threads=2  0.7X   1.1ms vs 1.6ms
(1, 3, 224, 224) -> (270, 268)      True   True   bicubic   num_threads=2  1.0X   2.0ms vs 2.0ms
(1, 3, 224, 224) -> (270, 268)      True   False  bicubic   num_threads=2  1.7X   3.2ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268)      False  True   bicubic   num_threads=2  0.8X   1.5ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268)      False  False  bicubic   num_threads=2  1.2X   2.3ms vs 1.9ms

(1, 3, 256, 256) -> (1024, 1024)    True   True   bilinear  num_threads=1  1.1X   34.7ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bilinear  num_threads=1  1.0X   31.2ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bilinear  num_threads=1  1.0X   23.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bilinear  num_threads=1  1.9X   42.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024)    True   True   bicubic   num_threads=1  0.9X   33.9ms vs 37.4ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bicubic   num_threads=1  2.2X   84.0ms vs 37.5ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bicubic   num_threads=1  1.0X   28.4ms vs 28.8ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bicubic   num_threads=1  2.0X   56.7ms vs 28.8ms

(1, 3, 256, 256) -> (1024, 1024)    True   True   bilinear  num_threads=2  1.1X   17.5ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bilinear  num_threads=2  1.1X   17.7ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bilinear  num_threads=2  0.8X   8.8ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bilinear  num_threads=2  1.0X   11.1ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024)    True   True   bicubic   num_threads=2  1.1X   19.9ms vs 18.8ms
(1, 3, 256, 256) -> (1024, 1024)    True   False  bicubic   num_threads=2  2.3X   42.5ms vs 18.7ms
(1, 3, 256, 256) -> (1024, 1024)    False  True   bicubic   num_threads=2  1.0X   14.1ms vs 14.5ms
(1, 3, 256, 256) -> (1024, 1024)    False  False  bicubic   num_threads=2  2.0X   28.4ms vs 14.5ms

(1, 3, 224, 224) -> (64, 64)        True   True   bilinear  num_threads=1  1.0X   0.6ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64)        True   False  bilinear  num_threads=1  0.7X   0.3ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)        False  True   bilinear  num_threads=1  0.9X   0.5ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64)        False  False  bilinear  num_threads=1  1.7X   0.6ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)        True   True   bicubic   num_threads=1  1.0X   0.8ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64)        True   False  bicubic   num_threads=1  1.1X   0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)        False  True   bicubic   num_threads=1  0.9X   0.7ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64)        False  False  bicubic   num_threads=1  0.9X   0.4ms vs 0.4ms

(1, 3, 224, 224) -> (64, 64)        True   True   bilinear  num_threads=2  1.0X   0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)        True   False  bilinear  num_threads=2  0.8X   0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  True   bilinear  num_threads=2  0.9X   0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  False  bilinear  num_threads=2  1.3X   0.3ms vs 0.2ms
(1, 3, 224, 224) -> (64, 64)        True   True   bicubic   num_threads=2  1.0X   0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)        True   False  bicubic   num_threads=2  1.3X   0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)        False  True   bicubic   num_threads=2  0.9X   0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)        False  False  bicubic   num_threads=2  1.2X   0.3ms vs 0.3ms

(1, 3, 270, 268) -> (224, 224)      True   True   bilinear  num_threads=1  0.8X   2.1ms vs 2.5ms
(1, 3, 270, 268) -> (224, 224)      True   False  bilinear  num_threads=1  0.7X   1.6ms vs 2.4ms
(1, 3, 270, 268) -> (224, 224)      False  True   bilinear  num_threads=1  1.2X   2.4ms vs 2.1ms
(1, 3, 270, 268) -> (224, 224)      False  False  bilinear  num_threads=1  1.3X   2.6ms vs 2.0ms
(1, 3, 270, 268) -> (224, 224)      True   True   bicubic   num_threads=1  1.1X   3.4ms vs 3.0ms
(1, 3, 270, 268) -> (224, 224)      True   False  bicubic   num_threads=1  1.7X   4.8ms vs 2.8ms
(1, 3, 270, 268) -> (224, 224)      False  True   bicubic   num_threads=1  1.1X   2.9ms vs 2.7ms
(1, 3, 270, 268) -> (224, 224)      False  False  bicubic   num_threads=1  1.4X   3.5ms vs 2.4ms

(1, 3, 270, 268) -> (224, 224)      True   True   bilinear  num_threads=2  0.9X   1.2ms vs 1.3ms
(1, 3, 270, 268) -> (224, 224)      True   False  bilinear  num_threads=2  1.3X   1.6ms vs 1.2ms
(1, 3, 270, 268) -> (224, 224)      False  True   bilinear  num_threads=2  0.8X   0.9ms vs 1.1ms
(1, 3, 270, 268) -> (224, 224)      False  False  bilinear  num_threads=2  1.3X   1.3ms vs 1.0ms
(1, 3, 270, 268) -> (224, 224)      True   True   bicubic   num_threads=2  1.4X   2.2ms vs 1.6ms
(1, 3, 270, 268) -> (224, 224)      True   False  bicubic   num_threads=2  1.9X   2.8ms vs 1.5ms
(1, 3, 270, 268) -> (224, 224)      False  True   bicubic   num_threads=2  0.8X   1.1ms vs 1.4ms
(1, 3, 270, 268) -> (224, 224)      False  False  bicubic   num_threads=2  1.7X   2.1ms vs 1.3ms

(1, 3, 1024, 1024) -> (256, 256)    True   True   bilinear  num_threads=1  1.0X   10.0ms vs 9.9ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bilinear  num_threads=1  0.7X   4.6ms vs 6.2ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bilinear  num_threads=1  0.9X   9.1ms vs 9.8ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bilinear  num_threads=1  1.7X   9.4ms vs 5.7ms
(1, 3, 1024, 1024) -> (256, 256)    True   True   bicubic   num_threads=1  1.0X   15.2ms vs 14.8ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bicubic   num_threads=1  1.0X   7.6ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bicubic   num_threads=1  0.9X   13.3ms vs 14.4ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bicubic   num_threads=1  0.8X   5.9ms vs 7.0ms

(1, 3, 1024, 1024) -> (256, 256)    True   True   bilinear  num_threads=2  1.2X   6.0ms vs 5.2ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bilinear  num_threads=2  0.7X   2.3ms vs 3.2ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bilinear  num_threads=2  1.0X   4.8ms vs 5.0ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bilinear  num_threads=2  0.7X   1.9ms vs 2.9ms
(1, 3, 1024, 1024) -> (256, 256)    True   True   bicubic   num_threads=2  1.6X   12.3ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256)    True   False  bicubic   num_threads=2  1.0X   3.9ms vs 3.9ms
(1, 3, 1024, 1024) -> (256, 256)    False  True   bicubic   num_threads=2  1.0X   7.0ms vs 7.3ms
(1, 3, 1024, 1024) -> (256, 256)    False  False  bicubic   num_threads=2  0.9X   3.0ms vs 3.5ms
```
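To make concrete what the two columns compare, here is a minimal sketch of both code paths. The `float` row is the uint8 -> float32 -> `interpolate()` -> round/clamp -> uint8 round-trip that `Resize()` currently performs; the uint8 row calls `interpolate()` on the uint8 tensor directly, which this PR enables for CPU bilinear resizing. The input shape and the final comparison line are illustrative only, not part of the PR's claims:

```py
import torch
import torch.nn.functional as F

img = torch.randint(0, 256, (1, 3, 270, 268), dtype=torch.uint8)

# "float" baseline: round-trip through float32, which is what torchvision's
# Resize() has to do today because there was no native uint8 bilinear kernel.
out_float = F.interpolate(img.float(), size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)
out_float = out_float.round().clamp(0, 255).to(torch.uint8)

# "uint8 (this PR)": feed the uint8 tensor directly; the native kernel
# accumulates in integer arithmetic and clamps back to uint8 internally.
out_uint8 = F.interpolate(img, size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)

# The two paths should closely agree; small per-pixel rounding differences
# are expected because the uint8 kernel uses fixed-point weights.
print((out_float.int() - out_uint8.int()).abs().max())
```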
Benchmark code
```py
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for interpolate operator."""


class InterpolateBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, input_size, output_size, channels_last=False, mode='linear', antialias=False, dtype=torch.float):
        input_image = torch.randint(0, 256, size=input_size, dtype=torch.uint8, device='cpu')
        if channels_last:
            input_image = input_image.contiguous(memory_format=torch.channels_last)

        self.inputs = {
            "input_image": input_image,
            "output_size": output_size,
            "mode": mode,
            "antialias": antialias,
            "dtype": dtype,
        }

        self.set_module_name("interpolate")

    def forward(self, input_image, output_size, mode, antialias, dtype):
        if dtype == torch.float:
            input_image = input_image.float()

        out = torch.nn.functional.interpolate(input_image, size=output_size, mode=mode,
                                              align_corners=False, antialias=antialias)
        if dtype == torch.float:
            out = out.round().clamp(min=0, max=255).to(torch.uint8)


def make_config():
    sizes = (
        ((224, 224), (64, 64)),
        ((270, 268), (224, 224)),
        ((256, 256), (1024, 1024)),
    )

    attrs = []
    for (HW1, HW2) in sizes:
        attrs.append([(1, 3, *HW1), HW2])  # 3 channels
        # attrs.append([(1, 1, *HW1), HW2])  # 1 channel
        attrs.append([(1, 3, *HW2), HW1])  # 3 channels
        # attrs.append([(1, 1, *HW2), HW1])  # 1 channel

    config = op_bench.config_list(
        attr_names=["input_size", "output_size"],
        attrs=attrs,
        cross_product_configs={
            'channels_last': [True, False],
            'mode': ["bilinear", "bicubic"],
            'antialias': [True, False],
            # 'dtype': [torch.float, torch.uint8]
            # 'dtype': [torch.uint8]
            'dtype': [torch.float]
        },
        tags=["short"],
    )

    return config


config = make_config()
op_bench.generate_pt_test(config, InterpolateBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
```

```py
import re
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("f1", nargs="?", default="main")
parser.add_argument("f2", nargs="?", default="new")
args = parser.parse_args()

with open(args.f1) as f:
    main = f.readlines()
with open(args.f2) as f:
    new = f.readlines()

out = []

for main_line, new_line in zip(main, new):
    # num_threads=1  # TODO: remove
    if main_line.startswith("num_threads="):
        num_threads = int(main_line.split("=")[-1])
    if main_line.startswith("# Input"):
        deets = f"{main_line.strip()}, {num_threads=}"
    if main_line.startswith("Forward"):
        main_time = float(main_line.split()[-1])
        new_time = float(new_line.split()[-1])
        ratio = main_time / new_time
        fmt = ".1f" if ratio < 3 else ".0f"
        improv = f"{ratio:{fmt}}X"
        time_fmt = ",.3f" if new_time < 100 else ",.1f"
        deets = deets.strip().replace("# Input: ", "")
        deets = deets.replace(": ", "=")
        deets = deets.replace("input_size=", "")
        deets = deets.replace(", output_size=", " -> ")
        deets = deets.replace("dtype=torch.", "")
        deets = deets.replace("mode=", "")
        deets = deets.replace("antialias=", "")
        deets = deets.replace("channels_last=", "")
        # deets = deets.replace("channels_last=True, ", "")
        split = deets.split(",")
        # size = ','.join(split[:-3])
        # mode, dtype, threads = split[-3:]
        # deets = f"{size:<30} {mode:<15} {dtype:<10} {threads:<15}"

        size = ','.join(split[:-5])
        channels_last, mode, antialias, dtype, threads = split[-5:]
        deets = f"{size:<33} {channels_last:<7} {antialias:<7} {mode:<10} {threads:<15}"

        l = f"{deets} {improv:<5} {main_time / 1000:{time_fmt}}ms vs {new_time / 1000:{time_fmt}}ms"
        out.append(l)


def key(s):
    # s = ''.join(s.split()[1:])  # remove "N.nX" part
    num_threads = (int(re.findall(r"num_threads=(\d+)", s)[0]),)
    input_shape, output_shape = re.findall(r"\(.*?\)", s)
    input_shape = input_shape[1:-1]  # remove parenthesis
    input_HW = tuple(int(x) for x in input_shape.split(",")[-2:])
    input_C = (-int(input_shape.split(",")[1]),)
    output_HW = tuple(int(x) for x in output_shape[1:-1].split(","))
    is_downsample = (output_HW[0] < input_HW[0],)
    if "linear" in s:
        mode = "linear"
    elif "nearest-exact" in s:
        mode = "nearest-exact"
    else:
        # assert "nearest" in s
        mode = "nearest"
    mode = (mode,)
    return is_downsample + input_HW + output_HW + num_threads + input_C + mode


for i, l in enumerate(sorted(out, key=key)):
    if i % 8 == 0:
        print()
    # if i % 10 == 0 and i % 40 != 0:
    #     print()
    # if i % 40 == 0:
    #     print("-" * 100)
    print(l)
```
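As background for the diff below: for uint8 inputs the kernels avoid float multiplications by rescaling the interpolation weights to int16, as described in `Note [ Weights computation for uint8_t and multiplication trick ]` in UpSampleKernel.cpp. The following standalone Python sketch illustrates that trick only; the helper names here are invented for illustration, and the real logic lives in `_compute_indices_int16_weights_aa` and the uint8 `basic_loop_aa_*` specializations:

```py
# Standalone illustration of the "multiplication trick": replace
# (float weight) * (uint8 pixel) with an int16 * int multiply followed by a
# rounding shift, so the whole weighted sum stays in integer arithmetic.

def quantize_weight(w: float, precision: int) -> int:
    # round(w * 2**precision); negative weights round away from zero,
    # mirroring the sign handling in the C++ weight-rescaling code
    return int(w * (1 << precision) + (0.5 if w >= 0 else -0.5))

def weighted_sum_uint8(pixels, weights, precision=16):
    # The actual kernel picks `precision` so the largest weight fits in int16.
    int_weights = [quantize_weight(w, precision) for w in weights]
    acc = 1 << (precision - 1)      # adds 0.5 in fixed point so the shift rounds
    for p, wq in zip(pixels, int_weights):
        acc += p * wq               # integer multiply-accumulate, no float math
    return min(255, max(0, acc >> precision))  # shift back and clamp to [0, 255]

# Example: weights summing to 1, as produced for (anti-aliased) bilinear resize
pixels = [10, 200, 30]
weights = [0.25, 0.5, 0.25]
float_result = round(sum(p * w for p, w in zip(pixels, weights)))  # 110
assert weighted_sum_uint8(pixels, weights) == float_result
```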
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90771 Approved by: https://github.com/peterbell10, https://github.com/ngimel --- NOTICE | 38 + aten/src/ATen/native/cpu/UpSampleKernel.cpp | 577 +++++++++++--- .../native/cpu/UpSampleKernelAVXAntialias.h | 719 ++++++++++++++++++ test/test_nn.py | 130 ++-- .../_internal/common_methods_invocations.py | 4 +- 5 files changed, 1327 insertions(+), 141 deletions(-) create mode 100644 aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h diff --git a/NOTICE b/NOTICE index 5abaac479a75..6effb8b5d707 100644 --- a/NOTICE +++ b/NOTICE @@ -416,3 +416,41 @@ derivation and reference the following license: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +======================================================================= +PILLOW-SIMD Software License +======================================================================= + +Code derived from implementations in PILLOW-SIMD should mention its derivation +and reference the following license: + + The Python Imaging Library (PIL) is + + Copyright © 1997-2011 by Secret Labs AB + Copyright © 1995-2011 by Fredrik Lundh + + Pillow is the friendly PIL fork. It is + + Copyright © 2010-2022 by Alex Clark and contributors + + Like PIL, Pillow is licensed under the open source HPND License: + + By obtaining, using, and/or copying this software and/or its associated + documentation, you agree that you have read, understood, and will comply + with the following terms and conditions: + + Permission to use, copy, modify, and distribute this software and its + associated documentation for any purpose and without fee is hereby granted, + provided that the above copyright notice appears in all copies, and that + both that copyright notice and this permission notice appear in supporting + documentation, and that the name of Secret Labs AB or the author not be + used in advertising or publicity pertaining to distribution of the software + without specific, written prior permission. + + SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS + SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. + IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, + INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + PERFORMANCE OF THIS SOFTWARE. diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 7b8bd9ad65d3..1f471d495df7 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -22,12 +23,53 @@ namespace { using scale_t = std::vector>; +// TODO: this file could benefit from a global renaming of its functions / +// classes and terms, as well as from adding more comments. In particular: +// - It's not obvious that despite their names (and the file name), all these +// kernels don't just do upsampling: they do general interpolation, i.e. they +// also all support downscaling. +// - the term "horizontal" or "within dims" or "contiguous dim" refers to the +// last dimension. +// It's not specific to 2D images and applies to 3D (and 1D??) inputs as well. 
+// Similarly "vertical" or "across dims" refers to all dims that aren't the +// last one. In other kernels these are also referred to as "zero-stride" and +// "non-zero-stride" - we should unify all this. +// - the terms "zero-stride" and "non-zero strides" refer to the weights and +// indices, not to the contiguity of input or output +// - It's not always clear which kernel is vectorized and which one isn't. +// - The functions like _use_vectorized_kernel_cond() should be renamed and +// their description updated, because they're not the only "fork" in the +// code-path where a choice is made between a vectorized kernel vs a +// non-vectorized one. See e.g. upsample_bilinear2d_kernel_impl() where we +// already make a similar check, before the one in +// _use_vectorized_kernel_cond(). +// - It's not always clear which code is part of a "separable interpolation" +// code-path. +// - Some names need to be more specific. For example +// "cpu_upsample_generic_aa()" looks like a super generic name, but the function +// is instead fairly specific - we need to make that clearer. +// - Some functions have a "aa" suffix but it doesn't mean that they only +// support antialias. Some of them also support antialias=False now. +// - Various comments are outdated. Case in point: the one just below about the +// `Interpolate` struct being used for cpu_upsample_linear: +// cpu_upsample_linear doesn't exist anymore, and these structs are used for +// various modes, *not* just linear. +// - It'd be useful to document how interpolation works in general, and in particular state explicitly: +// - that the weights and indices across a given dimension are the same for +// all pixels (hence the benefit of pre-computing them) +// - that it can be "separated", i.e. we can do the horizontal pass and the +// vertical pass independently (and that some kernels are written this way, +// while some aren't.) +// - we can probably remove the template over index_t, because it's always +// hard-coded as int64_t + + // Helper structs and methods for cpu_upsample_linear // // Interpolation methods that used below are separable, and as such we can compute the interpolation // independently per dimension in a recursive way. Please, refer to #10482 for more context. // -// Linear Interpolation structure to compute output value in n-dimensional case. +// Interpolation structure to compute output value in n-dimensional case. 
// - recursively compute interpolated output for each dimension // - we rely a lot on compiler's code optimization such that implemented operations // can be automatically factorized and vectorized using SSE and AVX2 @@ -255,48 +297,129 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { } } -template -static inline void basic_loop_aa_single_dim_zero_strides( +template +static inline void basic_loop_aa_vertical( char** data, const int64_t* strides, - int64_t n) { + int64_t n, + unsigned int weights_precision) { char* dst = data[0]; char* src = data[1]; // index stride is constant for the given dimension - const index_t ids_stride = *(index_t*)&data[2 + 2][0]; + const int64_t ids_stride = *(int64_t*)&data[2 + 2][0]; for (const auto i : c10::irange(n)) { *(scalar_t*)&dst[i * strides[0]] = - interpolate_aa_single_dim_zero_strides( + interpolate_aa_single_dim_zero_strides( src + i * strides[1], &data[2], ids_stride); } } -template -static inline void basic_loop_aa_single_dim_nonzero_strides( +template <> +inline void basic_loop_aa_vertical( char** data, const int64_t* strides, - int64_t n) { + int64_t n, + unsigned int weights_precision) { + // See Note [ Weights computation for uint8_t and multiplication trick ] + char* dst = data[0]; + char* src = data[1]; + + // index stride is constant for the given dimension + const int64_t ids_stride = *(int64_t*)&data[2 + 2][0]; + const int64_t ids_size = *(int64_t*)&data[2 + 1][0]; + const int64_t ids_min = *(int64_t*)&data[2 + 0][0]; + + int64_t i = 0; + + for (; i> weights_precision, 0, 255); + } +} + +template +static inline void basic_loop_aa_horizontal( + char** data, + const int64_t* strides, + int64_t n, + unsigned int weights_precision) { char* dst = data[0]; char* src = data[1]; // index stride is constant for the given dimension - const index_t ids_stride = *(index_t*)&data[2 + 2][0]; + const int64_t ids_stride = *(int64_t*)&data[2 + 2][0]; if (strides[1] == 0) { for (const auto i : c10::irange(n)) { *(scalar_t*)&dst[i * strides[0]] = - interpolate_aa_single_dim( + interpolate_aa_single_dim( src, &data[2], &strides[2], i, ids_stride); } } else { for (const auto i : c10::irange(n)) { *(scalar_t*)&dst[i * strides[0]] = - interpolate_aa_single_dim( + interpolate_aa_single_dim( src + i * strides[1], &data[2], &strides[2], i, ids_stride); } } } +template <> +inline void basic_loop_aa_horizontal( + char** data, + const int64_t* strides, + int64_t n, + unsigned int weights_precision) { + // See Note [ Weights computation for uint8_t and multiplication trick ] + char* dst = data[0]; + char* src = data[1]; + // index stride is constant for the given dimension + const int64_t ids_stride = *(int64_t*)&data[2 + 2][0]; + + int64_t i = 0; + + // Here we are implementing data interpolation within the same line (vs between the lines) + // output[x, y] = input[xmin[x], y] * W[x] + input[xmin[x] + 1, y] * W[x + 1] + ... + input[xmin[x] + xsize, y] * W[x + xsize] + + for (; i> weights_precision, 0, 255); + } +} + // Generic upsampling computation method using TensorIterator for Nd case. 
// Supports: nearest, linear, cubic modes with interp_size template argument: 1, 2, 4 // @@ -621,21 +744,23 @@ struct HelperInterpBase { template static inline void _compute_weights_aa( const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support, - scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn, - int64_t& xmin, int64_t& xsize + scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn, + int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta ) { - scalar_t center = scale * (i + 0.5); + // align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false + // is 0.0 otherwise + scalar_t center = scale * (i + 0.5 - align_corners_delta); scalar_t total_w = 0.0; - scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; + scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0; xmin = std::max( - static_cast(center - support + 0.5), static_cast(0)); - xsize = std::min(static_cast(center + support + 0.5), input_size) - - xmin; + static_cast(center - support + 0.5 + align_corners_delta), static_cast(0)); + xsize = std::min( + static_cast(center + support + 0.5 + align_corners_delta), input_size) - xmin; int64_t j = 0; for (; j < xsize; j++) { - scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale); + scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale); wt_ptr[j] = w; total_w += w; } @@ -644,23 +769,39 @@ struct HelperInterpBase { wt_ptr[j] /= total_w; } } - for (; j < interp_size; j++) { + for (; j < max_interp_size; j++) { wt_ptr[j] = static_cast(0.0); } } - template - static inline std::vector _compute_indices_weights_aa( + // Note [ Support for antialias=False as a subcase of antilias=True ] + // This function was originally written with the hard assumption that + // antialias=True (hence the aa in the name). It was later extended to support + // antialias=False. The only difference between aa and no-aa is in how the + // weights and indices are computed (and their number). In aa their number is + // variable but with no-aa, they're fixed to interp_size. The same "filters" + // can be used otherwise. HOWEVER, support for antialias=False here may not be + // optimally optimized: the code assumes an arbitrary number of weights and + // indices, but this can be optimized further when aa=False since we know + // their actual dimensions. + template + static inline std::tuple, int> _compute_indices_weights_aa( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, scalar_t scale, - int interp_size, aa_filter_fn_t aa_filter_fn + int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta ) { std::vector output; - scalar_t support = - (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5; - interp_size = (int)ceilf(support) * 2 + 1; + scalar_t support; + int max_interp_size; + if (antialias) { + support = (scale >= 1.0) ? 
(interp_size * 0.5) * scale : interp_size * 0.5; + max_interp_size = (int) std::ceil(support) * 2 + 1; + } else { + support = interp_size * 0.5; + max_interp_size = interp_size; + } auto new_shape = std::vector(ndims, 1); new_shape[reshape_dim] = output_size; @@ -675,7 +816,7 @@ struct HelperInterpBase { { // Weights - new_shape[reshape_dim] = output_size * interp_size; + new_shape[reshape_dim] = output_size * max_interp_size; auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType())); auto strides = wts.strides().vec(); strides[reshape_dim] = 0; @@ -701,20 +842,130 @@ struct HelperInterpBase { input_size, scale, support, - wt_ptr + i * interp_size, - interp_size, + wt_ptr + i * max_interp_size, + max_interp_size, aa_filter_fn, xmin, - xmax); + xmax, + antialias, + align_corners_delta); idx_ptr_xmin[i] = xmin * stride; idx_ptr_size[i] = xmax; idx_ptr_stride[i] = stride; - wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t); + wt_idx_ptr[i] = i * max_interp_size * weight_index_stride; } - return output; + return {output, max_interp_size}; } + /* + NOTE [ Weights computation for uint8_t and multiplication trick ] + When the input/output dtype is uint8_t, we still compute the interpolation + weights as double, but then convert them to int16 via some conversion logic + detailed below. This allows us to compute all interpolation operation (sum of + multiplications) as ints instead of floats. The result is converted back into + uint8 in basic_loop_aa_horizontal (and vertical) + + In essence the idea is to avoid a multiplication between a float (the + weight) and an int (the pixel value) and instead run a multpilication between + 2 ints: + + ```py + COEF_PREC = 16 + + def mul(a:float, b:int) -> Tuple[float, int]: + # return a * b, round(a * b) + actual = a * b + + assert a > 0 # I'm lazy + int_a = floor(0.5 + a * (1 << COEF_PREC)) + with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC + + return actual, with_trick # round(actual) == with_trick!! + ``` + + Here's how it works: + N == COEFF_PREC + 1 << N == 2**N + floor(0.5 + x) == round(x) + + So the operation is something like + + int_a = round(a * 2**N) -- let's just say it's `a * 2**N` for simplicity + + res = ((int_a * b) + (1 << (N - 1))) >> N + = ((a * 2**N * b + 2**(N - 1)) / 2**N + = a * b + 0.5 + = round(a * b) + = what we wanted + */ + template + static inline std::tuple, int, unsigned int> _compute_indices_int16_weights_aa( + int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, + int64_t reshape_dim, bool align_corners, const c10::optional opt_scale, + int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false + ) { + + double scale = area_pixel_compute_scale( + input_size, output_size, align_corners, opt_scale); + + std::vector indices_weights; + auto align_corners_delta = (align_corners && !antialias) ? 
0.5 : 0.0; + std::tie(indices_weights, interp_size) = HelperInterpBase::_compute_indices_weights_aa( + input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta); + + // Rescale float weights to int16 and compute weights precision + auto weights_f64 = indices_weights[3]; + double * data_f64 = weights_f64.data_ptr(); + int64_t weights_f64_size = output_size * interp_size; + // can't use weights_f64.max() here as tensor is restrided + double w_max = data_f64[0]; + for (const auto i : c10::irange(weights_f64_size)) { + double v = data_f64[i]; + if (w_max < v) { + w_max = v; + } + } + + unsigned int weights_precision = 0; + for (weights_precision = 0; weights_precision < 22; weights_precision += 1) { + int next_value = (int) (0.5 + w_max * (1 << (weights_precision + 1))); + if (next_value >= (1 << 15)) + break; + } + + // Rescale float values to int16 + int16_t * data_i16 = (int16_t *) data_f64; + auto aligned_interp_size = interp_size; + + if (align_i32) { + // We should respect int32 alignment as + // we will load data as int32 with AVX2 + // See ImagingResampleHorizontalConvolution8u4x, mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]); + // compute aligned_interp_size = nearest pair value to interp_size + while (aligned_interp_size % sizeof(int32_t) != 0) { + aligned_interp_size += 1; + } + // assert that we wont go out of bounds + TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double)); + } + + for (const auto j : c10::irange(output_size)) { + for (const auto k : c10::irange(interp_size)) { + double v = data_f64[j * interp_size + k]; + if (v < 0) { + data_i16[j * aligned_interp_size + k] = (int) (-0.5 + v * (1 << weights_precision)); + } else { + data_i16[j * aligned_interp_size + k] = (int) (0.5 + v * (1 << weights_precision)); + } + } + } + + return {indices_weights, aligned_interp_size, weights_precision}; + } + + + }; struct HelperInterpNearest : public HelperInterpBase { @@ -923,8 +1174,9 @@ struct HelperInterpLinear : public HelperInterpBase { input_size, output_size, align_corners, opt_scale); auto interp_size = HelperInterpLinear::interp_size; + int unused; - indices_weights = HelperInterpLinear::_compute_indices_weights_aa( + std::tie(indices_weights, unused) = HelperInterpLinear::_compute_indices_weights_aa( input_size, output_size, stride, @@ -932,11 +1184,32 @@ struct HelperInterpLinear : public HelperInterpBase { reshape_dim, scale, interp_size, - &HelperInterpLinear::aa_filter); + &HelperInterpLinear::aa_filter, + /*antialias=*/true, + /*align_corners_delta=*/0.0); } ); return indices_weights; } + + static inline std::tuple, int, unsigned int> compute_indices_int16_weights_aa( + int64_t input_size, + int64_t output_size, + int64_t stride, + int64_t ndims, + int64_t reshape_dim, + bool align_corners, + const c10::optional opt_scale, + bool antialias, + bool align_i32=false + ) { + + auto interp_size = HelperInterpLinear::interp_size; + auto fn = HelperInterpLinear::aa_filter; + return HelperInterpLinear::_compute_indices_int16_weights_aa( + input_size, output_size, stride, ndims, reshape_dim, + align_corners, opt_scale, interp_size, fn, antialias, align_i32); + } }; struct HelperInterpCubic : public HelperInterpBase { @@ -1033,8 +1306,9 @@ struct HelperInterpCubic : public HelperInterpBase { input_size, output_size, align_corners, opt_scale); auto interp_size = HelperInterpCubic::interp_size; + int unused; - indices_weights = HelperInterpCubic::_compute_indices_weights_aa( + 
std::tie(indices_weights, unused) = HelperInterpCubic::_compute_indices_weights_aa( input_size, output_size, stride, @@ -1042,11 +1316,14 @@ struct HelperInterpCubic : public HelperInterpBase { reshape_dim, scale, interp_size, - &HelperInterpCubic::aa_filter); + &HelperInterpCubic::aa_filter, + /*antialias=*/true, + /*align_corners_delta*/0.0); } ); return indices_weights; } + }; // Generic upsampling interpolation kernel for N-d case. @@ -1133,31 +1410,50 @@ void upsample_generic_Nd_kernel_impl( } } -template -void cpu_upsample_generic_aa(at::TensorIterator& iter) { +template +void cpu_upsample_generic_aa(at::TensorIterator& iter, unsigned int weights_precision) { auto loop = [&](char** data, const int64_t* strides, int64_t n) { - if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) && - is_zero_stride<3 + 2>(&strides[2])) { - basic_loop_aa_single_dim_zero_strides( - data, strides, n); + if (is_horizontal) { + + // Strides are : X 0 | 8 8 8 0 8 (Channels first) + // Strides are : X X | 0 0 0 0 0 (Channels last) + // upsampling data within a contiguous dimension (aka horizontal resampling) + if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) && + is_zero_stride<3 + 2>(&strides[2])) { + // channels last case + basic_loop_aa_horizontal( + data, strides, n, weights_precision); + } else { + basic_loop_aa_horizontal( + data, strides, n, weights_precision); + } } else { - basic_loop_aa_single_dim_nonzero_strides( - data, strides, n); + // Strides are : X Y | 0 0 0 0 0 (Channels first) + // Strides are : X X | 0 0 0 0 0 (Channels last) + // upsampling data between contiguous dimensions (aka vertical resampling) + if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) && + is_zero_stride<3 + 2>(&strides[2])) { + basic_loop_aa_vertical( + data, strides, n, weights_precision); + } else { + basic_loop_aa_vertical( + data, strides, n, weights_precision); + } } }; iter.for_each(loop); } -// Generic separable upsampling interpolation kernels for N-d case with anti-aliasing -template +template void _separable_upsample_generic_Nd_kernel_impl_single_dim( const Tensor& output, const Tensor& input, int interp_dim, bool align_corners, - const scale_type& scales) { + const scale_type& scales, + bool antialias) { // input can be NCHW, NCL or NCKHW auto shape = input.sizes().vec(); @@ -1174,21 +1470,29 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( strides[interp_dim] = 0; auto restrided_input = input.as_strided(shape, strides); - std::vector> indices_weights; - - int interp_size = F::interp_size; auto input_scalar_type = input.scalar_type(); - if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) { - // nearest also supports uint8 tensor, but we have to use float - // with compute_indices_weights - input_scalar_type = at::ScalarType::Float; - } - indices_weights.emplace_back( + std::vector indices_weights; + unsigned int weights_precision = 0; + int unused; + + if (input_scalar_type == at::kByte) { + std::tie(indices_weights, unused, weights_precision) = + // TODO: change that to F:: once / if bicubic mode supports uint8 after all + HelperInterpLinear::compute_indices_int16_weights_aa( + input.size(interp_dim), oshape[interp_dim], + input.stride(interp_dim) * input.element_size(), + input.dim(), interp_dim, align_corners, scales[interp_dim - 2], + antialias); + TORCH_INTERNAL_ASSERT(weights_precision > 0); + } else { + TORCH_INTERNAL_ASSERT(antialias); + indices_weights = F::compute_indices_weights_aa( input_scalar_type, 
input.size(interp_dim), oshape[interp_dim], input.stride(interp_dim) * input.element_size(), - input.dim(), interp_dim, align_corners, scales[interp_dim - 2])); + input.dim(), interp_dim, align_corners, scales[interp_dim - 2]); + } TensorIteratorConfig config; config.check_all_same_dtype(false) @@ -1196,51 +1500,95 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( .add_output(output) .add_input(restrided_input); - for (auto& idx_weight : indices_weights) { - for (auto& tensor : idx_weight) { - config.add_input(tensor); - } + for (auto& tensor : indices_weights) { + config.add_input(tensor); } auto iter = config.build(); - if (interp_size > 1) { - // Nearest also supports uint8 tensor, so need to handle it separately - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] { - cpu_upsample_generic_aa(iter); - }); - } else { - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] { - cpu_upsample_generic_aa(iter); - }); - } + AT_DISPATCH_FLOATING_TYPES_AND( + at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] { + cpu_upsample_generic_aa(iter, weights_precision); + }); } +// Generic separable upsampling interpolation kernel for N-d case with anti-aliasing. +// It also supports antialias=False iff +// (dtype == uint8 and mode in ("bilinear", "bicubic")): this is used as +// fallback in these settings when AVX isn't supported. template void separable_upsample_generic_Nd_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - const scale_type& scales) { + const scale_type& scales, + bool antialias) { + + auto output_shape = output.sizes(); + auto input_shape = input.sizes(); + auto temp_oshape = input_shape.vec(); + + if (output_shape == input_shape) { + output.copy_(input); + return; + } - auto temp_oshape = input.sizes().vec(); at::Tensor temp_output, temp_input = input; - for (const auto i : c10::irange(out_ndims - 1)) { - int interp_dim = 2 + out_ndims - 1 - i; - temp_oshape[interp_dim] = output.sizes()[interp_dim]; - temp_output = at::empty(temp_oshape, input.options().memory_format(input.suggest_memory_format())); + + int interp_dim = 0; + // Precompute the number of single dim resize method invocations + // to avoid copying temporary buffer to output + int num_single_dim_ops = 0; + for (const auto i : c10::irange(out_ndims)) { + interp_dim = 2 + out_ndims - 1 - i; + if (output_shape[interp_dim] != input_shape[interp_dim]) { + num_single_dim_ops += 1; + } + } + + // upsampling data within the contiguous dimension (aka horizontal resampling) + interp_dim = 2 + out_ndims - 1; + if (output_shape[interp_dim] != input_shape[interp_dim]) { + + num_single_dim_ops -= 1; + if (num_single_dim_ops > 0) { + temp_oshape[interp_dim] = output_shape[interp_dim]; + temp_output = at::empty(temp_oshape, input.options()); + } else { + temp_output = output; + } + _separable_upsample_generic_Nd_kernel_impl_single_dim< out_ndims, scale_t, - F>( - temp_output, temp_input, interp_dim, align_corners, scales); + F, + true>( + temp_output, temp_input, interp_dim, align_corners, scales, antialias); temp_input = temp_output; } - _separable_upsample_generic_Nd_kernel_impl_single_dim< - out_ndims, - scale_t, - F>(output, temp_input, 2, align_corners, scales); + + // upsampling data between contiguous dimensions (aka vertical resampling) + for (const auto i : c10::irange(1, out_ndims)) { + interp_dim = 2 + out_ndims - 1 - i; + if (output_shape[interp_dim] != input_shape[interp_dim]) { + + num_single_dim_ops -= 
1; + if (num_single_dim_ops > 0) { + temp_oshape[interp_dim] = output_shape[interp_dim]; + temp_output = at::empty(temp_oshape, input.options()); + } else { + temp_output = output; + } + + _separable_upsample_generic_Nd_kernel_impl_single_dim< + out_ndims, + scale_t, + F, + false>( + temp_output, temp_input, interp_dim, align_corners, scales, antialias); + temp_input = temp_output; + } + } } void upsample_nearest1d_kernel_impl( @@ -1356,7 +1704,8 @@ void upsample_linear1d_kernel_impl( output, input, align_corners, {scales_w}); } -void upsample_bilinear2d_kernel_impl( + +void upsample_bilinear2d_kernel_impl_float( const Tensor& output, const Tensor& input, bool align_corners, @@ -1378,15 +1727,56 @@ void upsample_bilinear2d_kernel_impl( } } -void upsample_bilinear2d_aa_kernel_impl( +void upsample_bilinear2d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, c10::optional scales_h, c10::optional scales_w) { + if (input.dtype() == at::kByte){ + #ifdef CPU_CAPABILITY_AVX2 + if (input.size(1) <= 4) { + upsample_avx_bilinear_uint8(input, + output, align_corners, {scales_h, scales_w}, + /*antialias=*/false); + } else { + separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>( + output, input, align_corners, {scales_h, scales_w}, + /*antialias=*/false); + } + #else // CPU_CAPABILITY_AVX2 + separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>( + output, input, align_corners, {scales_h, scales_w}, + /*antialias=*/false); + #endif // CPU_CAPABILITY_AVX2 + } else { + upsample_bilinear2d_kernel_impl_float(output, input, align_corners, scales_h, scales_w); + } +} + + +void upsample_bilinear2d_aa_kernel_impl( + const Tensor& output, + const Tensor& input, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w) { +#ifdef CPU_CAPABILITY_AVX2 + if (input.dtype() == at::kByte && input.size(1) <= 4) { + upsample_avx_bilinear_uint8( + input, output, align_corners, {scales_h, scales_w}, + /*antialias=*/true); + } else { + separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>( + output, input, align_corners, {scales_h, scales_w}, + /*antialias=*/true); + } +#else // CPU_CAPABILITY_AVX2 separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>( - output, input, align_corners, {scales_h, scales_w}); + output, input, align_corners, {scales_h, scales_w}, + /*antialias=*/true); +#endif // CPU_CAPABILITY_AVX2 } void upsample_trilinear3d_kernel_impl( @@ -1424,7 +1814,8 @@ void upsample_bicubic2d_aa_kernel_impl( c10::optional scales_w) { separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>( - output, input, align_corners, {scales_h, scales_w}); + output, input, align_corners, {scales_h, scales_w}, + /*antialias=*/true); } template < @@ -1500,7 +1891,9 @@ void cpu_upsample_genNd_backward_aa( interp_height, filter_fn, ymin, - ysize); + ysize, + /*antialias=*/true, + /*align_corners_delta=*/0.0); for (const auto ow : c10::irange(output_width)) { F::_compute_weights_aa( @@ -1512,7 +1905,9 @@ void cpu_upsample_genNd_backward_aa( interp_width, filter_fn, xmin, - xsize); + xsize, + /*antialias=*/true, + /*align_corners_delta=*/0.0); for (const auto c : c10::irange(begin, end)) { scalar_t grad_output_value = diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h new file mode 100644 index 000000000000..e8239cf6b86c --- /dev/null +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -0,0 +1,719 @@ +/* +The Python 
Imaging Library (PIL) is + + Copyright © 1997-2011 by Secret Labs AB + Copyright © 1995-2011 by Fredrik Lundh + +Pillow is the friendly PIL fork. It is + + Copyright © 2010-2022 by Alex Clark and contributors + +Like PIL, Pillow is licensed under the open source HPND License +*/ + +// This code is heavily inspired from PILLOW-SIMD's implementation: +// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c + +#pragma once +#ifdef CPU_CAPABILITY_AVX2 +// TODO: This file only supports AVX2. We could split the AVX kernels into +// smaller logical blocks in order to port them into the Vec.h logic. This would +// allow to support other vectorization architectures and perhaps also support +// the non-vectorized fallback (we'd need to make sure it's not slower than the +// current fallback). + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +namespace { + +static __m128i inline mm_cvtepu8_epi32(const uint32_t* C10_RESTRICT ptr) { + return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr)); +} + +// TODO: We may want to hard-code an unrolled version for the case where +// num_channels=3 to hint the compiler to vectorize this (looks at original +// PIL-SIMD's code). +at::Tensor unpack_rgb(const at::Tensor& packed_tensor) { + // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into + // RGBARGBARGBA format where A is hard-coded to 255. Each pixel is encoded + // into as 32bits. This generalizes to num_channels <= 4 and also works for + // non-channels_last tensors. + + const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + constexpr int rgba_size = 4; + auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte)); + uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr(); + + auto stride_i = packed_tensor.stride(2); + auto stride_j = packed_tensor.stride(0); + + for (const auto i : c10::irange(num_pixels)) { + for (const auto j : c10::irange(rgba_size)) { + unpacked[rgba_size * i + j] = (j < num_channels) ? 
packed[stride_i * i + stride_j * j] : 0; + } + } + return unpacked_tensor; +} + +void pack_rgb( + const at::Tensor& unpacked_tensor, // IN + const at::Tensor& packed_tensor // OUT +) { + constexpr int rgba_size = 4; + uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr(); + uint8_t* packed = (uint8_t*)packed_tensor.data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + auto packed_increment = packed_tensor.stride(2); + auto packed_stride = packed_tensor.stride(0); + + for (const auto i C10_UNUSED : c10::irange(num_pixels)) { + for (const auto j : c10::irange(num_channels)) { + packed[j * packed_stride] = unpacked[j]; + } + unpacked += rgba_size; + packed += packed_increment; + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint32_t* C10_RESTRICT lineOut0, + uint32_t* C10_RESTRICT lineOut1, + uint32_t* C10_RESTRICT lineOut2, + uint32_t* C10_RESTRICT lineOut3, + const uint32_t* C10_RESTRICT lineIn0, + const uint32_t* C10_RESTRICT lineIn1, + const uint32_t* C10_RESTRICT lineIn2, + const uint32_t* C10_RESTRICT lineIn3, + int xsize, + int* xbounds, + int16_t* kk, + int kmax, + int coefs_precision); + +void ImagingResampleHorizontalConvolution8u( + uint32_t* C10_RESTRICT lineOut, + const uint32_t* C10_RESTRICT lineIn, + int xsize, + int* xbounds, + int16_t* kk, + int kmax, + int coefs_precision); + +void ImagingResampleVerticalConvolution8u( + uint32_t* C10_RESTRICT lineOut, + const uint32_t* C10_RESTRICT imIn, + int xmin, + int xmax, + int16_t* k, + int coefs_precision, + int xin); + +void ImagingResampleHorizontal( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& horiz_indices_weights, + unsigned int horiz_weights_precision) { + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_horizontal) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + int yy; + + int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr()); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + auto xin = unpacked_input.size(2); + + std::vector bounds_vec(2 * xout, 0); + int* bounds = bounds_vec.data(); + + int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr(); + int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr(); + for (int i = 0; i < xout; i++) { + bounds[2 * i + 0] = idx_ptr_xmin[i]; + bounds[2 * i + 1] = idx_ptr_size[i]; + } + + uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr(); + uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr(); + + yy = 0; + for (; yy < yout - 3; yy += 4) { + ImagingResampleHorizontalConvolution8u4x( + unpacked_output_p + yy * xout, + unpacked_output_p + (yy + 1) * xout, + unpacked_output_p + (yy + 2) * xout, + unpacked_output_p + (yy + 3) * xout, + unpacked_input_p + yy * xin, + unpacked_input_p + (yy + 1) * xin, + unpacked_input_p + (yy + 2) * xin, + unpacked_input_p + (yy + 3) * xin, + xout, + bounds, + kk, + ksize, + (int)horiz_weights_precision); + } + for (; yy < yout; yy++) { + ImagingResampleHorizontalConvolution8u( + unpacked_output_p + yy * xout, + unpacked_input_p + yy * xin, + xout, + bounds, + kk, + ksize, + (int)horiz_weights_precision); + } +} + +void ImagingResampleVertical( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& vert_indices_weights, + unsigned int 
vert_weights_precision) { + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_vertical) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + int ymin, ymax; + int16_t* k = nullptr; + int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr()); + + int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr<int64_t>(); + int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr<int64_t>(); + + uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr(); + uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr(); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + + for (const auto yy : c10::irange(yout)) { + k = &kk[yy * ksize]; + + ymin = idx_ptr_xmin[yy]; + ymax = idx_ptr_size[yy]; + ImagingResampleVerticalConvolution8u( + unpacked_output_p + yy * xout, + unpacked_input_p, + ymin, + ymax, + k, + (int)vert_weights_precision, + xout); + } +} + +// This is the only public entry point in this file. It supports bilinear +// mode for uint8 dtype when C <= 4, with or without antialias. The +// implem is based on PIL-SIMD. +// Its equivalent implementation (fallback) for when AVX isn't supported or when +// C > 4 is separable_upsample_generic_Nd_kernel_impl(). There are a bunch of +// future improvements that can be made: look for the TODOs in this file. +// For details on how the weights are computed and how the multiplications are +// run on int (instead of float weights), see +// [ Weights computation for uint8_t and multiplication trick ] +// For details on how the AVX kernels are implemented, see +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +// See also [ Support for antialias=False as a subcase of antialias=True ] to +// learn more about how the antialias=False case is computed. The same holds +// here: all these kernels are general enough to handle an arbitrary number of +// weights, but when aa=False they could be optimized further.
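In a nutshell, the "multiplication trick" referenced above is plain fixed-point arithmetic: the float interpolation weights are scaled into int16 by a power of two, products are accumulated in int32 starting from a rounding term of `1 << (coefs_precision - 1)`, and the accumulator is shifted back by `coefs_precision` and saturated to uint8. The sketch below shows this for a single output value; it is illustrative only and not part of this header, the names and the hard-coded precision are made up, and the real weights come from `compute_indices_int16_weights_aa()`.

```
# Illustrative scalar sketch of the int16 fixed-point weight trick
# (one output value, one channel; the AVX kernels do this in parallel).
def quantize_weights(float_weights, precision=15):
    # Scale float weights (which sum to ~1.0) so they fit into int16.
    return [int(round(w * (1 << precision))) for w in float_weights]

def convolve_one_value(src_pixels, int_weights, precision=15):
    acc = 1 << (precision - 1)          # rounding term, like `initial` in the kernels
    for pix, w in zip(src_pixels, int_weights):
        acc += pix * w                  # int32 accumulation (what madd_epi16 does)
    out = acc >> precision              # like srai_epi32(..., coefs_precision)
    return max(0, min(255, out))        # like the packs/packus saturation

w = quantize_weights([0.25, 0.5, 0.25])
print(convolve_one_value([10, 20, 30], w))  # 20, same as the float convolution
```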
+template <typename scale_type, class F> +void upsample_avx_bilinear_uint8( + const at::Tensor& input, + const at::Tensor& output, + bool align_corners, + const scale_type& scales, + bool antialias) { + auto batch_size = input.size(0); + auto num_channels = input.size(1); + auto xin = input.size(3); + auto yin = input.size(2); + auto xout = output.size(3); + auto yout = output.size(2); + + if (xin == xout && yin == yout) { + output.copy_(input); + return; + } + + auto need_horizontal = xout != xin; + auto need_vertical = yout != yin; + + int ksize_horiz, ksize_vert; + std::vector<at::Tensor> horiz_indices_weights, vert_indices_weights; + unsigned int horiz_weights_precision, vert_weights_precision; + + if (need_horizontal) { + int interp_dim = 3; + std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) = + F::compute_indices_int16_weights_aa( + /*input_size=*/xin, + /*output_size=*/xout, + /*stride=*/1, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + if (need_vertical) { + int interp_dim = 2; + std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) = + F::compute_indices_int16_weights_aa( + /*input_size=*/yin, + /*output_size=*/yout, + /*stride=*/1, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + bool is_rgba = num_channels == 4 && input.is_contiguous(at::MemoryFormat::ChannelsLast); + + at::Tensor buffer_horiz, buffer_vert; + if (need_horizontal && !(is_rgba && !need_vertical)) { + buffer_horiz = at::empty({4, yin, xout}, input.options()); + } + if (need_vertical && !is_rgba) { + buffer_vert = at::empty({4, yout, xout}, input.options()); + } + + // TODO: The unpack / pack operations create a copy of the original input and + // output tensor. There should be a way to avoid these copies by instead + // modifying the low-level kernels. Or maybe at least avoid copying the entire + // tensors and just copy part of them (line by line). + for (const auto i : c10::irange(batch_size)) { + + at::Tensor unpacked_input = (is_rgba) ? input[i] : unpack_rgb(input[i]); + at::Tensor unpacked_output; + + if (need_horizontal) { + + at::Tensor unpacked_output_temp = (is_rgba && !need_vertical) ? output[i] : buffer_horiz; + + ImagingResampleHorizontal( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + unpacked_output = unpacked_input = unpacked_output_temp; + } + if (need_vertical) { + unpacked_output = (is_rgba) ?
output[i] : buffer_vert; + + ImagingResampleVertical( + unpacked_output, + unpacked_input, + ksize_vert, + vert_indices_weights, + vert_weights_precision); + } + + TORCH_INTERNAL_ASSERT(unpacked_output.defined()); + + if (!is_rgba) { + pack_rgb(unpacked_output, output[i]); + } + } +} + +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +void ImagingResampleHorizontalConvolution8u4x( + uint32_t* C10_RESTRICT lineOut0, + uint32_t* C10_RESTRICT lineOut1, + uint32_t* C10_RESTRICT lineOut2, + uint32_t* C10_RESTRICT lineOut3, + const uint32_t* C10_RESTRICT lineIn0, + const uint32_t* C10_RESTRICT lineIn1, + const uint32_t* C10_RESTRICT lineIn2, + const uint32_t* C10_RESTRICT lineIn3, + int xsize, + int* xbounds, + int16_t* kk, + int kmax, + int coefs_precision) { + int xmin, xmax, x; + int16_t* k; + + for (const auto xx : c10::irange(xsize)) { + xmin = xbounds[xx * 2 + 0]; + xmax = xbounds[xx * 2 + 1]; + k = &kk[xx * kmax]; + x = 0; + + __m256i sss0, sss1; + __m256i zero = _mm256_setzero_si256(); + __m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1)); + sss0 = initial; + sss1 = initial; + + for (; x < xmax - 3; x += 4) { + __m256i pix, mmk0, mmk1, source; + + mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]); + mmk1 = _mm256_set1_epi32(*(int32_t*)&k[x + 2]); + + source = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn0[x + xmin])), + _mm_loadu_si128((__m128i*)&lineIn1[x + xmin]), + 1); + // clang-format off + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0)); + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8, + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8)); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1)); + + source = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn2[x + xmin])), + _mm_loadu_si128((__m128i*)&lineIn3[x + xmin]), + 1); + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0)); + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8, + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8)); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1)); + } + + for (; x < xmax - 1; x += 2) { + __m256i pix, mmk; + + mmk = _mm256_set1_epi32(*(int32_t*)&k[x]); + + pix = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn0[x + xmin])), + _mm_loadl_epi64((__m128i*)&lineIn1[x + xmin]), + 1); + pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8( + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + + pix = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn2[x + xmin])), + _mm_loadl_epi64((__m128i*)&lineIn3[x + xmin]), + 1); + pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8( + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); + // clang-format on + } + + for (; x < xmax; x++) { + __m256i pix, mmk; + + // [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 + mmk = 
_mm256_set1_epi32(k[x]); + + // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0 + pix = _mm256_inserti128_si256( + _mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn0[x + xmin])), + mm_cvtepu8_epi32(&lineIn1[x + xmin]), + 1); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + + pix = _mm256_inserti128_si256( + _mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn2[x + xmin])), + mm_cvtepu8_epi32(&lineIn3[x + xmin]), + 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); + } + + sss0 = _mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + sss0 = _mm256_packs_epi32(sss0, zero); + sss1 = _mm256_packs_epi32(sss1, zero); + sss0 = _mm256_packus_epi16(sss0, zero); + sss1 = _mm256_packus_epi16(sss1, zero); + lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0)); + lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1)); + lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0)); + lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1)); + } +} + +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +void ImagingResampleHorizontalConvolution8u( + uint32_t* C10_RESTRICT lineOut, + const uint32_t* C10_RESTRICT lineIn, + int xsize, + int* xbounds, + int16_t* kk, + int kmax, + int coefs_precision) { + int xmin, xmax, x; + int16_t* k; + + for (const auto xx : c10::irange(xsize)) { + __m128i sss; + xmin = xbounds[xx * 2 + 0]; + xmax = xbounds[xx * 2 + 1]; + k = &kk[xx * kmax]; + x = 0; + + if (xmax < 8) { + sss = _mm_set1_epi32(1 << (coefs_precision - 1)); + } else { + // Lower part will be added to higher, use only half of the error + __m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2)); + + for (; x < xmax - 7; x += 8) { + __m256i pix, mmk, source; + __m128i tmp = _mm_loadu_si128((__m128i*)&k[x]); + __m256i ksource = + _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // clang-format off + source = _mm256_loadu_si256((__m256i*)&lineIn[x + xmin]); + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0, + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); + mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8( + 11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8, + 3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0)); + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk)); + + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8, + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8)); + mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8( + 15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12, + 7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4)); + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk)); + // clang-format on + } + + for (; x < xmax - 3; x += 4) { + __m256i pix, mmk, source; + __m128i tmp = _mm_loadl_epi64((__m128i*)&k[x]); + __m256i ksource = + _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + tmp = _mm_loadu_si128((__m128i*)&lineIn[x + xmin]); + source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // clang-format off + pix = _mm256_shuffle_epi8(source, _mm256_set_epi8( + -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8, + -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0)); + mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8( + 7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4, + 3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0)); + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk)); + // clang-format on + } + + 
sss = _mm_add_epi32( + _mm256_extracti128_si256(sss256, 0), + _mm256_extracti128_si256(sss256, 1)); + } + + for (; x < xmax - 1; x += 2) { + __m128i mmk = _mm_set1_epi32(*(int32_t*)&k[x]); + __m128i source = _mm_loadl_epi64((__m128i*)&lineIn[x + xmin]); + __m128i pix = _mm_shuffle_epi8( + source, + _mm_set_epi8(-1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0)); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + for (; x < xmax; x++) { + __m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]); + __m128i mmk = _mm_set1_epi32(k[x]); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + sss = _mm_srai_epi32(sss, coefs_precision); + sss = _mm_packs_epi32(sss, sss); + lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss)); + } +} + +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +void ImagingResampleVerticalConvolution8u( + uint32_t* C10_RESTRICT lineOut, + const uint32_t* C10_RESTRICT imIn, + int xmin, + int xmax, + int16_t* k, + int coefs_precision, + int xin) { + int x; + int xx = 0; + int xsize = xin; + + __m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1)); + __m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1)); + + for (; xx < xsize - 7; xx += 8) { + __m256i sss0 = initial_256; + __m256i sss1 = initial_256; + __m256i sss2 = initial_256; + __m256i sss3 = initial_256; + x = 0; + for (; x < xmax - 1; x += 2) { + __m256i source, source1, source2; + __m256i pix, mmk; + + // Load two coefficients at once + mmk = _mm256_set1_epi32(*(int32_t*)&k[x]); + + // Load 2 lines + // (__m256i *) &imIn->image32[x + xmin][xx] + source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx)); + // (__m256i *) &imIn->image32[x + 1 + xmin][xx] + source2 = + _mm256_loadu_si256((__m256i*)(imIn + (x + 1 + xmin) * xin + xx)); + + source = _mm256_unpacklo_epi8(source1, source2); + pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); + + source = _mm256_unpackhi_epi8(source1, source2); + pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk)); + pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk)); + } + for (; x < xmax; x += 1) { + __m256i source, source1, pix, mmk; + mmk = _mm256_set1_epi32(k[x]); + + // (__m256i *) &imIn->image32[x + xmin][xx]) + source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx)); + + source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256()); + pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); + + source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256()); + pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk)); + pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk)); + } + sss0 = _mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + sss2 = _mm256_srai_epi32(sss2, coefs_precision); + sss3 = _mm256_srai_epi32(sss3, coefs_precision); + + sss0 = _mm256_packs_epi32(sss0, sss1); + sss2 = _mm256_packs_epi32(sss2, 
sss3); + sss0 = _mm256_packus_epi16(sss0, sss2); + _mm256_storeu_si256((__m256i*)&lineOut[xx], sss0); + } + + for (; xx < xsize - 1; xx += 2) { + __m128i sss0 = initial; // left row + __m128i sss1 = initial; // right row + x = 0; + for (; x < xmax - 1; x += 2) { + __m128i source, source1, source2; + __m128i pix, mmk; + + // Load two coefficients at once + mmk = _mm_set1_epi32(*(int32_t*)&k[x]); + + // Load 2 lines + // (__m128i *) &imIn->image32[x + xmin][xx]) + source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx)); + // (__m128i *) &imIn->image32[x + 1 + xmin][xx] + source2 = _mm_loadl_epi64((__m128i*)(imIn + (x + 1 + xmin) * xin + xx)); + + source = _mm_unpacklo_epi8(source1, source2); + pix = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); + pix = _mm_unpackhi_epi8(source, _mm_setzero_si128()); + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); + } + for (; x < xmax; x += 1) { + __m128i source, source1, pix, mmk; + mmk = _mm_set1_epi32(k[x]); + + // (__m128i *) &imIn->image32[x + xmin][xx]); + source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx)); + + source = _mm_unpacklo_epi8(source1, _mm_setzero_si128()); + pix = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); + pix = _mm_unpackhi_epi8(source, _mm_setzero_si128()); + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); + } + sss0 = _mm_srai_epi32(sss0, coefs_precision); + sss1 = _mm_srai_epi32(sss1, coefs_precision); + + sss0 = _mm_packs_epi32(sss0, sss1); + sss0 = _mm_packus_epi16(sss0, sss0); + _mm_storel_epi64((__m128i*)&lineOut[xx], sss0); + } + + for (; xx < xsize; xx++) { + __m128i sss = initial; + x = 0; + for (; x < xmax - 1; x += 2) { + __m128i source, source1, source2; + __m128i pix, mmk; + + // Load two coefficients at once + mmk = _mm_set1_epi32(*(int32_t*)&k[x]); + + // Load 2 lines + // *(int *) &imIn->image32[x + xmin][xx] + source1 = _mm_cvtsi32_si128(*(int*)(imIn + (x + xmin) * xin + xx)); + // *(int *) &imIn->image32[x + 1 + xmin][xx] + source2 = _mm_cvtsi32_si128(*(int*)(imIn + (x + 1 + xmin) * xin + xx)); + + source = _mm_unpacklo_epi8(source1, source2); + pix = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + for (; x < xmax; x++) { + // &imIn->image32[x + xmin][xx] + __m128i pix = mm_cvtepu8_epi32(imIn + (x + xmin) * xin + xx); + __m128i mmk = _mm_set1_epi32(k[x]); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + sss = _mm_srai_epi32(sss, coefs_precision); + sss = _mm_packs_epi32(sss, sss); + lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss)); + } +} + +} // anonymous namespace +#endif // CPU_CAPABILITY_AVX2 diff --git a/test/test_nn.py b/test/test_nn.py index 2da67352a7f9..cb3197e528ab 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9367,67 +9367,67 @@ class TestNNDeviceType(NNTestCase): @parametrize_test("antialias", [True, False]) @parametrize_test("align_corners", [True, False]) @parametrize_test("mode", ["bilinear", "bicubic"]) + @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) @onlyNativeDeviceTypes - def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode): + def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format): # Forward AD does not support XLA because XLA tensors don't have storage check_forward_ad = torch.device(device).type != 'xla' kwargs = dict(mode=mode, align_corners=align_corners, 
antialias=antialias) - for memory_format in [torch.contiguous_format, torch.channels_last]: - # test float scale factor up & downsampling - for scale_factor in [0.5, 1.5, 2]: - in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() - out_size = int(math.floor(in_t.shape[-1] * scale_factor)) - with warnings.catch_warnings(record=True) as w: - out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) - expected_out = torch.ones(2, 3, out_size, out_size, device=device) - self.assertEqual(expected_out, out_t) - # Assert that memory format is carried through to the output - self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) - out_t.backward(torch.randn_like(out_t)) - self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) + # test float scale factor up & downsampling + for scale_factor in [0.5, 1.5, 2]: + in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() + out_size = int(math.floor(in_t.shape[-1] * scale_factor)) + with warnings.catch_warnings(record=True) as w: + out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) + expected_out = torch.ones(2, 3, out_size, out_size, device=device) + self.assertEqual(expected_out, out_t) + # Assert that memory format is carried through to the output + self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) + out_t.backward(torch.randn_like(out_t)) + self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) - if torch.device(device).type == 'cuda': - # Bilinear backward is nondeterministic because of atomicAdd usage - nondet_tol = 1e-5 - else: - nondet_tol = 0.0 + if torch.device(device).type == 'cuda': + # Bilinear backward is nondeterministic because of atomicAdd usage + nondet_tol = 1e-5 + else: + nondet_tol = 0.0 - input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() - gradcheck( - lambda x: F.interpolate(x, out_size, **kwargs), - [input], - check_forward_ad=check_forward_ad, nondet_tol=nondet_tol - ) - gradgradcheck( - lambda x: F.interpolate(x, out_size, **kwargs), - [input], - check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol - ) + input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() + gradcheck( + lambda x: F.interpolate(x, out_size, **kwargs), + [input], + check_forward_ad=check_forward_ad, nondet_tol=nondet_tol + ) + gradgradcheck( + lambda x: F.interpolate(x, out_size, **kwargs), + [input], + check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol + ) - # Assert that cpu and cuda give same results - if torch.device(device).type == 'cuda': - for shapes in [ - (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) - ]: - a_cuda = torch.randn( - *shapes, device=device - ).contiguous(memory_format=memory_format).requires_grad_() - a_cpu = a_cuda.detach().cpu().requires_grad_() + # Assert that cpu and cuda give same results + if torch.device(device).type == 'cuda': + for shapes in [ + (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) + ]: + a_cuda = torch.randn( + *shapes, device=device + ).contiguous(memory_format=memory_format).requires_grad_() + a_cpu = a_cuda.detach().cpu().requires_grad_() - with warnings.catch_warnings(record=True): - out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs) - out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs) + with warnings.catch_warnings(record=True): + out_cuda = F.interpolate(a_cuda, 
scale_factor=scale_factor, **kwargs) + out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs) - self.assertEqual(out_cpu, out_cuda.cpu()) + self.assertEqual(out_cpu, out_cuda.cpu()) - g_cuda = torch.randn_like(out_cuda) - g_cpu = g_cuda.cpu() + g_cuda = torch.randn_like(out_cuda) + g_cpu = g_cuda.cpu() - out_cuda.backward(g_cuda) - out_cpu.backward(g_cpu) + out_cuda.backward(g_cuda) + out_cpu.backward(g_cpu) - self.assertEqual(a_cuda.grad, a_cpu.grad) + self.assertEqual(a_cuda.grad, a_cpu.grad) @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format): @@ -9445,6 +9445,40 @@ class TestNNDeviceType(NNTestCase): t_out = F.interpolate(t_in, size=(2, 2), mode="bilinear", align_corners=False, antialias=True) self.assertEqual(expected_out, t_out) + @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) + @parametrize_test("antialias", [True, False]) + @parametrize_test("align_corners", [True, False]) + @parametrize_test("num_channels", [3, 5]) + @parametrize_test("output_size", [32, 600]) + def test_upsamplingBiLinear2d_consistency(self, device, memory_format, antialias, align_corners, num_channels, output_size): + if torch.device(device).type == "cuda": + raise SkipTest("CUDA implementation is not yet supporting uint8") + + mode = "bilinear" + # Check if Max Abs Error between resized input_uint8 and resized input_float is smaller than a tolerated value, e.g. 1.0 + input_ui8 = torch.randint(0, 256, size=(1, num_channels, 400, 400), dtype=torch.uint8, device=device) + input_ui8 = input_ui8.contiguous(memory_format=memory_format) + input_f32 = input_ui8.float() + + output_f32 = F.interpolate( + input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias + ) + output_ui8 = F.interpolate( + input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias + ) + + mae_tol = 0.5 + max_abs_err_tol = 1.0 + num_wrong_pixels_tol = 5 + + abs_diff = torch.abs(output_f32.round() - output_ui8.float()) + mae = torch.mean(abs_diff) + max_abs_err = torch.max(abs_diff) + num_wrong_pixels = (abs_diff > max_abs_err_tol).sum() + self.assertTrue(mae < mae_tol, msg=f"mae={mae}") + self.assertTrue(max_abs_err < max_abs_err_tol + 1e-5, msg=f"max ae={max_abs_err}") + self.assertTrue(num_wrong_pixels < num_wrong_pixels_tol, msg=f"num_wrong_pixels={num_wrong_pixels}") + def test_upsamplingBicubic2d_correctness(self, device): # test output against known input: align_corners=False result must match opencv in_t = torch.arange(8., device=device).view(1, 2, 2, 2) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 138c0b67c951..66ff2938d675 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -12118,7 +12118,7 @@ op_db: List[OpInfo] = [ supports_fwgrad_bwgrad=True, supports_autograd=True, supports_forward_ad=True, - dtypes=floating_types_and(torch.bfloat16), + dtypes=floating_types_and(torch.uint8, torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.half), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'), @@ -12184,7 +12184,7 @@ op_db: List[OpInfo] = [ supports_autograd=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - dtypes=floating_types_and(torch.bfloat16), + 
dtypes=floating_types_and(torch.uint8, torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.half), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=partial(sample_inputs_upsample, 'bilinear'),
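For reference, the consistency check performed by the new `test_upsamplingBiLinear2d_consistency` test above can be reproduced standalone roughly as follows. This is an illustrative sketch, not part of the patch; the shapes, output size and interpolation settings below are arbitrary.

```
import torch
import torch.nn.functional as F

# Resize a uint8 image through the native uint8 path and through the float
# path, then compare the two results like the test does.
x_u8 = torch.randint(0, 256, (1, 3, 400, 400), dtype=torch.uint8)
x_u8 = x_u8.contiguous(memory_format=torch.channels_last)
x_f32 = x_u8.float()

out_u8 = F.interpolate(x_u8, size=(224, 224), mode="bilinear",
                       align_corners=False, antialias=True)
out_f32 = F.interpolate(x_f32, size=(224, 224), mode="bilinear",
                        align_corners=False, antialias=True)

abs_diff = torch.abs(out_f32.round() - out_u8.float())
# Per the test's tolerances: mean abs error below 0.5, max abs error around 1.
print(abs_diff.mean().item(), abs_diff.max().item())
```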