Add uint8 support for interpolate for CPU images (#90771)
Joint work with @vfdev-5. This PR introduces native uint8 support for `interpolate()`, for `bilinear` ~and `bicubic`~ modes for CPU images (`mode=nearest[_exact]` was already supported). On a typical torchvision training job on ImageNet, the speedups are ~4X when AVX2 is supported, comparing the native uint8 path (this PR) against torchvision's current `Resize()`:

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size output_size channels_last AA mode num_threads speed-up float vs uint8 (this PR)
(1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=1 4X 2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=1 2.1X 1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=1 3X 2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=1 4X 2.4ms vs 0.6ms

(Note: we removed bicubic support for now)
(1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=1 4X 2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=1 5X 3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=1 3X 2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=1 4X 2.8ms vs 0.7ms
```

There is still room for further speed-ups (see TODOs in the code).

#### More benchmark details

with AVX2 support - speedups typically range from 1.5X to 10X. A few edge cases are slower; it is worth investigating why.

<details>

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size output_size channels_last AA mode num_threads speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=1 5X 1.1ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=1 5X 1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=1 2.8X 0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=1 7X 1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=1 5X 1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=1 12X 2.9ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=1 3X 0.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=1 7X 1.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=2 2.6X 0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=2 2.8X 0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=2 1.7X 0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=2 1.4X 0.3ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=2 2.7X 0.7ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=2 7X 1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=2 1.8X 0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=2 4X 1.0ms vs 0.2ms
(1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=1 4X 2.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=1 3.0X 1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=1 3X 1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=1 4X 2.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=1 4X 2.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=1 7X 4.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=1
3X 2.1ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=1 4X 2.6ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=2 2.7X 1.6ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=2 2.6X 1.5ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=2 2.1X 1.2ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=2 1.6X 0.9ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=2 2.8X 1.7ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=2 5X 2.8ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=2 2.3X 1.4ms vs 0.6ms (1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=2 3X 1.9ms vs 0.6ms (1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=1 4X 26.6ms vs 6.7ms (1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=1 4X 23.9ms vs 6.8ms (1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=1 2.5X 16.8ms vs 6.8ms (1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=1 5X 33.1ms vs 6.8ms (1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=1 4X 25.9ms vs 7.3ms (1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=1 8X 59.6ms vs 7.3ms (1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=1 1.9X 14.3ms vs 7.4ms (1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=1 5X 35.4ms vs 7.3ms (1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=2 2.0X 13.6ms vs 6.8ms (1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=2 2.2X 14.8ms vs 6.7ms (1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=2 1.3X 8.8ms vs 6.9ms (1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=2 1.2X 8.4ms vs 6.8ms (1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=2 1.8X 12.8ms vs 7.3ms (1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=2 4X 32.1ms vs 7.2ms (1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=2 1.4X 10.1ms vs 7.3ms (1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=2 2.9X 20.9ms vs 7.3ms (1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=1 1.4X 0.5ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=1 0.7X 0.2ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=1 1.3X 0.4ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=1 1.4X 0.4ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=1 2.1X 0.7ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=1 1.3X 0.4ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=1 1.9X 0.6ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=1 1.0X 0.3ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=2 1.0X 0.3ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=2 0.6X 0.2ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=2 0.8X 0.3ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=2 1.4X 0.4ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=2 1.4X 0.5ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=2 1.2X 0.4ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=2 1.2X 0.4ms vs 0.4ms (1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=2 0.9X 0.3ms vs 0.3ms (1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=1 4X 2.6ms vs 
0.7ms (1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=1 2.1X 1.3ms vs 0.6ms (1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=1 3X 2.1ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=1 4X 2.4ms vs 0.6ms (1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=1 4X 2.9ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=1 5X 3.1ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=1 3X 2.4ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=1 4X 2.8ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=2 1.5X 1.0ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=2 1.2X 0.8ms vs 0.6ms (1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=2 2.3X 1.5ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=2 1.9X 1.2ms vs 0.6ms (1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=2 1.6X 1.2ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=2 4X 2.4ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=2 2.4X 1.6ms vs 0.7ms (1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=2 2.8X 1.8ms vs 0.6ms (1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=1 2.1X 12.8ms vs 6.1ms (1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=1 0.6X 3.8ms vs 5.9ms (1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=1 1.2X 7.1ms vs 6.1ms (1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=1 1.9X 11.0ms vs 5.9ms (1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=1 2.0X 12.6ms vs 6.4ms (1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=1 1.0X 6.1ms vs 6.0ms (1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=1 1.8X 11.3ms vs 6.4ms (1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=1 0.8X 4.6ms vs 6.0ms (1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=2 1.6X 9.3ms vs 6.0ms (1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=2 0.3X 2.0ms vs 5.8ms (1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=2 1.2X 7.2ms vs 6.0ms (1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=2 0.3X 1.6ms vs 5.8ms (1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=2 1.1X 7.1ms vs 6.5ms (1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=2 0.6X 3.3ms vs 5.9ms (1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=2 0.9X 5.9ms vs 6.3ms (1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=2 0.4X 2.4ms vs 5.9ms ``` </details> without AVX2 support - no significant speed-up, but there are various possible improvements (see TODOs) <details> ``` AA = antialias float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does) input_size output_size channels_last AA mode num_threads speed-up float vs uint8 (this PR) (1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=1 0.9X 1.5ms vs 1.6ms (1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=1 0.9X 1.5ms vs 1.6ms (1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=1 0.8X 0.9ms vs 1.1ms (1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=1 1.5X 1.7ms vs 1.1ms (1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=1 0.9X 1.6ms vs 1.8ms (1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=1 2.1X 3.9ms vs 1.9ms (1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=1 0.8X 1.1ms vs 
1.4ms (1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=1 1.7X 2.4ms vs 1.5ms (1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=2 0.9X 0.8ms vs 0.8ms (1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=2 0.9X 0.8ms vs 0.8ms (1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=2 0.9X 0.5ms vs 0.6ms (1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=2 0.7X 0.5ms vs 0.7ms (1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=2 0.9X 0.9ms vs 1.0ms (1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=2 2.1X 2.0ms vs 1.0ms (1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=2 0.8X 0.6ms vs 0.8ms (1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=2 1.7X 1.3ms vs 0.8ms (1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=1 1.0X 3.0ms vs 3.0ms (1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=1 1.0X 2.8ms vs 2.9ms (1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=1 1.0X 2.3ms vs 2.2ms (1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=1 1.4X 3.3ms vs 2.3ms (1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=1 1.0X 3.5ms vs 3.5ms (1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=1 1.7X 6.1ms vs 3.5ms (1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=1 0.9X 2.6ms vs 2.9ms (1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=1 1.4X 4.2ms vs 2.9ms (1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=2 1.0X 1.7ms vs 1.7ms (1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=2 0.9X 1.6ms vs 1.8ms (1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=2 0.9X 1.3ms vs 1.4ms (1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=2 0.7X 1.1ms vs 1.6ms (1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=2 1.0X 2.0ms vs 2.0ms (1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=2 1.7X 3.2ms vs 1.9ms (1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=2 0.8X 1.5ms vs 1.9ms (1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=2 1.2X 2.3ms vs 1.9ms (1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=1 1.1X 34.7ms vs 32.4ms (1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=1 1.0X 31.2ms vs 32.4ms (1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=1 1.0X 23.5ms vs 22.7ms (1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=1 1.9X 42.5ms vs 22.7ms (1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=1 0.9X 33.9ms vs 37.4ms (1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=1 2.2X 84.0ms vs 37.5ms (1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=1 1.0X 28.4ms vs 28.8ms (1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=1 2.0X 56.7ms vs 28.8ms (1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=2 1.1X 17.5ms vs 16.4ms (1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=2 1.1X 17.7ms vs 16.4ms (1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=2 0.8X 8.8ms vs 11.4ms (1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=2 1.0X 11.1ms vs 11.4ms (1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=2 1.1X 19.9ms vs 18.8ms (1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=2 2.3X 42.5ms vs 18.7ms (1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=2 1.0X 14.1ms vs 14.5ms (1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=2 2.0X 28.4ms vs 14.5ms (1, 3, 224, 224) -> (64, 64) True True 
bilinear num_threads=1 1.0X 0.6ms vs 0.6ms (1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=1 0.7X 0.3ms vs 0.4ms (1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=1 0.9X 0.5ms vs 0.6ms (1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=1 1.7X 0.6ms vs 0.4ms (1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=1 1.0X 0.8ms vs 0.8ms (1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=1 1.1X 0.5ms vs 0.5ms (1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=1 0.9X 0.7ms vs 0.8ms (1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=1 0.9X 0.4ms vs 0.4ms (1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=2 1.0X 0.4ms vs 0.4ms (1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=2 0.8X 0.2ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=2 0.9X 0.3ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=2 1.3X 0.3ms vs 0.2ms (1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=2 1.0X 0.5ms vs 0.5ms (1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=2 1.3X 0.4ms vs 0.3ms (1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=2 0.9X 0.5ms vs 0.5ms (1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=2 1.2X 0.3ms vs 0.3ms (1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=1 0.8X 2.1ms vs 2.5ms (1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=1 0.7X 1.6ms vs 2.4ms (1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=1 1.2X 2.4ms vs 2.1ms (1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=1 1.3X 2.6ms vs 2.0ms (1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=1 1.1X 3.4ms vs 3.0ms (1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=1 1.7X 4.8ms vs 2.8ms (1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=1 1.1X 2.9ms vs 2.7ms (1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=1 1.4X 3.5ms vs 2.4ms (1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=2 0.9X 1.2ms vs 1.3ms (1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=2 1.3X 1.6ms vs 1.2ms (1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=2 0.8X 0.9ms vs 1.1ms (1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=2 1.3X 1.3ms vs 1.0ms (1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=2 1.4X 2.2ms vs 1.6ms (1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=2 1.9X 2.8ms vs 1.5ms (1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=2 0.8X 1.1ms vs 1.4ms (1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=2 1.7X 2.1ms vs 1.3ms (1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=1 1.0X 10.0ms vs 9.9ms (1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=1 0.7X 4.6ms vs 6.2ms (1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=1 0.9X 9.1ms vs 9.8ms (1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=1 1.7X 9.4ms vs 5.7ms (1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=1 1.0X 15.2ms vs 14.8ms (1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=1 1.0X 7.6ms vs 7.5ms (1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=1 0.9X 13.3ms vs 14.4ms (1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=1 0.8X 5.9ms vs 7.0ms (1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=2 1.2X 6.0ms vs 5.2ms (1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=2 0.7X 2.3ms vs 3.2ms (1, 3, 1024, 1024) -> (256, 256) False True 
bilinear num_threads=2 1.0X 4.8ms vs 5.0ms (1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=2 0.7X 1.9ms vs 2.9ms (1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=2 1.6X 12.3ms vs 7.5ms (1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=2 1.0X 3.9ms vs 3.9ms (1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=2 1.0X 7.0ms vs 7.3ms (1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=2 0.9X 3.0ms vs 3.5ms ``` </details> Benchmark code <details> ```py import operator_benchmark as op_bench import torch """Microbenchmarks for interpolate operator.""" class InterpolateBenchmark(op_bench.TorchBenchmarkBase): def init(self, input_size, output_size, channels_last=False, mode='linear', antialias=False, dtype=torch.float): input_image = torch.randint(0, 256, size=input_size, dtype=torch.uint8, device='cpu') if channels_last: input_image = input_image.contiguous(memory_format=torch.channels_last) self.inputs = { "input_image": input_image, "output_size": output_size, "mode": mode, "antialias": antialias, "dtype":dtype, } self.set_module_name("interpolate") def forward(self, input_image, output_size, mode, antialias, dtype): if dtype == torch.float: input_image = input_image.float() out = torch.nn.functional.interpolate(input_image, size=output_size, mode=mode, align_corners=False, antialias=antialias) if dtype == torch.float: out = out.round().clamp(min=0, max=256).to(torch.uint8) def make_config(): sizes = ( ((224, 224), (64, 64)), ((270, 268), (224, 224)), ((256, 256), (1024, 1024)), ) attrs = [] for (HW1, HW2) in sizes: attrs.append([(1, 3, *HW1), HW2]) # 3 channels # attrs.append([(1, 1, *HW1), HW2]) # 1 channel attrs.append([(1, 3, *HW2), HW1]) # 3 channels # attrs.append([(1, 1, *HW2), HW1]) # 1 channel config = op_bench.config_list( attr_names=["input_size", "output_size"], attrs=attrs, cross_product_configs={ 'channels_last': [True, False], 'mode': ["bilinear", "bicubic"], 'antialias': [True, False], # 'dtype': [torch.float, torch.uint8] # 'dtype': [torch.uint8] 'dtype': [torch.float] }, tags=["short"], ) return config config = make_config() op_bench.generate_pt_test(config, InterpolateBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main() ``` ```py import re import argparse parser = argparse.ArgumentParser() parser.add_argument("f1", nargs="?", default="main") parser.add_argument("f2", nargs="?", default="new") args = parser.parse_args() with open(args.f1) as f: main = f.readlines() with open(args.f2) as f: new = f.readlines() out = [] for main_line, new_line in zip(main, new): # num_threads=1 # TODO: remove if main_line.startswith("num_threads="): num_threads = int(main_line.split("=")[-1]) if main_line.startswith("# Input"): deets = f"{main_line.strip()}, {num_threads=}" if main_line.startswith("Forward"): main_time = float(main_line.split()[-1]) new_time = float(new_line.split()[-1]) ratio = main_time / new_time fmt = ".1f" if ratio < 3 else ".0f" improv = f"{ratio:{fmt}}X" time_fmt = ",.3f" if new_time < 100 else ",.1f" deets = deets.strip().replace("# Input: ", "") deets = deets.replace(": ", "=") deets = deets.replace("input_size=", "") deets = deets.replace(", output_size=", " -> ") deets = deets.replace("dtype=torch.", "") deets = deets.replace("mode=", "") deets = deets.replace("antialias=", "") deets = deets.replace("channels_last=", "") # deets = deets.replace("channels_last=True, ", "") split = deets.split(",") # size = ','.join(split[:-3]) # mode, dtype, threads = split[-3:] # deets = 
f"{size:<30} {mode:<15} {dtype:<10} {threads:<15}" size = ','.join(split[:-5]) channels_last, mode, antialias, dtype, threads= split[-5:] deets = f"{size:<33} {channels_last:<7} {antialias:<7} {mode:<10} {threads:<15}" l = f"{deets} {improv:<5} {main_time / 1000:{time_fmt}}ms vs {new_time / 1000:{time_fmt}}ms" out.append(l) def key(s): # s = ''.join(s.split()[1:]) # remove "N.nX" part num_threads = (int(re.findall(r"num_threads=(\d+)", s)[0]),) input_shape, output_shape = re.findall("\(.*?\)", s) input_shape = input_shape[1:-1] # remove parenthesis input_HW = tuple(int(x) for x in input_shape.split(",")[-2:]) input_C = (-int(input_shape.split(",")[1]),) output_HW = tuple(int(x) for x in output_shape[1:-1].split(",")) is_downsample = (output_HW[0] < input_HW[0],) if "linear" in s: mode = "linear" elif "nearest-exact" in s: mode = "nearest-exact" else: # assert "nearest" in s mode = "nearest" mode = (mode,) return is_downsample + input_HW + output_HW + num_threads + input_C + mode for i, l in enumerate(sorted(out, key=key)): if i % 8 == 0: print() # if i % 10 == 0 and i % 40 != 0: # print() # if i % 40 == 0: # print("-" * 100) print(l) ``` </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/90771 Approved by: https://github.com/peterbell10, https://github.com/ngimel
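For context, here is a minimal sketch (shapes and values are illustrative only) of the two paths the benchmarks above compare: the new native uint8 path versus the float round-trip that `Resize()` had to do before this PR:

```py
import torch
import torch.nn.functional as F

# One 3-channel uint8 image in channels_last memory format (illustrative shape).
img_u8 = torch.randint(0, 256, (1, 3, 270, 268), dtype=torch.uint8)
img_u8 = img_u8.contiguous(memory_format=torch.channels_last)

# New path (this PR): interpolate operates directly on the uint8 tensor.
out_u8 = F.interpolate(img_u8, size=(224, 224), mode="bilinear",
                       align_corners=False, antialias=True)

# Old path (the "float" column): uint8 -> float -> interpolate -> round -> clamp -> uint8.
out_float = F.interpolate(img_u8.float(), size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)
out_float = out_float.round().clamp(0, 255).to(torch.uint8)

# The two paths should agree up to small rounding differences.
print((out_u8.int() - out_float.int()).abs().max())
```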
committed by PyTorch MergeBot
parent 782e4f5c02
commit 544c04f2df

NOTICE: 38 changed lines
@@ -416,3 +416,41 @@ derivation and reference the following license:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=======================================================================
PILLOW-SIMD Software License
=======================================================================

Code derived from implementations in PILLOW-SIMD should mention its derivation
and reference the following license:

The Python Imaging Library (PIL) is

Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh

Pillow is the friendly PIL fork. It is

Copyright © 2010-2022 by Alex Clark and contributors

Like PIL, Pillow is licensed under the open source HPND License:

By obtaining, using, and/or copying this software and/or its associated
documentation, you agree that you have read, understood, and will comply
with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appears in all copies, and that
both that copyright notice and this permission notice appear in supporting
documentation, and that the name of Secret Labs AB or the author not be
used in advertising or publicity pertaining to distribution of the software
without specific, written prior permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

@@ -8,6 +8,7 @@
#include <ATen/native/UpSample.h>
#include <ATen/native/cpu/utils.h>
#include <c10/util/irange.h>
#include <ATen/native/cpu/UpSampleKernelAVXAntialias.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -22,12 +23,53 @@ namespace {

using scale_t = std::vector<c10::optional<double>>;

// TODO: this file could benefit from a global renaming of its functions /
// classes and terms, as well as from adding more comments. In particular:
// - It's not obvious that despite their names (and the file name), all these
// kernels don't just do upsampling: they do general interpolation, i.e. they
// also all support downscaling.
// - the term "horizontal" or "within dims" or "contiguous dim" refers to the
// last dimension.
// It's not specific to 2D images and applies to 3D (and 1D??) inputs as well.
// Similarly "vertical" or "across dims" refers to all dims that aren't the
// last one. In other kernels these are also referred to as "zero-stride" and
// "non-zero-stride" - we should unify all this.
// - the terms "zero-stride" and "non-zero strides" refer to the weights and
// indices, not to the contiguity of input or output
// - It's not always clear which kernel is vectorized and which one isn't.
// - The functions like _use_vectorized_kernel_cond() should be renamed and
// their description updated, because they're not the only "fork" in the
// code-path where a choice is made between a vectorized kernel vs a
// non-vectorized one. See e.g. upsample_bilinear2d_kernel_impl() where we
// already make a similar check, before the one in
// _use_vectorized_kernel_cond().
// - It's not always clear which code is part of a "separable interpolation"
// code-path.
// - Some names need to be more specific. For example
// "cpu_upsample_generic_aa()" looks like a super generic name, but the function
// is instead fairly specific - we need to make that clearer.
// - Some functions have a "aa" suffix but it doesn't mean that they only
// support antialias. Some of them also support antialias=False now.
// - Various comments are outdated. Case in point: the one just below about the
// `Interpolate` struct being used for cpu_upsample_linear:
// cpu_upsample_linear doesn't exist anymore, and these structs are used for
// various modes, *not* just linear.
// - It'd be useful to document how interpolation works in general, and in particular state explicitly:
// - that the weights and indices across a given dimension are the same for
// all pixels (hence the benefit of pre-computing them)
// - that it can be "separated", i.e. we can do the horizontal pass and the
// vertical pass independently (and that some kernels are written this way,
// while some aren't.)
// - we can probably remove the template over index_t, because it's always
// hard-coded as int64_t


// Helper structs and methods for cpu_upsample_linear
//
// Interpolation methods that used below are separable, and as such we can compute the interpolation
// independently per dimension in a recursive way. Please, refer to #10482 for more context.
//
// Linear Interpolation structure to compute output value in n-dimensional case.
// Interpolation structure to compute output value in n-dimensional case.
// - recursively compute interpolated output for each dimension
// - we rely a lot on compiler's code optimization such that implemented operations
// can be automatically factorized and vectorized using SSE and AVX2
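To make the separability point in the comments above concrete, here is a small standalone sketch (sizes chosen arbitrarily, not part of the diff) checking that a 2D antialiased bilinear resize matches a horizontal pass followed by a vertical pass, up to floating-point rounding:

```py
import torch
import torch.nn.functional as F

x = torch.rand(1, 3, 32, 48)

# Direct 2D resize.
direct = F.interpolate(x, size=(16, 24), mode="bilinear",
                       align_corners=False, antialias=True)

# Two single-dimension passes: resize the width first, then the height.
horizontal = F.interpolate(x, size=(32, 24), mode="bilinear",
                           align_corners=False, antialias=True)
two_pass = F.interpolate(horizontal, size=(16, 24), mode="bilinear",
                         align_corners=False, antialias=True)

torch.testing.assert_close(direct, two_pass)
```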
@ -255,48 +297,129 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename index_t>
|
||||
static inline void basic_loop_aa_single_dim_zero_strides(
|
||||
template <typename scalar_t>
|
||||
static inline void basic_loop_aa_vertical(
|
||||
char** data,
|
||||
const int64_t* strides,
|
||||
int64_t n) {
|
||||
int64_t n,
|
||||
unsigned int weights_precision) {
|
||||
char* dst = data[0];
|
||||
char* src = data[1];
|
||||
// index stride is constant for the given dimension
|
||||
const index_t ids_stride = *(index_t*)&data[2 + 2][0];
|
||||
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
|
||||
|
||||
for (const auto i : c10::irange(n)) {
|
||||
*(scalar_t*)&dst[i * strides[0]] =
|
||||
interpolate_aa_single_dim_zero_strides<scalar_t, index_t>(
|
||||
interpolate_aa_single_dim_zero_strides<scalar_t, int64_t>(
|
||||
src + i * strides[1], &data[2], ids_stride);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename index_t>
|
||||
static inline void basic_loop_aa_single_dim_nonzero_strides(
|
||||
template <>
|
||||
inline void basic_loop_aa_vertical<uint8_t>(
|
||||
char** data,
|
||||
const int64_t* strides,
|
||||
int64_t n) {
|
||||
int64_t n,
|
||||
unsigned int weights_precision) {
|
||||
// See Note [ Weights computation for uint8_t and multiplication trick ]
|
||||
char* dst = data[0];
|
||||
char* src = data[1];
|
||||
|
||||
// index stride is constant for the given dimension
|
||||
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
|
||||
const int64_t ids_size = *(int64_t*)&data[2 + 1][0];
|
||||
const int64_t ids_min = *(int64_t*)&data[2 + 0][0];
|
||||
|
||||
int64_t i = 0;
|
||||
|
||||
for (; i<n; i++) {
|
||||
|
||||
char* src_min = src + i * strides[1] + ids_min;
|
||||
|
||||
uint8_t t = *(uint8_t*)&src_min[0];
|
||||
int64_t wts_idx = *(int64_t*)&data[2 + 4][0];
|
||||
int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
|
||||
int16_t wts = wts_ptr[0];
|
||||
|
||||
// Intermediate computations are using integer type
|
||||
int output = 1 << (weights_precision - 1); // accounts for the +0.5 part
|
||||
output += t * wts;
|
||||
for (const auto j : c10::irange(1, ids_size)) {
|
||||
wts = wts_ptr[j];
|
||||
t = *(uint8_t*)&src_min[j * ids_stride];
|
||||
output += t * wts;
|
||||
}
|
||||
*(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
static inline void basic_loop_aa_horizontal(
|
||||
char** data,
|
||||
const int64_t* strides,
|
||||
int64_t n,
|
||||
unsigned int weights_precision) {
|
||||
char* dst = data[0];
|
||||
char* src = data[1];
|
||||
// index stride is constant for the given dimension
|
||||
const index_t ids_stride = *(index_t*)&data[2 + 2][0];
|
||||
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
|
||||
|
||||
if (strides[1] == 0) {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
*(scalar_t*)&dst[i * strides[0]] =
|
||||
interpolate_aa_single_dim<scalar_t, index_t>(
|
||||
interpolate_aa_single_dim<scalar_t, int64_t>(
|
||||
src, &data[2], &strides[2], i, ids_stride);
|
||||
}
|
||||
} else {
|
||||
for (const auto i : c10::irange(n)) {
|
||||
*(scalar_t*)&dst[i * strides[0]] =
|
||||
interpolate_aa_single_dim<scalar_t, index_t>(
|
||||
interpolate_aa_single_dim<scalar_t, int64_t>(
|
||||
src + i * strides[1], &data[2], &strides[2], i, ids_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void basic_loop_aa_horizontal<uint8_t>(
|
||||
char** data,
|
||||
const int64_t* strides,
|
||||
int64_t n,
|
||||
unsigned int weights_precision) {
|
||||
// See Note [ Weights computation for uint8_t and multiplication trick ]
|
||||
char* dst = data[0];
|
||||
char* src = data[1];
|
||||
// index stride is constant for the given dimension
|
||||
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
|
||||
|
||||
int64_t i = 0;
|
||||
|
||||
// Here we are implementing data interpolation within the same line (vs between the lines)
|
||||
// output[x, y] = input[xmin[x], y] * W[x] + input[xmin[x] + 1, y] * W[x + 1] + ... + input[xmin[x] + xsize, y] * W[x + xsize]
|
||||
|
||||
for (; i<n; i++) {
|
||||
|
||||
int64_t ids_min = *(int64_t*)&data[2 + 0][i * strides[2 + 0]];
|
||||
int64_t ids_size = *(int64_t*)&data[2 + 1][i * strides[2 + 1]];
|
||||
|
||||
char* src_min = src + i * strides[1] + ids_min;
|
||||
|
||||
uint8_t t = *(uint8_t*)&src_min[0];
|
||||
int64_t wts_idx = *(int64_t*)&data[2 + 4][i * strides[2 + 4]];
|
||||
int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
|
||||
int16_t wts = wts_ptr[0];
|
||||
|
||||
// Intermediate computations are using integer type
|
||||
int output = 1 << (weights_precision - 1); // accounts for the +0.5 part
|
||||
output += t * wts;
|
||||
for (const auto j : c10::irange(1, ids_size)) {
|
||||
wts = wts_ptr[j];
|
||||
t = *(uint8_t*)&src_min[j * ids_stride];
|
||||
output += t * wts;
|
||||
}
|
||||
*(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
|
||||
}
|
||||
}
|
||||
|
||||
// Generic upsampling computation method using TensorIterator for Nd case.
|
||||
// Supports: nearest, linear, cubic modes with interp_size template argument: 1, 2, 4
|
||||
//
|
||||
@ -621,21 +744,23 @@ struct HelperInterpBase {
|
||||
template <typename scalar_t, typename aa_filter_fn_t>
|
||||
static inline void _compute_weights_aa(
|
||||
const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support,
|
||||
scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn,
|
||||
int64_t& xmin, int64_t& xsize
|
||||
scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn,
|
||||
int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta
|
||||
) {
|
||||
|
||||
scalar_t center = scale * (i + 0.5);
|
||||
// align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false
|
||||
// is 0.0 otherwise
|
||||
scalar_t center = scale * (i + 0.5 - align_corners_delta);
|
||||
scalar_t total_w = 0.0;
|
||||
scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
|
||||
scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0;
|
||||
xmin = std::max(
|
||||
static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0));
|
||||
xsize = std::min(static_cast<int64_t>(center + support + 0.5), input_size) -
|
||||
xmin;
|
||||
static_cast<int64_t>(center - support + 0.5 + align_corners_delta), static_cast<int64_t>(0));
|
||||
xsize = std::min(
|
||||
static_cast<int64_t>(center + support + 0.5 + align_corners_delta), input_size) - xmin;
|
||||
|
||||
int64_t j = 0;
|
||||
for (; j < xsize; j++) {
|
||||
scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale);
|
||||
scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale);
|
||||
wt_ptr[j] = w;
|
||||
total_w += w;
|
||||
}
|
||||
@ -644,23 +769,39 @@ struct HelperInterpBase {
|
||||
wt_ptr[j] /= total_w;
|
||||
}
|
||||
}
|
||||
for (; j < interp_size; j++) {
|
||||
for (; j < max_interp_size; j++) {
|
||||
wt_ptr[j] = static_cast<scalar_t>(0.0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename aa_filter_fn_t>
|
||||
static inline std::vector<Tensor> _compute_indices_weights_aa(
|
||||
// Note [ Support for antialias=False as a subcase of antilias=True ]
|
||||
// This function was originally written with the hard assumption that
|
||||
// antialias=True (hence the aa in the name). It was later extended to support
|
||||
// antialias=False. The only difference between aa and no-aa is in how the
|
||||
// weights and indices are computed (and their number). In aa their number is
|
||||
// variable but with no-aa, they're fixed to interp_size. The same "filters"
|
||||
// can be used otherwise. HOWEVER, support for antialias=False here may not be
|
||||
// optimally optimized: the code assumes an arbitrary number of weights and
|
||||
// indices, but this can be optimized further when aa=False since we know
|
||||
// their actual dimensions.
|
||||
template <typename scalar_t, typename aa_filter_fn_t, int weight_index_stride=sizeof(scalar_t)>
|
||||
static inline std::tuple<std::vector<Tensor>, int> _compute_indices_weights_aa(
|
||||
int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
|
||||
int64_t reshape_dim, scalar_t scale,
|
||||
int interp_size, aa_filter_fn_t aa_filter_fn
|
||||
int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta
|
||||
) {
|
||||
|
||||
std::vector<Tensor> output;
|
||||
|
||||
scalar_t support =
|
||||
(scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
|
||||
interp_size = (int)ceilf(support) * 2 + 1;
|
||||
scalar_t support;
|
||||
int max_interp_size;
|
||||
if (antialias) {
|
||||
support = (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
|
||||
max_interp_size = (int) std::ceil(support) * 2 + 1;
|
||||
} else {
|
||||
support = interp_size * 0.5;
|
||||
max_interp_size = interp_size;
|
||||
}
|
||||
|
||||
auto new_shape = std::vector<int64_t>(ndims, 1);
|
||||
new_shape[reshape_dim] = output_size;
|
||||
@ -675,7 +816,7 @@ struct HelperInterpBase {
|
||||
|
||||
{
|
||||
// Weights
|
||||
new_shape[reshape_dim] = output_size * interp_size;
|
||||
new_shape[reshape_dim] = output_size * max_interp_size;
|
||||
auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType<scalar_t>()));
|
||||
auto strides = wts.strides().vec();
|
||||
strides[reshape_dim] = 0;
|
||||
@ -701,20 +842,130 @@ struct HelperInterpBase {
|
||||
input_size,
|
||||
scale,
|
||||
support,
|
||||
wt_ptr + i * interp_size,
|
||||
interp_size,
|
||||
wt_ptr + i * max_interp_size,
|
||||
max_interp_size,
|
||||
aa_filter_fn,
|
||||
xmin,
|
||||
xmax);
|
||||
xmax,
|
||||
antialias,
|
||||
align_corners_delta);
|
||||
|
||||
idx_ptr_xmin[i] = xmin * stride;
|
||||
idx_ptr_size[i] = xmax;
|
||||
idx_ptr_stride[i] = stride;
|
||||
wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t);
|
||||
wt_idx_ptr[i] = i * max_interp_size * weight_index_stride;
|
||||
}
|
||||
return output;
|
||||
return {output, max_interp_size};
|
||||
}
|
||||
|
||||
/*
NOTE [ Weights computation for uint8_t and multiplication trick ]
When the input/output dtype is uint8_t, we still compute the interpolation
weights as double, but then convert them to int16 via some conversion logic
detailed below. This allows us to compute all interpolation operations (sum of
multiplications) as ints instead of floats. The result is converted back into
uint8 in basic_loop_aa_horizontal<uint8_t> (and vertical)

In essence the idea is to avoid a multiplication between a float (the
weight) and an int (the pixel value) and instead run a multiplication between
2 ints:

```py
COEF_PREC = 16

def mul(a: float, b: int) -> Tuple[float, int]:
    # return a * b, round(a * b)
    actual = a * b

    assert a > 0  # I'm lazy
    int_a = floor(0.5 + a * (1 << COEF_PREC))
    with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC

    return actual, with_trick  # round(actual) == with_trick!!
```

Here's how it works:
N == COEF_PREC
1 << N == 2**N
floor(0.5 + x) == round(x)

So the operation is something like

int_a = round(a * 2**N)  -- let's just say it's `a * 2**N` for simplicity

res = ((int_a * b) + (1 << (N - 1))) >> N
    = (a * 2**N * b + 2**(N - 1)) / 2**N
    = a * b + 0.5
    = round(a * b)
    = what we wanted
*/
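As a quick illustration of the note above, here is a standalone sketch (values chosen arbitrarily, not part of the diff) checking that the fixed-point trick reproduces round-half-up of the float product for uint8-range pixel values:

```py
from math import floor

COEF_PREC = 16  # same precision as in the note above

def mul_with_trick(a: float, b: int) -> int:
    # Quantize the (positive) float weight to fixed point, then replace the
    # float multiply with an integer multiply, add 0.5 in fixed point, shift.
    int_a = floor(0.5 + a * (1 << COEF_PREC))
    return ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC

# For weights that are not pathologically close to a rounding boundary,
# this matches round-half-up of the exact float product.
for a in (0.1234, 0.5, 0.9999):
    for b in (0, 17, 128, 255):
        assert mul_with_trick(a, b) == floor(a * b + 0.5)
```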
template <typename aa_filter_fn_t>
|
||||
static inline std::tuple<std::vector<Tensor>, int, unsigned int> _compute_indices_int16_weights_aa(
|
||||
int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
|
||||
int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale,
|
||||
int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false
|
||||
) {
|
||||
|
||||
double scale = area_pixel_compute_scale<double>(
|
||||
input_size, output_size, align_corners, opt_scale);
|
||||
|
||||
std::vector<Tensor> indices_weights;
|
||||
auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0;
|
||||
std::tie(indices_weights, interp_size) = HelperInterpBase::_compute_indices_weights_aa<double, aa_filter_fn_t, sizeof(int16_t)>(
|
||||
input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta);
|
||||
|
||||
// Rescale float weights to int16 and compute weights precision
|
||||
auto weights_f64 = indices_weights[3];
|
||||
double * data_f64 = weights_f64.data_ptr<double>();
|
||||
int64_t weights_f64_size = output_size * interp_size;
|
||||
// can't use weights_f64.max() here as tensor is restrided
|
||||
double w_max = data_f64[0];
|
||||
for (const auto i : c10::irange(weights_f64_size)) {
|
||||
double v = data_f64[i];
|
||||
if (w_max < v) {
|
||||
w_max = v;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int weights_precision = 0;
|
||||
for (weights_precision = 0; weights_precision < 22; weights_precision += 1) {
|
||||
int next_value = (int) (0.5 + w_max * (1 << (weights_precision + 1)));
|
||||
if (next_value >= (1 << 15))
|
||||
break;
|
||||
}
|
||||
|
||||
// Rescale float values to int16
|
||||
int16_t * data_i16 = (int16_t *) data_f64;
|
||||
auto aligned_interp_size = interp_size;
|
||||
|
||||
if (align_i32) {
|
||||
// We should respect int32 alignment as
|
||||
// we will load data as int32 with AVX2
|
||||
// See ImagingResampleHorizontalConvolution8u4x, mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
|
||||
// compute aligned_interp_size = nearest pair value to interp_size
|
||||
while (aligned_interp_size % sizeof(int32_t) != 0) {
|
||||
aligned_interp_size += 1;
|
||||
}
|
||||
// assert that we wont go out of bounds
|
||||
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
|
||||
}
|
||||
|
||||
for (const auto j : c10::irange(output_size)) {
|
||||
for (const auto k : c10::irange(interp_size)) {
|
||||
double v = data_f64[j * interp_size + k];
|
||||
if (v < 0) {
|
||||
data_i16[j * aligned_interp_size + k] = (int) (-0.5 + v * (1 << weights_precision));
|
||||
} else {
|
||||
data_i16[j * aligned_interp_size + k] = (int) (0.5 + v * (1 << weights_precision));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {indices_weights, aligned_interp_size, weights_precision};
|
||||
}
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
struct HelperInterpNearest : public HelperInterpBase {
|
||||
@ -923,8 +1174,9 @@ struct HelperInterpLinear : public HelperInterpBase {
|
||||
input_size, output_size, align_corners, opt_scale);
|
||||
|
||||
auto interp_size = HelperInterpLinear::interp_size;
|
||||
int unused;
|
||||
|
||||
indices_weights = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
|
||||
std::tie(indices_weights, unused) = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
|
||||
input_size,
|
||||
output_size,
|
||||
stride,
|
||||
@ -932,11 +1184,32 @@ struct HelperInterpLinear : public HelperInterpBase {
|
||||
reshape_dim,
|
||||
scale,
|
||||
interp_size,
|
||||
&HelperInterpLinear::aa_filter<scalar_t>);
|
||||
&HelperInterpLinear::aa_filter<scalar_t>,
|
||||
/*antialias=*/true,
|
||||
/*align_corners_delta=*/0.0);
|
||||
}
|
||||
);
|
||||
return indices_weights;
|
||||
}
|
||||
|
||||
static inline std::tuple<std::vector<Tensor>, int, unsigned int> compute_indices_int16_weights_aa(
|
||||
int64_t input_size,
|
||||
int64_t output_size,
|
||||
int64_t stride,
|
||||
int64_t ndims,
|
||||
int64_t reshape_dim,
|
||||
bool align_corners,
|
||||
const c10::optional<double> opt_scale,
|
||||
bool antialias,
|
||||
bool align_i32=false
|
||||
) {
|
||||
|
||||
auto interp_size = HelperInterpLinear::interp_size;
|
||||
auto fn = HelperInterpLinear::aa_filter<double>;
|
||||
return HelperInterpLinear::_compute_indices_int16_weights_aa(
|
||||
input_size, output_size, stride, ndims, reshape_dim,
|
||||
align_corners, opt_scale, interp_size, fn, antialias, align_i32);
|
||||
}
|
||||
};
|
||||
|
||||
struct HelperInterpCubic : public HelperInterpBase {
|
||||
@ -1033,8 +1306,9 @@ struct HelperInterpCubic : public HelperInterpBase {
|
||||
input_size, output_size, align_corners, opt_scale);
|
||||
|
||||
auto interp_size = HelperInterpCubic::interp_size;
|
||||
int unused;
|
||||
|
||||
indices_weights = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
|
||||
std::tie(indices_weights, unused) = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
|
||||
input_size,
|
||||
output_size,
|
||||
stride,
|
||||
@ -1042,11 +1316,14 @@ struct HelperInterpCubic : public HelperInterpBase {
|
||||
reshape_dim,
|
||||
scale,
|
||||
interp_size,
|
||||
&HelperInterpCubic::aa_filter<scalar_t>);
|
||||
&HelperInterpCubic::aa_filter<scalar_t>,
|
||||
/*antialias=*/true,
|
||||
/*align_corners_delta*/0.0);
|
||||
}
|
||||
);
|
||||
return indices_weights;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Generic upsampling interpolation kernel for N-d case.
|
||||
@ -1133,31 +1410,50 @@ void upsample_generic_Nd_kernel_impl(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void cpu_upsample_generic_aa(at::TensorIterator& iter) {
|
||||
template <typename scalar_t, bool is_horizontal>
|
||||
void cpu_upsample_generic_aa(at::TensorIterator& iter, unsigned int weights_precision) {
|
||||
|
||||
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
|
||||
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
|
||||
is_zero_stride<3 + 2>(&strides[2])) {
|
||||
basic_loop_aa_single_dim_zero_strides<scalar_t, int64_t>(
|
||||
data, strides, n);
|
||||
if (is_horizontal) {
|
||||
|
||||
// Strides are : X 0 | 8 8 8 0 8 (Channels first)
|
||||
// Strides are : X X | 0 0 0 0 0 (Channels last)
|
||||
// upsampling data within a contiguous dimension (aka horizontal resampling)
|
||||
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
|
||||
is_zero_stride<3 + 2>(&strides[2])) {
|
||||
// channels last case
|
||||
basic_loop_aa_horizontal<scalar_t>(
|
||||
data, strides, n, weights_precision);
|
||||
} else {
|
||||
basic_loop_aa_horizontal<scalar_t>(
|
||||
data, strides, n, weights_precision);
|
||||
}
|
||||
} else {
|
||||
basic_loop_aa_single_dim_nonzero_strides<scalar_t, int64_t>(
|
||||
data, strides, n);
|
||||
// Strides are : X Y | 0 0 0 0 0 (Channels first)
|
||||
// Strides are : X X | 0 0 0 0 0 (Channels last)
|
||||
// upsampling data between contiguous dimensions (aka vertical resampling)
|
||||
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
|
||||
is_zero_stride<3 + 2>(&strides[2])) {
|
||||
basic_loop_aa_vertical<scalar_t>(
|
||||
data, strides, n, weights_precision);
|
||||
} else {
|
||||
basic_loop_aa_vertical<scalar_t>(
|
||||
data, strides, n, weights_precision);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
iter.for_each(loop);
|
||||
}
|
||||
|
||||
// Generic separable upsampling interpolation kernels for N-d case with anti-aliasing
|
||||
template <int out_ndims, typename scale_type, class F>
|
||||
template <int out_ndims, typename scale_type, class F, bool is_horizontal>
|
||||
void _separable_upsample_generic_Nd_kernel_impl_single_dim(
|
||||
const Tensor& output,
|
||||
const Tensor& input,
|
||||
int interp_dim,
|
||||
bool align_corners,
|
||||
const scale_type& scales) {
|
||||
const scale_type& scales,
|
||||
bool antialias) {
|
||||
|
||||
// input can be NCHW, NCL or NCKHW
|
||||
auto shape = input.sizes().vec();
|
||||
@ -1174,21 +1470,29 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
|
||||
strides[interp_dim] = 0;
|
||||
auto restrided_input = input.as_strided(shape, strides);
|
||||
|
||||
std::vector<std::vector<Tensor>> indices_weights;
|
||||
|
||||
int interp_size = F::interp_size;
|
||||
auto input_scalar_type = input.scalar_type();
|
||||
if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) {
|
||||
// nearest also supports uint8 tensor, but we have to use float
|
||||
// with compute_indices_weights
|
||||
input_scalar_type = at::ScalarType::Float;
|
||||
}
|
||||
|
||||
indices_weights.emplace_back(
|
||||
std::vector<Tensor> indices_weights;
|
||||
unsigned int weights_precision = 0;
|
||||
int unused;
|
||||
|
||||
if (input_scalar_type == at::kByte) {
|
||||
std::tie(indices_weights, unused, weights_precision) =
|
||||
// TODO: change that to F:: once / if bicubic mode supports uint8 after all
|
||||
HelperInterpLinear::compute_indices_int16_weights_aa(
|
||||
input.size(interp_dim), oshape[interp_dim],
|
||||
input.stride(interp_dim) * input.element_size(),
|
||||
input.dim(), interp_dim, align_corners, scales[interp_dim - 2],
|
||||
antialias);
|
||||
TORCH_INTERNAL_ASSERT(weights_precision > 0);
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT(antialias);
|
||||
indices_weights =
|
||||
F::compute_indices_weights_aa(
|
||||
input_scalar_type, input.size(interp_dim), oshape[interp_dim],
|
||||
input.stride(interp_dim) * input.element_size(),
|
||||
input.dim(), interp_dim, align_corners, scales[interp_dim - 2]));
|
||||
input.dim(), interp_dim, align_corners, scales[interp_dim - 2]);
|
||||
}
|
||||
|
||||
TensorIteratorConfig config;
|
||||
config.check_all_same_dtype(false)
|
||||
@ -1196,51 +1500,95 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
|
||||
.add_output(output)
|
||||
.add_input(restrided_input);
|
||||
|
||||
for (auto& idx_weight : indices_weights) {
|
||||
for (auto& tensor : idx_weight) {
|
||||
config.add_input(tensor);
|
||||
}
|
||||
for (auto& tensor : indices_weights) {
|
||||
config.add_input(tensor);
|
||||
}
|
||||
|
||||
auto iter = config.build();
|
||||
|
||||
if (interp_size > 1) {
|
||||
// Nearest also supports uint8 tensor, so need to handle it separately
|
||||
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] {
|
||||
cpu_upsample_generic_aa<scalar_t>(iter);
|
||||
});
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(
|
||||
at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
|
||||
cpu_upsample_generic_aa<scalar_t>(iter);
|
||||
});
|
||||
}
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(
|
||||
at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
|
||||
cpu_upsample_generic_aa<scalar_t, is_horizontal>(iter, weights_precision);
|
||||
});
|
||||
}
|
||||
|
||||
// Generic separable upsampling interpolation kernel for N-d case with anti-aliasing.
|
||||
// It also supports antialias=False iff
|
||||
// (dtype == uint8 and mode in ("bilinear", "bicubic")): this is used as
|
||||
// fallback in these settings when AVX isn't supported.
|
||||
template <int out_ndims, typename scale_type, class F>
|
||||
void separable_upsample_generic_Nd_kernel_impl(
|
||||
const Tensor& output,
|
||||
const Tensor& input,
|
||||
bool align_corners,
|
||||
const scale_type& scales) {
|
||||
const scale_type& scales,
|
||||
bool antialias) {
|
||||
|
||||
auto output_shape = output.sizes();
|
||||
auto input_shape = input.sizes();
|
||||
auto temp_oshape = input_shape.vec();
|
||||
|
||||
if (output_shape == input_shape) {
|
||||
output.copy_(input);
|
||||
return;
|
||||
}
|
||||
|
||||
auto temp_oshape = input.sizes().vec();
|
||||
at::Tensor temp_output, temp_input = input;
|
||||
for (const auto i : c10::irange(out_ndims - 1)) {
|
||||
int interp_dim = 2 + out_ndims - 1 - i;
|
||||
temp_oshape[interp_dim] = output.sizes()[interp_dim];
|
||||
temp_output = at::empty(temp_oshape, input.options().memory_format(input.suggest_memory_format()));
|
||||
|
||||
int interp_dim = 0;
|
||||
// Precompute the number of single dim resize method invocations
|
||||
// to avoid copying temporary buffer to output
|
||||
int num_single_dim_ops = 0;
|
||||
for (const auto i : c10::irange(out_ndims)) {
|
||||
interp_dim = 2 + out_ndims - 1 - i;
|
||||
if (output_shape[interp_dim] != input_shape[interp_dim]) {
|
||||
num_single_dim_ops += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// upsampling data within the contiguous dimension (aka horizontal resampling)
|
||||
interp_dim = 2 + out_ndims - 1;
|
||||
if (output_shape[interp_dim] != input_shape[interp_dim]) {
|
||||
|
||||
num_single_dim_ops -= 1;
|
||||
if (num_single_dim_ops > 0) {
|
||||
temp_oshape[interp_dim] = output_shape[interp_dim];
|
||||
temp_output = at::empty(temp_oshape, input.options());
|
||||
} else {
|
||||
temp_output = output;
|
||||
}
|
||||
|
||||
_separable_upsample_generic_Nd_kernel_impl_single_dim<
|
||||
out_ndims,
|
||||
scale_t,
|
||||
F>(
|
||||
temp_output, temp_input, interp_dim, align_corners, scales);
|
||||
F,
|
||||
true>(
|
||||
temp_output, temp_input, interp_dim, align_corners, scales, antialias);
|
||||
temp_input = temp_output;
|
||||
}
|
||||
_separable_upsample_generic_Nd_kernel_impl_single_dim<
|
||||
out_ndims,
|
||||
scale_t,
|
||||
F>(output, temp_input, 2, align_corners, scales);
|
||||
|
||||
// upsampling data between contiguous dimensions (aka vertical resampling)
|
||||
for (const auto i : c10::irange(1, out_ndims)) {
|
||||
interp_dim = 2 + out_ndims - 1 - i;
|
||||
if (output_shape[interp_dim] != input_shape[interp_dim]) {
|
||||
|
||||
num_single_dim_ops -= 1;
|
||||
if (num_single_dim_ops > 0) {
|
||||
temp_oshape[interp_dim] = output_shape[interp_dim];
|
||||
temp_output = at::empty(temp_oshape, input.options());
|
||||
} else {
|
||||
temp_output = output;
|
||||
}
|
||||
|
||||
_separable_upsample_generic_Nd_kernel_impl_single_dim<
|
||||
out_ndims,
|
||||
scale_t,
|
||||
F,
|
||||
false>(
|
||||
temp_output, temp_input, interp_dim, align_corners, scales, antialias);
|
||||
temp_input = temp_output;
|
||||
}
|
||||
}
|
||||
}

void upsample_nearest1d_kernel_impl(
@@ -1356,7 +1704,8 @@ void upsample_linear1d_kernel_impl(
output, input, align_corners, {scales_w});
}

void upsample_bilinear2d_kernel_impl(

void upsample_bilinear2d_kernel_impl_float(
const Tensor& output,
const Tensor& input,
bool align_corners,
@@ -1378,15 +1727,56 @@ void upsample_bilinear2d_kernel_impl(
}
}

void upsample_bilinear2d_aa_kernel_impl(
void upsample_bilinear2d_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {

if (input.dtype() == at::kByte){
#ifdef CPU_CAPABILITY_AVX2
if (input.size(1) <= 4) {
upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(input,
output, align_corners, {scales_h, scales_w},
/*antialias=*/false);
} else {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/false);
}
#else // CPU_CAPABILITY_AVX2
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/false);
#endif // CPU_CAPABILITY_AVX2
} else {
upsample_bilinear2d_kernel_impl_float(output, input, align_corners, scales_h, scales_w);
}
}


void upsample_bilinear2d_aa_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {
#ifdef CPU_CAPABILITY_AVX2
if (input.dtype() == at::kByte && input.size(1) <= 4) {
upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(
input, output, align_corners, {scales_h, scales_w},
/*antialias=*/true);
} else {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
}
#else // CPU_CAPABILITY_AVX2
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w});
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
#endif // CPU_CAPABILITY_AVX2
}

void upsample_trilinear3d_kernel_impl(
@@ -1424,7 +1814,8 @@ void upsample_bicubic2d_aa_kernel_impl(
c10::optional<double> scales_w) {

separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>(
output, input, align_corners, {scales_h, scales_w});
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
}

template <
@@ -1500,7 +1891,9 @@ void cpu_upsample_genNd_backward_aa(
interp_height,
filter_fn,
ymin,
ysize);
ysize,
/*antialias=*/true,
/*align_corners_delta=*/0.0);

for (const auto ow : c10::irange(output_width)) {
F::_compute_weights_aa(
@@ -1512,7 +1905,9 @@ void cpu_upsample_genNd_backward_aa(
interp_width,
filter_fn,
xmin,
xsize);
xsize,
/*antialias=*/true,
/*align_corners_delta=*/0.0);

for (const auto c : c10::irange(begin, end)) {
scalar_t grad_output_value =

aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h (new file)
@@ -0,0 +1,719 @@
/*
The Python Imaging Library (PIL) is

Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh

Pillow is the friendly PIL fork. It is

Copyright © 2010-2022 by Alex Clark and contributors

Like PIL, Pillow is licensed under the open source HPND License
*/

// This code is heavily inspired from PILLOW-SIMD's implementation:
// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c

#pragma once
#ifdef CPU_CAPABILITY_AVX2
// TODO: This file only supports AVX2. We could split the AVX kernels into
// smaller logical blocks in order to port them into the Vec.h logic. This would
// allow to support other vectorization architectures and perhaps also support
// the non-vectorized fallback (we'd need to make sure it's not slower than the
// current fallback).

#include <ATen/core/Tensor.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/irange.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif


namespace {

static __m128i inline mm_cvtepu8_epi32(const uint32_t* C10_RESTRICT ptr) {
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
}
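A note on the helper above: `_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*ptr))` loads one packed 4-byte pixel and zero-extends each byte into its own 32-bit lane. A scalar sketch of the same widening (assuming the little-endian byte order the load relies on; this is illustration, not patch code):

```
#include <array>
#include <cstdint>

// Scalar equivalent of mm_cvtepu8_epi32: take one packed RGBA pixel
// (4 bytes stored in a uint32_t) and widen each channel to 32 bits.
std::array<int32_t, 4> widen_rgba_pixel(uint32_t pixel) {
  std::array<int32_t, 4> lanes{};
  for (int c = 0; c < 4; ++c) {
    lanes[c] = static_cast<int32_t>((pixel >> (8 * c)) & 0xFF);  // zero-extend byte c
  }
  return lanes;
}
```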

// TODO: We may want to hard-code an unrolled version for the case where
// num_channels=3 to hint the compiler to vectorize this (looks at original
// PIL-SIMD's code).
at::Tensor unpack_rgb(const at::Tensor& packed_tensor) {
// Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into
// RGBARGBARGBA format where A is hard-coded to 255. Each pixel is encoded
// into as 32bits. This generalizes to num_channels <= 4 and also works for
// non-channels_last tensors.

const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr<uint8_t>();
auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
auto num_channels = packed_tensor.size(0);

constexpr int rgba_size = 4;
auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte));
uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr<uint8_t>();

auto stride_i = packed_tensor.stride(2);
auto stride_j = packed_tensor.stride(0);

for (const auto i : c10::irange(num_pixels)) {
for (const auto j : c10::irange(rgba_size)) {
unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0;
}
}
return unpacked_tensor;
}

void pack_rgb(
const at::Tensor& unpacked_tensor, // IN
const at::Tensor& packed_tensor // OUT
) {
constexpr int rgba_size = 4;
uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr<uint8_t>();
uint8_t* packed = (uint8_t*)packed_tensor.data_ptr<uint8_t>();
auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
auto num_channels = packed_tensor.size(0);

auto packed_increment = packed_tensor.stride(2);
auto packed_stride = packed_tensor.stride(0);

for (const auto i C10_UNUSED : c10::irange(num_pixels)) {
for (const auto j : c10::irange(num_channels)) {
packed[j * packed_stride] = unpacked[j];
}
unpacked += rgba_size;
packed += packed_increment;
}
}
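To make the layout conversion concrete: `unpack_rgb` turns a planar tensor with `num_channels <= 4` into interleaved 4-bytes-per-pixel storage (missing channels written as padding, per the loop above), and `pack_rgb` strips that padding back out. A standalone sketch of the same round trip on raw contiguous CHW buffers, with hypothetical names of my own:

```
#include <cstdint>
#include <vector>

// Unpack a contiguous CHW buffer (num_channels <= 4) into RGBA-style
// 4-bytes-per-pixel storage; missing channels are padded with 0.
std::vector<uint8_t> unpack_chw(const uint8_t* chw, int64_t num_channels, int64_t num_pixels) {
  std::vector<uint8_t> rgba(4 * num_pixels, 0);
  for (int64_t i = 0; i < num_pixels; ++i) {
    for (int64_t c = 0; c < num_channels; ++c) {
      rgba[4 * i + c] = chw[c * num_pixels + i];  // planar -> interleaved
    }
  }
  return rgba;
}

// Pack the interleaved buffer back to planar CHW, dropping the padding bytes.
void pack_chw(const std::vector<uint8_t>& rgba, uint8_t* chw, int64_t num_channels, int64_t num_pixels) {
  for (int64_t i = 0; i < num_pixels; ++i) {
    for (int64_t c = 0; c < num_channels; ++c) {
      chw[c * num_pixels + i] = rgba[4 * i + c];  // interleaved -> planar
    }
  }
}
```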

void ImagingResampleHorizontalConvolution8u4x(
uint32_t* C10_RESTRICT lineOut0,
uint32_t* C10_RESTRICT lineOut1,
uint32_t* C10_RESTRICT lineOut2,
uint32_t* C10_RESTRICT lineOut3,
const uint32_t* C10_RESTRICT lineIn0,
const uint32_t* C10_RESTRICT lineIn1,
const uint32_t* C10_RESTRICT lineIn2,
const uint32_t* C10_RESTRICT lineIn3,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision);

void ImagingResampleHorizontalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT lineIn,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision);

void ImagingResampleVerticalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT imIn,
int xmin,
int xmax,
int16_t* k,
int coefs_precision,
int xin);

void ImagingResampleHorizontal(
const at::Tensor & unpacked_output,
const at::Tensor & unpacked_input,
int ksize,
const std::vector<at::Tensor>& horiz_indices_weights,
unsigned int horiz_weights_precision) {
// TODO: we may want to merge that into the fallback code (currently called
// basic_loop_aa_horizontal<uint8_t>)
// Although this may not be needed if / when we port all this code to use
// Vec.h since this would potentially give us another fall-back implem
int yy;

int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr<double>());

auto xout = unpacked_output.size(2);
auto yout = unpacked_output.size(1);
auto xin = unpacked_input.size(2);

std::vector<int> bounds_vec(2 * xout, 0);
int* bounds = bounds_vec.data();

int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr<int64_t>();
int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr<int64_t>();
for (int i = 0; i < xout; i++) {
bounds[2 * i + 0] = idx_ptr_xmin[i];
bounds[2 * i + 1] = idx_ptr_size[i];
}

uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();

yy = 0;
for (; yy < yout - 3; yy += 4) {
ImagingResampleHorizontalConvolution8u4x(
unpacked_output_p + yy * xout,
unpacked_output_p + (yy + 1) * xout,
unpacked_output_p + (yy + 2) * xout,
unpacked_output_p + (yy + 3) * xout,
unpacked_input_p + yy * xin,
unpacked_input_p + (yy + 1) * xin,
unpacked_input_p + (yy + 2) * xin,
unpacked_input_p + (yy + 3) * xin,
xout,
bounds,
kk,
ksize,
(int)horiz_weights_precision);
}
for (; yy < yout; yy++) {
ImagingResampleHorizontalConvolution8u(
unpacked_output_p + yy * xout,
unpacked_input_p + yy * xin,
xout,
bounds,
kk,
ksize,
(int)horiz_weights_precision);
}
}
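The `bounds` buffer built above interleaves, for output pixel `i`, the first contributing input index and the number of taps: `bounds[2*i] = xmin`, `bounds[2*i + 1] = size`, while `kk` holds `ksize` int16 weights per output pixel. A hedged scalar sketch of how a consumer would walk this layout (single channel for clarity; the real kernels below process 4-byte pixels with AVX2, and the names here are mine):

```
#include <cstdint>

// Illustrative scalar consumer of the (xmin, size) bounds layout plus
// per-output-pixel int16 fixed-point weights.
void resample_row_scalar(
    uint8_t* out, const uint8_t* in, int xout,
    const int* bounds, const int16_t* kk, int ksize, int precision) {
  for (int xx = 0; xx < xout; ++xx) {
    int xmin = bounds[2 * xx + 0];       // first contributing input pixel
    int size = bounds[2 * xx + 1];       // number of taps for this output pixel
    const int16_t* k = &kk[xx * ksize];  // weights for this output pixel
    int32_t acc = 1 << (precision - 1);  // rounding constant
    for (int x = 0; x < size; ++x) {
      acc += static_cast<int32_t>(in[xmin + x]) * k[x];
    }
    acc >>= precision;
    out[xx] = static_cast<uint8_t>(acc < 0 ? 0 : (acc > 255 ? 255 : acc));
  }
}
```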

void ImagingResampleVertical(
const at::Tensor & unpacked_output,
const at::Tensor & unpacked_input,
int ksize,
const std::vector<at::Tensor>& vert_indices_weights,
unsigned int vert_weights_precision) {
// TODO: we may want to merge that into the fallback code (currently called
// basic_loop_aa_vertical<uint8_t>)
// Although this may not be needed if / when we port all this code to use
// Vec.h since this would potentially give us another fall-back implem
int ymin, ymax;
int16_t* k = nullptr;
int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr<double>());

int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr<int64_t>();
int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr<int64_t>();

uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();

auto xout = unpacked_output.size(2);
auto yout = unpacked_output.size(1);

for (const auto yy : c10::irange(yout)) {
k = &kk[yy * ksize];

ymin = idx_ptr_xmin[yy];
ymax = idx_ptr_size[yy];
ImagingResampleVerticalConvolution8u(
unpacked_output_p + yy * xout,
unpacked_input_p,
ymin,
ymax,
k,
(int)vert_weights_precision,
xout);
}
}

// This is the only public entry point in this file. It supports bilinear
// mode for uint8 dtype when C <= 4, with or without antialias. The
// implem is based on PIL-SIMD.
// Its equivalent implementation (fallback) for when AVX isn't supported or when
// C > 4 is separable_upsample_generic_Nd_kernel_impl() There are a bunch of
// future improvement that can be done: look for the TODOs in this file.
// For details on how the weights are computed and how the multiplications are
// run on int (instead of float weights), see
// [ Weights computation for uint8_t and multiplication trick ]
// For details on how the AVX kernels are implemented, see
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
// See also [ Support for antialias=False as a subcase of antilias=True ] to
// learn more about how the antialias=False case is computed. The same holds
// here: all these kernels are general enough to handle an arbitrary number of
// weights, but when aa=False they could be optimized further.
template <typename scale_type, class F>
void upsample_avx_bilinear_uint8(
const at::Tensor& input,
const at::Tensor& output,
bool align_corners,
const scale_type& scales,
bool antialias) {
auto batch_size = input.size(0);
auto num_channels = input.size(1);
auto xin = input.size(3);
auto yin = input.size(2);
auto xout = output.size(3);
auto yout = output.size(2);

if (xin == xout && yin == yout) {
output.copy_(input);
return;
}

auto need_horizontal = xout != xin;
auto need_vertical = yout != yin;

int ksize_horiz, ksize_vert;
std::vector<at::Tensor> horiz_indices_weights, vert_indices_weights;
unsigned int horiz_weights_precision, vert_weights_precision;

if (need_horizontal) {
int interp_dim = 3;
std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
F::compute_indices_int16_weights_aa(
/*input_size=*/xin,
/*output_size=*/xout,
/*stride=*/1,
/*ndims=*/4,
/*reshape_dim=*/interp_dim,
/*align_corners=*/align_corners,
/*opt_scale=*/scales[interp_dim - 2],
/*antialias=*/antialias,
/*align_i32=*/true);
}

if (need_vertical) {
int interp_dim = 2;
std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
F::compute_indices_int16_weights_aa(
/*input_size=*/yin,
/*output_size=*/yout,
/*stride=*/1,
/*ndims=*/4,
/*reshape_dim=*/interp_dim,
/*align_corners=*/align_corners,
/*opt_scale=*/scales[interp_dim - 2],
/*antialias=*/antialias,
/*align_i32=*/true);
}

bool is_rgba = num_channels == 4 && input.is_contiguous(at::MemoryFormat::ChannelsLast);

at::Tensor buffer_horiz, buffer_vert;
if (need_horizontal && !(is_rgba && !need_vertical)) {
buffer_horiz = at::empty({4, yin, xout}, input.options());
}
if (need_vertical && !is_rgba) {
buffer_vert = at::empty({4, yout, xout}, input.options());
}

// TODO: The unpack / pack operations create a copy of the original input and
// output tensor. There should be a way to avoid these copies by instead
// modifying the low-level kernels. Or maybe at least avoid copying the entire
// tensors and just copy part of them (line by line).
for (const auto i : c10::irange(batch_size)) {

at::Tensor unpacked_input = (is_rgba) ? input[i] : unpack_rgb(input[i]);
at::Tensor unpacked_output;

if (need_horizontal) {

at::Tensor unpacked_output_temp = (is_rgba && !need_vertical) ? output[i] : buffer_horiz;

ImagingResampleHorizontal(
unpacked_output_temp,
unpacked_input,
ksize_horiz,
horiz_indices_weights,
horiz_weights_precision);
unpacked_output = unpacked_input = unpacked_output_temp;
}
if (need_vertical) {
unpacked_output = (is_rgba) ? output[i] : buffer_vert;

ImagingResampleVertical(
unpacked_output,
unpacked_input,
ksize_vert,
vert_indices_weights,
vert_weights_precision);
}

TORCH_INTERNAL_ASSERT(unpacked_output.defined());

if (!is_rgba) {
pack_rgb(unpacked_output, output[i]);
}
}
}
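Regarding the "[ Weights computation for uint8_t and multiplication trick ]" note referenced above: rather than multiplying by float weights, `compute_indices_int16_weights_aa` (not shown in this diff) returns weights pre-scaled to int16 fixed point with `*_weights_precision` fractional bits, so the kernels can accumulate integer products and rescale with a shift. A minimal sketch of that quantization step under assumed naming, as an illustration only:

```
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize float resampling weights to int16 fixed point. With `precision`
// fractional bits, w_float is approximated by round(w_float * (1 << precision));
// a dot product against uint8 pixels is later rounded by adding
// 1 << (precision - 1) and rescaled with ">> precision" (see the kernels below).
std::vector<int16_t> quantize_weights(const std::vector<double>& w, unsigned int precision) {
  std::vector<int16_t> out(w.size());
  for (size_t i = 0; i < w.size(); ++i) {
    out[i] = static_cast<int16_t>(std::lround(w[i] * (1 << precision)));
  }
  return out;
}
```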

// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u4x(
uint32_t* C10_RESTRICT lineOut0,
uint32_t* C10_RESTRICT lineOut1,
uint32_t* C10_RESTRICT lineOut2,
uint32_t* C10_RESTRICT lineOut3,
const uint32_t* C10_RESTRICT lineIn0,
const uint32_t* C10_RESTRICT lineIn1,
const uint32_t* C10_RESTRICT lineIn2,
const uint32_t* C10_RESTRICT lineIn3,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision) {
int xmin, xmax, x;
int16_t* k;

for (const auto xx : c10::irange(xsize)) {
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;

__m256i sss0, sss1;
__m256i zero = _mm256_setzero_si256();
__m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
sss0 = initial;
sss1 = initial;

for (; x < xmax - 3; x += 4) {
__m256i pix, mmk0, mmk1, source;

mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
mmk1 = _mm256_set1_epi32(*(int32_t*)&k[x + 2]);

source = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn0[x + xmin])),
_mm_loadu_si128((__m128i*)&lineIn1[x + xmin]),
1);
// clang-format off
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));

source = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn2[x + xmin])),
_mm_loadu_si128((__m128i*)&lineIn3[x + xmin]),
1);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
}

for (; x < xmax - 1; x += 2) {
__m256i pix, mmk;

mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);

pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn0[x + xmin])),
_mm_loadl_epi64((__m128i*)&lineIn1[x + xmin]),
1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));

pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn2[x + xmin])),
_mm_loadl_epi64((__m128i*)&lineIn3[x + xmin]),
1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
// clang-format on
}

for (; x < xmax; x++) {
__m256i pix, mmk;

// [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
mmk = _mm256_set1_epi32(k[x]);

// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn0[x + xmin])),
mm_cvtepu8_epi32(&lineIn1[x + xmin]),
1);
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));

pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn2[x + xmin])),
mm_cvtepu8_epi32(&lineIn3[x + xmin]),
1);
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
}

sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss0 = _mm256_packs_epi32(sss0, zero);
sss1 = _mm256_packs_epi32(sss1, zero);
sss0 = _mm256_packus_epi16(sss0, zero);
sss1 = _mm256_packus_epi16(sss1, zero);
lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
}
}

// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT lineIn,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision) {
int xmin, xmax, x;
int16_t* k;

for (const auto xx : c10::irange(xsize)) {
__m128i sss;
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;

if (xmax < 8) {
sss = _mm_set1_epi32(1 << (coefs_precision - 1));
} else {
// Lower part will be added to higher, use only half of the error
__m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2));

for (; x < xmax - 7; x += 8) {
__m256i pix, mmk, source;
__m128i tmp = _mm_loadu_si128((__m128i*)&k[x]);
__m256i ksource =
_mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);

// clang-format off
source = _mm256_loadu_si256((__m256i*)&lineIn[x + xmin]);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));

pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
// clang-format on
}

for (; x < xmax - 3; x += 4) {
__m256i pix, mmk, source;
__m128i tmp = _mm_loadl_epi64((__m128i*)&k[x]);
__m256i ksource =
_mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);

tmp = _mm_loadu_si128((__m128i*)&lineIn[x + xmin]);
source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);

// clang-format off
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
// clang-format on
}

sss = _mm_add_epi32(
_mm256_extracti128_si256(sss256, 0),
_mm256_extracti128_si256(sss256, 1));
}

for (; x < xmax - 1; x += 2) {
__m128i mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
__m128i source = _mm_loadl_epi64((__m128i*)&lineIn[x + xmin]);
__m128i pix = _mm_shuffle_epi8(
source,
_mm_set_epi8(-1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0));
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}

for (; x < xmax; x++) {
__m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]);
__m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
sss = _mm_srai_epi32(sss, coefs_precision);
sss = _mm_packs_epi32(sss, sss);
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
}
}
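One subtlety worth spelling out from the kernel above: when the 256-bit accumulator path is taken, it is seeded with `1 << (coefs_precision - 2)` rather than the usual `1 << (coefs_precision - 1)`, because its two 128-bit halves are added together at the end, so each half carries half of the rounding bias. A tiny standalone check of that identity (values are illustrative, not taken from the patch):

```
#include <cassert>
#include <cstdint>

int main() {
  const int precision = 8;            // illustrative precision value
  const int32_t half_bias = 1 << (precision - 2);
  const int32_t full_bias = 1 << (precision - 1);
  int32_t lo = half_bias + 100;       // pretend dot-product contribution, low half
  int32_t hi = half_bias + 23;        // pretend dot-product contribution, high half
  // Folding the two halves reproduces one accumulator seeded with the full bias.
  assert(lo + hi == full_bias + 123);
  return 0;
}
```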

// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleVerticalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT imIn,
int xmin,
int xmax,
int16_t* k,
int coefs_precision,
int xin) {
int x;
int xx = 0;
int xsize = xin;

__m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1));
__m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));

for (; xx < xsize - 7; xx += 8) {
__m256i sss0 = initial_256;
__m256i sss1 = initial_256;
__m256i sss2 = initial_256;
__m256i sss3 = initial_256;
x = 0;
for (; x < xmax - 1; x += 2) {
__m256i source, source1, source2;
__m256i pix, mmk;

// Load two coefficients at once
mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);

// Load 2 lines
// (__m256i *) &imIn->image32[x + xmin][xx]
source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
// (__m256i *) &imIn->image32[x + 1 + xmin][xx]
source2 =
_mm256_loadu_si256((__m256i*)(imIn + (x + 1 + xmin) * xin + xx));

source = _mm256_unpacklo_epi8(source1, source2);
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));

source = _mm256_unpackhi_epi8(source1, source2);
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
}
for (; x < xmax; x += 1) {
__m256i source, source1, pix, mmk;
mmk = _mm256_set1_epi32(k[x]);

// (__m256i *) &imIn->image32[x + xmin][xx])
source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));

source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));

source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
}
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss2 = _mm256_srai_epi32(sss2, coefs_precision);
sss3 = _mm256_srai_epi32(sss3, coefs_precision);

sss0 = _mm256_packs_epi32(sss0, sss1);
sss2 = _mm256_packs_epi32(sss2, sss3);
sss0 = _mm256_packus_epi16(sss0, sss2);
_mm256_storeu_si256((__m256i*)&lineOut[xx], sss0);
}

for (; xx < xsize - 1; xx += 2) {
__m128i sss0 = initial; // left row
__m128i sss1 = initial; // right row
x = 0;
for (; x < xmax - 1; x += 2) {
__m128i source, source1, source2;
__m128i pix, mmk;

// Load two coefficients at once
mmk = _mm_set1_epi32(*(int32_t*)&k[x]);

// Load 2 lines
// (__m128i *) &imIn->image32[x + xmin][xx])
source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
// (__m128i *) &imIn->image32[x + 1 + xmin][xx]
source2 = _mm_loadl_epi64((__m128i*)(imIn + (x + 1 + xmin) * xin + xx));

source = _mm_unpacklo_epi8(source1, source2);
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x += 1) {
__m128i source, source1, pix, mmk;
mmk = _mm_set1_epi32(k[x]);

// (__m128i *) &imIn->image32[x + xmin][xx]);
source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));

source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
}
sss0 = _mm_srai_epi32(sss0, coefs_precision);
sss1 = _mm_srai_epi32(sss1, coefs_precision);

sss0 = _mm_packs_epi32(sss0, sss1);
sss0 = _mm_packus_epi16(sss0, sss0);
_mm_storel_epi64((__m128i*)&lineOut[xx], sss0);
}

for (; xx < xsize; xx++) {
__m128i sss = initial;
x = 0;
for (; x < xmax - 1; x += 2) {
__m128i source, source1, source2;
__m128i pix, mmk;

// Load two coefficients at once
mmk = _mm_set1_epi32(*(int32_t*)&k[x]);

// Load 2 lines
// *(int *) &imIn->image32[x + xmin][xx]
source1 = _mm_cvtsi32_si128(*(int*)(imIn + (x + xmin) * xin + xx));
// *(int *) &imIn->image32[x + 1 + xmin][xx]
source2 = _mm_cvtsi32_si128(*(int*)(imIn + (x + 1 + xmin) * xin + xx));

source = _mm_unpacklo_epi8(source1, source2);
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}

for (; x < xmax; x++) {
// &imIn->image32[x + xmin][xx]
__m128i pix = mm_cvtepu8_epi32(imIn + (x + xmin) * xin + xx);
__m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
sss = _mm_srai_epi32(sss, coefs_precision);
sss = _mm_packs_epi32(sss, sss);
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
}
}

} // anonymous namespace
#endif // CPU_CAPABILITY_AVX2

test/test_nn.py
@@ -9367,67 +9367,67 @@ class TestNNDeviceType(NNTestCase):
@parametrize_test("antialias", [True, False])
@parametrize_test("align_corners", [True, False])
@parametrize_test("mode", ["bilinear", "bicubic"])
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@onlyNativeDeviceTypes
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode):
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format):
# Forward AD does not support XLA because XLA tensors don't have storage
check_forward_ad = torch.device(device).type != 'xla'

kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias)
for memory_format in [torch.contiguous_format, torch.channels_last]:
# test float scale factor up & downsampling
for scale_factor in [0.5, 1.5, 2]:
in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
out_size = int(math.floor(in_t.shape[-1] * scale_factor))
with warnings.catch_warnings(record=True) as w:
out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
expected_out = torch.ones(2, 3, out_size, out_size, device=device)
self.assertEqual(expected_out, out_t)
# Assert that memory format is carried through to the output
self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
out_t.backward(torch.randn_like(out_t))
self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))
# test float scale factor up & downsampling
for scale_factor in [0.5, 1.5, 2]:
in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
out_size = int(math.floor(in_t.shape[-1] * scale_factor))
with warnings.catch_warnings(record=True) as w:
out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
expected_out = torch.ones(2, 3, out_size, out_size, device=device)
self.assertEqual(expected_out, out_t)
# Assert that memory format is carried through to the output
self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
out_t.backward(torch.randn_like(out_t))
self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))

if torch.device(device).type == 'cuda':
# Bilinear backward is nondeterministic because of atomicAdd usage
nondet_tol = 1e-5
else:
nondet_tol = 0.0
if torch.device(device).type == 'cuda':
# Bilinear backward is nondeterministic because of atomicAdd usage
nondet_tol = 1e-5
else:
nondet_tol = 0.0

input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
gradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
)
gradgradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
)
input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
gradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
)
gradgradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
)

# Assert that cpu and cuda give same results
if torch.device(device).type == 'cuda':
for shapes in [
(2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
]:
a_cuda = torch.randn(
*shapes, device=device
).contiguous(memory_format=memory_format).requires_grad_()
a_cpu = a_cuda.detach().cpu().requires_grad_()
# Assert that cpu and cuda give same results
if torch.device(device).type == 'cuda':
for shapes in [
(2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
]:
a_cuda = torch.randn(
*shapes, device=device
).contiguous(memory_format=memory_format).requires_grad_()
a_cpu = a_cuda.detach().cpu().requires_grad_()

with warnings.catch_warnings(record=True):
out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)
with warnings.catch_warnings(record=True):
out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)

self.assertEqual(out_cpu, out_cuda.cpu())
self.assertEqual(out_cpu, out_cuda.cpu())

g_cuda = torch.randn_like(out_cuda)
g_cpu = g_cuda.cpu()
g_cuda = torch.randn_like(out_cuda)
g_cpu = g_cuda.cpu()

out_cuda.backward(g_cuda)
out_cpu.backward(g_cpu)
out_cuda.backward(g_cuda)
out_cpu.backward(g_cpu)

self.assertEqual(a_cuda.grad, a_cpu.grad)
self.assertEqual(a_cuda.grad, a_cpu.grad)

@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
@@ -9445,6 +9445,40 @@ class TestNNDeviceType(NNTestCase):
t_out = F.interpolate(t_in, size=(2, 2), mode="bilinear", align_corners=False, antialias=True)
self.assertEqual(expected_out, t_out)

@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@parametrize_test("antialias", [True, False])
@parametrize_test("align_corners", [True, False])
@parametrize_test("num_channels", [3, 5])
@parametrize_test("output_size", [32, 600])
def test_upsamplingBiLinear2d_consistency(self, device, memory_format, antialias, align_corners, num_channels, output_size):
if torch.device(device).type == "cuda":
raise SkipTest("CUDA implementation is not yet supporting uint8")

mode = "bilinear"
# Check if Max Abs Error between resized input_uint8 and resized input_float is smaller than a tolerated value, e.g. 1.0
input_ui8 = torch.randint(0, 256, size=(1, num_channels, 400, 400), dtype=torch.uint8, device=device)
input_ui8 = input_ui8.contiguous(memory_format=memory_format)
input_f32 = input_ui8.float()

output_f32 = F.interpolate(
input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
)
output_ui8 = F.interpolate(
input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
)

mae_tol = 0.5
max_abs_err_tol = 1.0
num_wrong_pixels_tol = 5

abs_diff = torch.abs(output_f32.round() - output_ui8.float())
mae = torch.mean(abs_diff)
max_abs_err = torch.max(abs_diff)
num_wrong_pixels = (abs_diff > max_abs_err_tol).sum()
self.assertTrue(mae < mae_tol, msg=f"mae={mae}")
self.assertTrue(max_abs_err < max_abs_err_tol + 1e-5, msg=f"max ae={max_abs_err}")
self.assertTrue(num_wrong_pixels < num_wrong_pixels_tol, msg=f"num_wrong_pixels={num_wrong_pixels}")

def test_upsamplingBicubic2d_correctness(self, device):
# test output against known input: align_corners=False result must match opencv
in_t = torch.arange(8., device=device).view(1, 2, 2, 2)
@@ -12118,7 +12118,7 @@ op_db: List[OpInfo] = [
supports_fwgrad_bwgrad=True,
supports_autograd=True,
supports_forward_ad=True,
dtypes=floating_types_and(torch.bfloat16),
dtypes=floating_types_and(torch.uint8, torch.bfloat16),
dtypesIfCUDA=floating_types_and(torch.half),
gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'),
@@ -12184,7 +12184,7 @@ op_db: List[OpInfo] = [
supports_autograd=True,
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
dtypes=floating_types_and(torch.bfloat16),
dtypes=floating_types_and(torch.uint8, torch.bfloat16),
dtypesIfCUDA=floating_types_and(torch.half),
gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
sample_inputs_func=partial(sample_inputs_upsample, 'bilinear'),