Add uint8 support for interpolate for CPU images (#90771)

Joint work with @vfdev-5

This PR introduces native uint8 support in `interpolate()` for the `bilinear` ~and `bicubic`~ modes on CPU images (`mode=nearest[_exact]` was already supported).

On a typical torchvision training job on ImageNet, the speed-up is ~4X when AVX2 is supported, comparing the native uint8 path (this PR) against what torchvision's `Resize()` currently does:

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size         output_size channels_last AA    mode       num_threads  speed-up float vs uint8 (this PR)
(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=1   4X    2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=1   2.1X  1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=1   3X    2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=1   4X    2.4ms vs 0.6ms

(Note: we removed bicubic support for now)
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=1   4X    2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=1   5X    3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=1   3X    2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=1   4X    2.8ms vs 0.7ms

```

There is still room for further speed-ups (see TODOs in the code).
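
For reference, here is a minimal sketch of the two paths being compared (the tensor name and sizes are illustrative; the uint8 path requires this PR):

```py
import torch
import torch.nn.functional as F

img = torch.randint(0, 256, size=(1, 3, 270, 268), dtype=torch.uint8)

# "float" baseline (what Resize() currently does): round-trip through float32.
out_float = F.interpolate(img.float(), size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)
out_float = out_float.round().clamp(0, 255).to(torch.uint8)

# "uint8" path (this PR): interpolate the uint8 tensor directly.
out_uint8 = F.interpolate(img, size=(224, 224), mode="bilinear",
                          align_corners=False, antialias=True)
```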

#### More benchmark details

With AVX2 support: speed-ups typically range from 1.5X to 10X. A few edge cases are slower; it's worth investigating why.

<details>

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size         output_size channels_last AA    mode       num_threads  speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=1   5X    1.1ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=1   5X    1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=1   2.8X  0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=1   7X    1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=1   5X    1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=1   12X   2.9ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=1   3X    0.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=1   7X    1.8ms vs 0.2ms

(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=2   2.6X  0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=2   2.8X  0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=2   1.7X  0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=2   1.4X  0.3ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=2   2.7X  0.7ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=2   7X    1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=2   1.8X  0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=2   4X    1.0ms vs 0.2ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=1   4X    2.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=1   3.0X  1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=1   3X    1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=1   4X    2.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=1   4X    2.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=1   7X    4.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=1   3X    2.1ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=1   4X    2.6ms vs 0.6ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=2   2.7X  1.6ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=2   2.6X  1.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=2   2.1X  1.2ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=2   1.6X  0.9ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=2   2.8X  1.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=2   5X    2.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=2   2.3X  1.4ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=2   3X    1.9ms vs 0.6ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=1   4X    26.6ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=1   4X    23.9ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=1   2.5X  16.8ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=1   5X    33.1ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=1   4X    25.9ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=1   8X    59.6ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=1   1.9X  14.3ms vs 7.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=1   5X    35.4ms vs 7.3ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=2   2.0X  13.6ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=2   2.2X  14.8ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=2   1.3X  8.8ms vs 6.9ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=2   1.2X  8.4ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=2   1.8X  12.8ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=2   4X    32.1ms vs 7.2ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=2   1.4X  10.1ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=2   2.9X  20.9ms vs 7.3ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=1   1.4X  0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=1   0.7X  0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=1   1.3X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=1   1.4X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=1   2.1X  0.7ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=1   1.3X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=1   1.9X  0.6ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=1   1.0X  0.3ms vs 0.3ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=2   1.0X  0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=2   0.6X  0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=2   0.8X  0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=2   1.4X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=2   1.4X  0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=2   1.2X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=2   1.2X  0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=2   0.9X  0.3ms vs 0.3ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=1   4X    2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=1   2.1X  1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=1   3X    2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=1   4X    2.4ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=1   4X    2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=1   5X    3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=1   3X    2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=1   4X    2.8ms vs 0.7ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=2   1.5X  1.0ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=2   1.2X  0.8ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=2   2.3X  1.5ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=2   1.9X  1.2ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=2   1.6X  1.2ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=2   4X    2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=2   2.4X  1.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=2   2.8X  1.8ms vs 0.6ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=1   2.1X  12.8ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=1   0.6X  3.8ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=1   1.2X  7.1ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=1   1.9X  11.0ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=1   2.0X  12.6ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=1   1.0X  6.1ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=1   1.8X  11.3ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=1   0.8X  4.6ms vs 6.0ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=2   1.6X  9.3ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=2   0.3X  2.0ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=2   1.2X  7.2ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=2   0.3X  1.6ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=2   1.1X  7.1ms vs 6.5ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=2   0.6X  3.3ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=2   0.9X  5.9ms vs 6.3ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=2   0.4X  2.4ms vs 5.9ms
```

</details>

Without AVX2 support: no significant speed-up, but there are various possible improvements (see TODOs).

<details>

```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)

input_size         output_size channels_last AA    mode       num_threads  speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=1   0.9X  1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=1   0.9X  1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=1   0.8X  0.9ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=1   1.5X  1.7ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=1   0.9X  1.6ms vs 1.8ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=1   2.1X  3.9ms vs 1.9ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=1   0.8X  1.1ms vs 1.4ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=1   1.7X  2.4ms vs 1.5ms

(1, 3, 64, 64) -> (224, 224)       True    True    bilinear   num_threads=2   0.9X  0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)       True    False   bilinear   num_threads=2   0.9X  0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)       False   True    bilinear   num_threads=2   0.9X  0.5ms vs 0.6ms
(1, 3, 64, 64) -> (224, 224)       False   False   bilinear   num_threads=2   0.7X  0.5ms vs 0.7ms
(1, 3, 64, 64) -> (224, 224)       True    True    bicubic    num_threads=2   0.9X  0.9ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224)       True    False   bicubic    num_threads=2   2.1X  2.0ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224)       False   True    bicubic    num_threads=2   0.8X  0.6ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224)       False   False   bicubic    num_threads=2   1.7X  1.3ms vs 0.8ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=1   1.0X  3.0ms vs 3.0ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=1   1.0X  2.8ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=1   1.0X  2.3ms vs 2.2ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=1   1.4X  3.3ms vs 2.3ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=1   1.0X  3.5ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=1   1.7X  6.1ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=1   0.9X  2.6ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=1   1.4X  4.2ms vs 2.9ms

(1, 3, 224, 224) -> (270, 268)     True    True    bilinear   num_threads=2   1.0X  1.7ms vs 1.7ms
(1, 3, 224, 224) -> (270, 268)     True    False   bilinear   num_threads=2   0.9X  1.6ms vs 1.8ms
(1, 3, 224, 224) -> (270, 268)     False   True    bilinear   num_threads=2   0.9X  1.3ms vs 1.4ms
(1, 3, 224, 224) -> (270, 268)     False   False   bilinear   num_threads=2   0.7X  1.1ms vs 1.6ms
(1, 3, 224, 224) -> (270, 268)     True    True    bicubic    num_threads=2   1.0X  2.0ms vs 2.0ms
(1, 3, 224, 224) -> (270, 268)     True    False   bicubic    num_threads=2   1.7X  3.2ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268)     False   True    bicubic    num_threads=2   0.8X  1.5ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268)     False   False   bicubic    num_threads=2   1.2X  2.3ms vs 1.9ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=1   1.1X  34.7ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=1   1.0X  31.2ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=1   1.0X  23.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=1   1.9X  42.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=1   0.9X  33.9ms vs 37.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=1   2.2X  84.0ms vs 37.5ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=1   1.0X  28.4ms vs 28.8ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=1   2.0X  56.7ms vs 28.8ms

(1, 3, 256, 256) -> (1024, 1024)   True    True    bilinear   num_threads=2   1.1X  17.5ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bilinear   num_threads=2   1.1X  17.7ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bilinear   num_threads=2   0.8X  8.8ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bilinear   num_threads=2   1.0X  11.1ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024)   True    True    bicubic    num_threads=2   1.1X  19.9ms vs 18.8ms
(1, 3, 256, 256) -> (1024, 1024)   True    False   bicubic    num_threads=2   2.3X  42.5ms vs 18.7ms
(1, 3, 256, 256) -> (1024, 1024)   False   True    bicubic    num_threads=2   1.0X  14.1ms vs 14.5ms
(1, 3, 256, 256) -> (1024, 1024)   False   False   bicubic    num_threads=2   2.0X  28.4ms vs 14.5ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=1   1.0X  0.6ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=1   0.7X  0.3ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=1   0.9X  0.5ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=1   1.7X  0.6ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=1   1.0X  0.8ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=1   1.1X  0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=1   0.9X  0.7ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=1   0.9X  0.4ms vs 0.4ms

(1, 3, 224, 224) -> (64, 64)       True    True    bilinear   num_threads=2   1.0X  0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64)       True    False   bilinear   num_threads=2   0.8X  0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bilinear   num_threads=2   0.9X  0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   False   bilinear   num_threads=2   1.3X  0.3ms vs 0.2ms
(1, 3, 224, 224) -> (64, 64)       True    True    bicubic    num_threads=2   1.0X  0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)       True    False   bicubic    num_threads=2   1.3X  0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64)       False   True    bicubic    num_threads=2   0.9X  0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64)       False   False   bicubic    num_threads=2   1.2X  0.3ms vs 0.3ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=1   0.8X  2.1ms vs 2.5ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=1   0.7X  1.6ms vs 2.4ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=1   1.2X  2.4ms vs 2.1ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=1   1.3X  2.6ms vs 2.0ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=1   1.1X  3.4ms vs 3.0ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=1   1.7X  4.8ms vs 2.8ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=1   1.1X  2.9ms vs 2.7ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=1   1.4X  3.5ms vs 2.4ms

(1, 3, 270, 268) -> (224, 224)     True    True    bilinear   num_threads=2   0.9X  1.2ms vs 1.3ms
(1, 3, 270, 268) -> (224, 224)     True    False   bilinear   num_threads=2   1.3X  1.6ms vs 1.2ms
(1, 3, 270, 268) -> (224, 224)     False   True    bilinear   num_threads=2   0.8X  0.9ms vs 1.1ms
(1, 3, 270, 268) -> (224, 224)     False   False   bilinear   num_threads=2   1.3X  1.3ms vs 1.0ms
(1, 3, 270, 268) -> (224, 224)     True    True    bicubic    num_threads=2   1.4X  2.2ms vs 1.6ms
(1, 3, 270, 268) -> (224, 224)     True    False   bicubic    num_threads=2   1.9X  2.8ms vs 1.5ms
(1, 3, 270, 268) -> (224, 224)     False   True    bicubic    num_threads=2   0.8X  1.1ms vs 1.4ms
(1, 3, 270, 268) -> (224, 224)     False   False   bicubic    num_threads=2   1.7X  2.1ms vs 1.3ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=1   1.0X  10.0ms vs 9.9ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=1   0.7X  4.6ms vs 6.2ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=1   0.9X  9.1ms vs 9.8ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=1   1.7X  9.4ms vs 5.7ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=1   1.0X  15.2ms vs 14.8ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=1   1.0X  7.6ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=1   0.9X  13.3ms vs 14.4ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=1   0.8X  5.9ms vs 7.0ms

(1, 3, 1024, 1024) -> (256, 256)   True    True    bilinear   num_threads=2   1.2X  6.0ms vs 5.2ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bilinear   num_threads=2   0.7X  2.3ms vs 3.2ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bilinear   num_threads=2   1.0X  4.8ms vs 5.0ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bilinear   num_threads=2   0.7X  1.9ms vs 2.9ms
(1, 3, 1024, 1024) -> (256, 256)   True    True    bicubic    num_threads=2   1.6X  12.3ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256)   True    False   bicubic    num_threads=2   1.0X  3.9ms vs 3.9ms
(1, 3, 1024, 1024) -> (256, 256)   False   True    bicubic    num_threads=2   1.0X  7.0ms vs 7.3ms
(1, 3, 1024, 1024) -> (256, 256)   False   False   bicubic    num_threads=2   0.9X  3.0ms vs 3.5ms

```

</details>

Benchmark code. The first script runs the `interpolate` micro-benchmarks with `operator_benchmark`; the second parses two such benchmark outputs (files named `main` and `new` by default) and prints the comparison tables shown above.
<details>

```py
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for interpolate operator."""

class InterpolateBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, input_size, output_size, channels_last=False, mode='linear', antialias=False, dtype=torch.float):

        input_image = torch.randint(0, 256, size=input_size, dtype=torch.uint8, device='cpu')

        if channels_last:
            input_image = input_image.contiguous(memory_format=torch.channels_last)

        self.inputs = {
            "input_image": input_image,
            "output_size": output_size,
            "mode": mode,
            "antialias": antialias,
            "dtype":dtype,
        }

        self.set_module_name("interpolate")

    def forward(self, input_image, output_size, mode, antialias, dtype):
        if dtype == torch.float:
            # "float" baseline: convert to float before interpolating...
            input_image = input_image.float()

        out = torch.nn.functional.interpolate(input_image, size=output_size, mode=mode, align_corners=False, antialias=antialias)
        if dtype == torch.float:
            # ...then round/clamp back to uint8, as Resize() currently does.
            out = out.round().clamp(min=0, max=255).to(torch.uint8)
        return out

def make_config():
    sizes = (
        ((224, 224), (64, 64)),
        ((270, 268), (224, 224)),
        ((256, 256), (1024, 1024)),
    )

    attrs = []
    for (HW1, HW2) in sizes:
        attrs.append([(1, 3, *HW1), HW2])  # 3 channels
        # attrs.append([(1, 1, *HW1), HW2])  # 1 channel

        attrs.append([(1, 3, *HW2), HW1])  # 3 channels
        # attrs.append([(1, 1, *HW2), HW1])  # 1 channel

    config = op_bench.config_list(
        attr_names=["input_size", "output_size"],
        attrs=attrs,
        cross_product_configs={
            'channels_last': [True, False],
            'mode': ["bilinear", "bicubic"],
            'antialias': [True, False],
            # 'dtype': [torch.float, torch.uint8]
            # 'dtype': [torch.uint8]
            'dtype': [torch.float]
        },
        tags=["short"],
    )

    return config

config = make_config()
op_bench.generate_pt_test(config, InterpolateBenchmark)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()

```

```py
import re
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("f1", nargs="?", default="main")
parser.add_argument("f2", nargs="?", default="new")
args = parser.parse_args()

with open(args.f1) as f:
    main = f.readlines()
with open(args.f2) as f:
    new = f.readlines()

out = []

for main_line, new_line in zip(main, new):
    # num_threads=1  # TODO: remove
    if main_line.startswith("num_threads="):
        num_threads = int(main_line.split("=")[-1])
    if main_line.startswith("# Input"):
        deets = f"{main_line.strip()}, {num_threads=}"
    if main_line.startswith("Forward"):
        main_time = float(main_line.split()[-1])
        new_time = float(new_line.split()[-1])
        ratio = main_time / new_time
        fmt = ".1f" if ratio < 3 else ".0f"
        improv = f"{ratio:{fmt}}X"
        time_fmt = ",.3f" if new_time < 100 else ",.1f"
        deets = deets.strip().replace("# Input: ", "")
        deets = deets.replace(": ", "=")
        deets = deets.replace("input_size=", "")
        deets = deets.replace(", output_size=", " -> ")
        deets = deets.replace("dtype=torch.", "")
        deets = deets.replace("mode=", "")
        deets = deets.replace("antialias=", "")
        deets = deets.replace("channels_last=", "")
        # deets = deets.replace("channels_last=True, ", "")
        split = deets.split(",")

        # size = ','.join(split[:-3])
        # mode, dtype, threads = split[-3:]
        # deets = f"{size:<30} {mode:<15} {dtype:<10} {threads:<15}"

        size = ','.join(split[:-5])
        channels_last, mode, antialias, dtype, threads = split[-5:]
        deets = f"{size:<33} {channels_last:<7} {antialias:<7} {mode:<10} {threads:<15}"

        l = f"{deets}  {improv:<5} {main_time / 1000:{time_fmt}}ms vs {new_time / 1000:{time_fmt}}ms"
        out.append(l)

def key(s):
    # s = ''.join(s.split()[1:]) # remove "N.nX" part
    num_threads = (int(re.findall(r"num_threads=(\d+)", s)[0]),)

    input_shape, output_shape = re.findall(r"\(.*?\)", s)
    input_shape = input_shape[1:-1]  # remove parenthesis
    input_HW = tuple(int(x) for x in input_shape.split(",")[-2:])
    input_C = (-int(input_shape.split(",")[1]),)

    output_HW = tuple(int(x) for x in output_shape[1:-1].split(","))
    is_downsample = (output_HW[0] < input_HW[0],)
    if "linear" in s:
        mode = "linear"
    elif "nearest-exact" in s:
        mode = "nearest-exact"
    else:
        # assert "nearest" in s
        mode = "nearest"
    mode = (mode,)
    return is_downsample + input_HW + output_HW + num_threads + input_C + mode

for i, l in enumerate(sorted(out, key=key)):
    if i % 8 == 0:
        print()
    # if i % 10 == 0 and i % 40 != 0:
    #     print()
    # if i % 40 == 0:
    #     print("-" * 100)
    print(l)

```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90771
Approved by: https://github.com/peterbell10, https://github.com/ngimel
Commit 544c04f2df (parent 782e4f5c02), authored by Nicolas Hug on 2023-02-10 01:43:49 +00:00 and committed by PyTorch MergeBot.
5 changed files with 1327 additions and 141 deletions.

NOTICE (38 lines changed)

@@ -416,3 +416,41 @@ derivation and reference the following license:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=======================================================================
PILLOW-SIMD Software License
=======================================================================
Code derived from implementations in PILLOW-SIMD should mention its derivation
and reference the following license:
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2022 by Alex Clark and contributors
Like PIL, Pillow is licensed under the open source HPND License:
By obtaining, using, and/or copying this software and/or its associated
documentation, you agree that you have read, understood, and will comply
with the following terms and conditions:
Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appears in all copies, and that
both that copyright notice and this permission notice appear in supporting
documentation, and that the name of Secret Labs AB or the author not be
used in advertising or publicity pertaining to distribution of the software
without specific, written prior permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.


@@ -8,6 +8,7 @@
#include <ATen/native/UpSample.h>
#include <ATen/native/cpu/utils.h>
#include <c10/util/irange.h>
#include <ATen/native/cpu/UpSampleKernelAVXAntialias.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -22,12 +23,53 @@ namespace {
using scale_t = std::vector<c10::optional<double>>;
// TODO: this file could benefit from a global renaming of its functions /
// classes and terms, as well as from adding more comments. In particular:
// - It's not obvious that despite their names (and the file name), all these
// kernels don't just do upsampling: they do general interpolation, i.e. they
// also all support downscaling.
// - the term "horizontal" or "within dims" or "contiguous dim" refers to the
// last dimension.
// It's not specific to 2D images and applies to 3D (and 1D??) inputs as well.
// Similarly "vertical" or "across dims" refers to all dims that aren't the
// last one. In other kernels these are also referred to as "zero-stride" and
// "non-zero-stride" - we should unify all this.
// - the terms "zero-stride" and "non-zero strides" refer to the weights and
// indices, not to the contiguity of input or output
// - It's not always clear which kernel is vectorized and which one isn't.
// - The functions like _use_vectorized_kernel_cond() should be renamed and
// their description updated, because they're not the only "fork" in the
// code-path where a choice is made between a vectorized kernel vs a
// non-vectorized one. See e.g. upsample_bilinear2d_kernel_impl() where we
// already make a similar check, before the one in
// _use_vectorized_kernel_cond().
// - It's not always clear which code is part of a "separable interpolation"
// code-path.
// - Some names need to be more specific. For example
// "cpu_upsample_generic_aa()" looks like a super generic name, but the function
// is instead fairly specific - we need to make that clearer.
// - Some functions have a "aa" suffix but it doesn't mean that they only
// support antialias. Some of them also support antialias=False now.
// - Various comments are outdated. Case in point: the one just below about the
// `Interpolate` struct being used for cpu_upsample_linear:
// cpu_upsample_linear doesn't exist anymore, and these structs are used for
// various modes, *not* just linear.
// - It'd be useful to document how interpolation works in general, and in particular state explicitly:
// - that the weights and indices across a given dimension are the same for
// all pixels (hence the benefit of pre-computing them)
// - that it can be "separated", i.e. we can do the horizontal pass and the
// vertical pass independently (and that some kernels are written this way,
// while some aren't.)
// - we can probably remove the template over index_t, because it's always
// hard-coded as int64_t
// Helper structs and methods for cpu_upsample_linear
//
// Interpolation methods that used below are separable, and as such we can compute the interpolation
// independently per dimension in a recursive way. Please, refer to #10482 for more context.
//
// Linear Interpolation structure to compute output value in n-dimensional case.
// Interpolation structure to compute output value in n-dimensional case.
// - recursively compute interpolated output for each dimension
// - we rely a lot on compiler's code optimization such that implemented operations
// can be automatically factorized and vectorized using SSE and AVX2
@@ -255,48 +297,129 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) {
}
}
template <typename scalar_t, typename index_t>
static inline void basic_loop_aa_single_dim_zero_strides(
template <typename scalar_t>
static inline void basic_loop_aa_vertical(
char** data,
const int64_t* strides,
int64_t n) {
int64_t n,
unsigned int weights_precision) {
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const index_t ids_stride = *(index_t*)&data[2 + 2][0];
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
for (const auto i : c10::irange(n)) {
*(scalar_t*)&dst[i * strides[0]] =
interpolate_aa_single_dim_zero_strides<scalar_t, index_t>(
interpolate_aa_single_dim_zero_strides<scalar_t, int64_t>(
src + i * strides[1], &data[2], ids_stride);
}
}
template <typename scalar_t, typename index_t>
static inline void basic_loop_aa_single_dim_nonzero_strides(
template <>
inline void basic_loop_aa_vertical<uint8_t>(
char** data,
const int64_t* strides,
int64_t n) {
int64_t n,
unsigned int weights_precision) {
// See Note [ Weights computation for uint8_t and multiplication trick ]
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
const int64_t ids_size = *(int64_t*)&data[2 + 1][0];
const int64_t ids_min = *(int64_t*)&data[2 + 0][0];
int64_t i = 0;
for (; i<n; i++) {
char* src_min = src + i * strides[1] + ids_min;
uint8_t t = *(uint8_t*)&src_min[0];
int64_t wts_idx = *(int64_t*)&data[2 + 4][0];
int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
int16_t wts = wts_ptr[0];
// Intermediate computations are using integer type
int output = 1 << (weights_precision - 1); // accounts for the +0.5 part
output += t * wts;
for (const auto j : c10::irange(1, ids_size)) {
wts = wts_ptr[j];
t = *(uint8_t*)&src_min[j * ids_stride];
output += t * wts;
}
*(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
}
}
template <typename scalar_t>
static inline void basic_loop_aa_horizontal(
char** data,
const int64_t* strides,
int64_t n,
unsigned int weights_precision) {
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const index_t ids_stride = *(index_t*)&data[2 + 2][0];
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
if (strides[1] == 0) {
for (const auto i : c10::irange(n)) {
*(scalar_t*)&dst[i * strides[0]] =
interpolate_aa_single_dim<scalar_t, index_t>(
interpolate_aa_single_dim<scalar_t, int64_t>(
src, &data[2], &strides[2], i, ids_stride);
}
} else {
for (const auto i : c10::irange(n)) {
*(scalar_t*)&dst[i * strides[0]] =
interpolate_aa_single_dim<scalar_t, index_t>(
interpolate_aa_single_dim<scalar_t, int64_t>(
src + i * strides[1], &data[2], &strides[2], i, ids_stride);
}
}
}
template <>
inline void basic_loop_aa_horizontal<uint8_t>(
char** data,
const int64_t* strides,
int64_t n,
unsigned int weights_precision) {
// See Note [ Weights computation for uint8_t and multiplication trick ]
char* dst = data[0];
char* src = data[1];
// index stride is constant for the given dimension
const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
int64_t i = 0;
// Here we are implementing data interpolation within the same line (vs between the lines)
// output[x, y] = input[xmin[x], y] * W[x] + input[xmin[x] + 1, y] * W[x + 1] + ... + input[xmin[x] + xsize, y] * W[x + xsize]
for (; i<n; i++) {
int64_t ids_min = *(int64_t*)&data[2 + 0][i * strides[2 + 0]];
int64_t ids_size = *(int64_t*)&data[2 + 1][i * strides[2 + 1]];
char* src_min = src + i * strides[1] + ids_min;
uint8_t t = *(uint8_t*)&src_min[0];
int64_t wts_idx = *(int64_t*)&data[2 + 4][i * strides[2 + 4]];
int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
int16_t wts = wts_ptr[0];
// Intermediate computations are using integer type
int output = 1 << (weights_precision - 1); // accounts for the +0.5 part
output += t * wts;
for (const auto j : c10::irange(1, ids_size)) {
wts = wts_ptr[j];
t = *(uint8_t*)&src_min[j * ids_stride];
output += t * wts;
}
*(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
}
}
// Generic upsampling computation method using TensorIterator for Nd case.
// Supports: nearest, linear, cubic modes with interp_size template argument: 1, 2, 4
//
@@ -621,21 +744,23 @@ struct HelperInterpBase {
template <typename scalar_t, typename aa_filter_fn_t>
static inline void _compute_weights_aa(
const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support,
scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn,
int64_t& xmin, int64_t& xsize
scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn,
int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta
) {
scalar_t center = scale * (i + 0.5);
// align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false
// is 0.0 otherwise
scalar_t center = scale * (i + 0.5 - align_corners_delta);
scalar_t total_w = 0.0;
scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0;
xmin = std::max(
static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0));
xsize = std::min(static_cast<int64_t>(center + support + 0.5), input_size) -
xmin;
static_cast<int64_t>(center - support + 0.5 + align_corners_delta), static_cast<int64_t>(0));
xsize = std::min(
static_cast<int64_t>(center + support + 0.5 + align_corners_delta), input_size) - xmin;
int64_t j = 0;
for (; j < xsize; j++) {
scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale);
scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale);
wt_ptr[j] = w;
total_w += w;
}
@@ -644,23 +769,39 @@ struct HelperInterpBase {
wt_ptr[j] /= total_w;
}
}
for (; j < interp_size; j++) {
for (; j < max_interp_size; j++) {
wt_ptr[j] = static_cast<scalar_t>(0.0);
}
}
template <typename scalar_t, typename aa_filter_fn_t>
static inline std::vector<Tensor> _compute_indices_weights_aa(
// Note [ Support for antialias=False as a subcase of antialias=True ]
// This function was originally written with the hard assumption that
// antialias=True (hence the aa in the name). It was later extended to support
// antialias=False. The only difference between aa and no-aa is in how the
// weights and indices are computed (and their number). In aa their number is
// variable but with no-aa, they're fixed to interp_size. The same "filters"
// can be used otherwise. HOWEVER, support for antialias=False here may not be
// optimally optimized: the code assumes an arbitrary number of weights and
// indices, but this can be optimized further when aa=False since we know
// their actual dimensions.
template <typename scalar_t, typename aa_filter_fn_t, int weight_index_stride=sizeof(scalar_t)>
static inline std::tuple<std::vector<Tensor>, int> _compute_indices_weights_aa(
int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
int64_t reshape_dim, scalar_t scale,
int interp_size, aa_filter_fn_t aa_filter_fn
int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta
) {
std::vector<Tensor> output;
scalar_t support =
(scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
interp_size = (int)ceilf(support) * 2 + 1;
scalar_t support;
int max_interp_size;
if (antialias) {
support = (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
max_interp_size = (int) std::ceil(support) * 2 + 1;
} else {
support = interp_size * 0.5;
max_interp_size = interp_size;
}
auto new_shape = std::vector<int64_t>(ndims, 1);
new_shape[reshape_dim] = output_size;
@@ -675,7 +816,7 @@ struct HelperInterpBase {
{
// Weights
new_shape[reshape_dim] = output_size * interp_size;
new_shape[reshape_dim] = output_size * max_interp_size;
auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType<scalar_t>()));
auto strides = wts.strides().vec();
strides[reshape_dim] = 0;
@@ -701,20 +842,130 @@ struct HelperInterpBase {
input_size,
scale,
support,
wt_ptr + i * interp_size,
interp_size,
wt_ptr + i * max_interp_size,
max_interp_size,
aa_filter_fn,
xmin,
xmax);
xmax,
antialias,
align_corners_delta);
idx_ptr_xmin[i] = xmin * stride;
idx_ptr_size[i] = xmax;
idx_ptr_stride[i] = stride;
wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t);
wt_idx_ptr[i] = i * max_interp_size * weight_index_stride;
}
return output;
return {output, max_interp_size};
}
/*
NOTE [ Weights computation for uint8_t and multiplication trick ]
When the input/output dtype is uint8_t, we still compute the interpolation
weights as double, but then convert them to int16 via some conversion logic
detailed below. This allows us to compute all interpolation operation (sum of
multiplications) as ints instead of floats. The result is converted back into
uint8 in basic_loop_aa_horizontal<uint8_t> (and vertical)
In essence the idea is to avoid a multiplication between a float (the
weight) and an int (the pixel value) and instead run a multiplication between
2 ints:
```py
COEF_PREC = 16
def mul(a:float, b:int) -> Tuple[float, int]:
# return a * b, round(a * b)
actual = a * b
assert a > 0 # I'm lazy
int_a = floor(0.5 + a * (1 << COEF_PREC))
with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC
return actual, with_trick # round(actual) == with_trick!!
```
Here's how it works:
N == COEFF_PREC
1 << N == 2**N
floor(0.5 + x) == round(x)
So the operation is something like
int_a = round(a * 2**N) -- let's just say it's `a * 2**N` for simplicity
res = ((int_a * b) + (1 << (N - 1))) >> N
= ((a * 2**N * b + 2**(N - 1)) / 2**N
= a * b + 0.5
= round(a * b)
= what we wanted
*/
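
The trick in the note above can be checked with a small standalone Python sketch (not part of the diff; `COEF_PREC = 16` and the helper mirror the note's pseudocode, and exact .5 ties aside, the fixed-point result matches `round(a * b)`):

```py
import math

COEF_PREC = 16  # example precision, as in the note's pseudocode

def mul_with_trick(a: float, b: int):
    # Float multiplication we want to emulate.
    actual = a * b
    # Convert the weight once to fixed point: round(a * 2**COEF_PREC).
    int_a = math.floor(0.5 + a * (1 << COEF_PREC))
    # Integer-only multiply, add 2**(COEF_PREC - 1) for the "+0.5", shift back.
    with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC
    return actual, with_trick

for a, b in [(0.3, 200), (0.7, 17), (0.125, 255)]:
    actual, approx = mul_with_trick(a, b)
    assert approx == round(actual)
    print(f"{a} * {b} = {actual} ~ {approx}")
```

In the kernel itself the precision is not fixed at 16: `_compute_indices_int16_weights_aa` below picks the largest precision such that the rescaled weights still fit in int16.
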
template <typename aa_filter_fn_t>
static inline std::tuple<std::vector<Tensor>, int, unsigned int> _compute_indices_int16_weights_aa(
int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale,
int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false
) {
double scale = area_pixel_compute_scale<double>(
input_size, output_size, align_corners, opt_scale);
std::vector<Tensor> indices_weights;
auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0;
std::tie(indices_weights, interp_size) = HelperInterpBase::_compute_indices_weights_aa<double, aa_filter_fn_t, sizeof(int16_t)>(
input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta);
// Rescale float weights to int16 and compute weights precision
auto weights_f64 = indices_weights[3];
double * data_f64 = weights_f64.data_ptr<double>();
int64_t weights_f64_size = output_size * interp_size;
// can't use weights_f64.max() here as tensor is restrided
double w_max = data_f64[0];
for (const auto i : c10::irange(weights_f64_size)) {
double v = data_f64[i];
if (w_max < v) {
w_max = v;
}
}
unsigned int weights_precision = 0;
for (weights_precision = 0; weights_precision < 22; weights_precision += 1) {
int next_value = (int) (0.5 + w_max * (1 << (weights_precision + 1)));
if (next_value >= (1 << 15))
break;
}
// Rescale float values to int16
int16_t * data_i16 = (int16_t *) data_f64;
auto aligned_interp_size = interp_size;
if (align_i32) {
// We should respect int32 alignment as
// we will load data as int32 with AVX2
// See ImagingResampleHorizontalConvolution8u4x, mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
// compute aligned_interp_size = nearest pair value to interp_size
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}
for (const auto j : c10::irange(output_size)) {
for (const auto k : c10::irange(interp_size)) {
double v = data_f64[j * interp_size + k];
if (v < 0) {
data_i16[j * aligned_interp_size + k] = (int) (-0.5 + v * (1 << weights_precision));
} else {
data_i16[j * aligned_interp_size + k] = (int) (0.5 + v * (1 << weights_precision));
}
}
}
return {indices_weights, aligned_interp_size, weights_precision};
}
};
struct HelperInterpNearest : public HelperInterpBase {
@@ -923,8 +1174,9 @@ struct HelperInterpLinear : public HelperInterpBase {
input_size, output_size, align_corners, opt_scale);
auto interp_size = HelperInterpLinear::interp_size;
int unused;
indices_weights = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
std::tie(indices_weights, unused) = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
input_size,
output_size,
stride,
@@ -932,11 +1184,32 @@ struct HelperInterpLinear : public HelperInterpBase {
reshape_dim,
scale,
interp_size,
&HelperInterpLinear::aa_filter<scalar_t>);
&HelperInterpLinear::aa_filter<scalar_t>,
/*antialias=*/true,
/*align_corners_delta=*/0.0);
}
);
return indices_weights;
}
static inline std::tuple<std::vector<Tensor>, int, unsigned int> compute_indices_int16_weights_aa(
int64_t input_size,
int64_t output_size,
int64_t stride,
int64_t ndims,
int64_t reshape_dim,
bool align_corners,
const c10::optional<double> opt_scale,
bool antialias,
bool align_i32=false
) {
auto interp_size = HelperInterpLinear::interp_size;
auto fn = HelperInterpLinear::aa_filter<double>;
return HelperInterpLinear::_compute_indices_int16_weights_aa(
input_size, output_size, stride, ndims, reshape_dim,
align_corners, opt_scale, interp_size, fn, antialias, align_i32);
}
};
struct HelperInterpCubic : public HelperInterpBase {
@@ -1033,8 +1306,9 @@ struct HelperInterpCubic : public HelperInterpBase {
input_size, output_size, align_corners, opt_scale);
auto interp_size = HelperInterpCubic::interp_size;
int unused;
indices_weights = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
std::tie(indices_weights, unused) = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
input_size,
output_size,
stride,
@@ -1042,11 +1316,14 @@ struct HelperInterpCubic : public HelperInterpBase {
reshape_dim,
scale,
interp_size,
&HelperInterpCubic::aa_filter<scalar_t>);
&HelperInterpCubic::aa_filter<scalar_t>,
/*antialias=*/true,
/*align_corners_delta*/0.0);
}
);
return indices_weights;
}
};
// Generic upsampling interpolation kernel for N-d case.
@@ -1133,31 +1410,50 @@ void upsample_generic_Nd_kernel_impl(
}
}
template <typename scalar_t>
void cpu_upsample_generic_aa(at::TensorIterator& iter) {
template <typename scalar_t, bool is_horizontal>
void cpu_upsample_generic_aa(at::TensorIterator& iter, unsigned int weights_precision) {
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
is_zero_stride<3 + 2>(&strides[2])) {
basic_loop_aa_single_dim_zero_strides<scalar_t, int64_t>(
data, strides, n);
if (is_horizontal) {
// Strides are : X 0 | 8 8 8 0 8 (Channels first)
// Strides are : X X | 0 0 0 0 0 (Channels last)
// upsampling data within a contiguous dimension (aka horizontal resampling)
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
is_zero_stride<3 + 2>(&strides[2])) {
// channels last case
basic_loop_aa_horizontal<scalar_t>(
data, strides, n, weights_precision);
} else {
basic_loop_aa_horizontal<scalar_t>(
data, strides, n, weights_precision);
}
} else {
basic_loop_aa_single_dim_nonzero_strides<scalar_t, int64_t>(
data, strides, n);
// Strides are : X Y | 0 0 0 0 0 (Channels first)
// Strides are : X X | 0 0 0 0 0 (Channels last)
// upsampling data between contiguous dimensions (aka vertical resampling)
if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
is_zero_stride<3 + 2>(&strides[2])) {
basic_loop_aa_vertical<scalar_t>(
data, strides, n, weights_precision);
} else {
basic_loop_aa_vertical<scalar_t>(
data, strides, n, weights_precision);
}
}
};
iter.for_each(loop);
}
// Generic separable upsampling interpolation kernels for N-d case with anti-aliasing
template <int out_ndims, typename scale_type, class F>
template <int out_ndims, typename scale_type, class F, bool is_horizontal>
void _separable_upsample_generic_Nd_kernel_impl_single_dim(
const Tensor& output,
const Tensor& input,
int interp_dim,
bool align_corners,
const scale_type& scales) {
const scale_type& scales,
bool antialias) {
// input can be NCHW, NCL or NCKHW
auto shape = input.sizes().vec();
@@ -1174,21 +1470,29 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
strides[interp_dim] = 0;
auto restrided_input = input.as_strided(shape, strides);
std::vector<std::vector<Tensor>> indices_weights;
int interp_size = F::interp_size;
auto input_scalar_type = input.scalar_type();
if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) {
// nearest also supports uint8 tensor, but we have to use float
// with compute_indices_weights
input_scalar_type = at::ScalarType::Float;
}
indices_weights.emplace_back(
std::vector<Tensor> indices_weights;
unsigned int weights_precision = 0;
int unused;
if (input_scalar_type == at::kByte) {
std::tie(indices_weights, unused, weights_precision) =
// TODO: change that to F:: once / if bicubic mode supports uint8 after all
HelperInterpLinear::compute_indices_int16_weights_aa(
input.size(interp_dim), oshape[interp_dim],
input.stride(interp_dim) * input.element_size(),
input.dim(), interp_dim, align_corners, scales[interp_dim - 2],
antialias);
TORCH_INTERNAL_ASSERT(weights_precision > 0);
} else {
TORCH_INTERNAL_ASSERT(antialias);
indices_weights =
F::compute_indices_weights_aa(
input_scalar_type, input.size(interp_dim), oshape[interp_dim],
input.stride(interp_dim) * input.element_size(),
input.dim(), interp_dim, align_corners, scales[interp_dim - 2]));
input.dim(), interp_dim, align_corners, scales[interp_dim - 2]);
}
TensorIteratorConfig config;
config.check_all_same_dtype(false)
@@ -1196,51 +1500,95 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
.add_output(output)
.add_input(restrided_input);
for (auto& idx_weight : indices_weights) {
for (auto& tensor : idx_weight) {
config.add_input(tensor);
}
for (auto& tensor : indices_weights) {
config.add_input(tensor);
}
auto iter = config.build();
if (interp_size > 1) {
// Nearest also supports uint8 tensor, so need to handle it separately
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] {
cpu_upsample_generic_aa<scalar_t>(iter);
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND(
at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
cpu_upsample_generic_aa<scalar_t>(iter);
});
}
AT_DISPATCH_FLOATING_TYPES_AND(
at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
cpu_upsample_generic_aa<scalar_t, is_horizontal>(iter, weights_precision);
});
}
// Generic separable upsampling interpolation kernel for N-d case with anti-aliasing.
// It also supports antialias=False iff
// (dtype == uint8 and mode in ("bilinear", "bicubic")): this is used as
// fallback in these settings when AVX isn't supported.
template <int out_ndims, typename scale_type, class F>
void separable_upsample_generic_Nd_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
const scale_type& scales) {
const scale_type& scales,
bool antialias) {
auto output_shape = output.sizes();
auto input_shape = input.sizes();
auto temp_oshape = input_shape.vec();
if (output_shape == input_shape) {
output.copy_(input);
return;
}
auto temp_oshape = input.sizes().vec();
at::Tensor temp_output, temp_input = input;
for (const auto i : c10::irange(out_ndims - 1)) {
int interp_dim = 2 + out_ndims - 1 - i;
temp_oshape[interp_dim] = output.sizes()[interp_dim];
temp_output = at::empty(temp_oshape, input.options().memory_format(input.suggest_memory_format()));
int interp_dim = 0;
// Precompute the number of single dim resize method invocations
// to avoid copying temporary buffer to output
int num_single_dim_ops = 0;
for (const auto i : c10::irange(out_ndims)) {
interp_dim = 2 + out_ndims - 1 - i;
if (output_shape[interp_dim] != input_shape[interp_dim]) {
num_single_dim_ops += 1;
}
}
// upsampling data within the contiguous dimension (aka horizontal resampling)
interp_dim = 2 + out_ndims - 1;
if (output_shape[interp_dim] != input_shape[interp_dim]) {
num_single_dim_ops -= 1;
if (num_single_dim_ops > 0) {
temp_oshape[interp_dim] = output_shape[interp_dim];
temp_output = at::empty(temp_oshape, input.options());
} else {
temp_output = output;
}
_separable_upsample_generic_Nd_kernel_impl_single_dim<
out_ndims,
scale_t,
F>(
temp_output, temp_input, interp_dim, align_corners, scales);
F,
true>(
temp_output, temp_input, interp_dim, align_corners, scales, antialias);
temp_input = temp_output;
}
_separable_upsample_generic_Nd_kernel_impl_single_dim<
out_ndims,
scale_t,
F>(output, temp_input, 2, align_corners, scales);
// upsampling data between contiguous dimensions (aka vertical resampling)
for (const auto i : c10::irange(1, out_ndims)) {
interp_dim = 2 + out_ndims - 1 - i;
if (output_shape[interp_dim] != input_shape[interp_dim]) {
num_single_dim_ops -= 1;
if (num_single_dim_ops > 0) {
temp_oshape[interp_dim] = output_shape[interp_dim];
temp_output = at::empty(temp_oshape, input.options());
} else {
temp_output = output;
}
_separable_upsample_generic_Nd_kernel_impl_single_dim<
out_ndims,
scale_t,
F,
false>(
temp_output, temp_input, interp_dim, align_corners, scales, antialias);
temp_input = temp_output;
}
}
}
void upsample_nearest1d_kernel_impl(
@@ -1356,7 +1704,8 @@ void upsample_linear1d_kernel_impl(
output, input, align_corners, {scales_w});
}
void upsample_bilinear2d_kernel_impl(
void upsample_bilinear2d_kernel_impl_float(
const Tensor& output,
const Tensor& input,
bool align_corners,
@@ -1378,15 +1727,56 @@ void upsample_bilinear2d_kernel_impl(
}
}
void upsample_bilinear2d_aa_kernel_impl(
void upsample_bilinear2d_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {
if (input.dtype() == at::kByte){
#ifdef CPU_CAPABILITY_AVX2
if (input.size(1) <= 4) {
upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(input,
output, align_corners, {scales_h, scales_w},
/*antialias=*/false);
} else {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/false);
}
#else // CPU_CAPABILITY_AVX2
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/false);
#endif // CPU_CAPABILITY_AVX2
} else {
upsample_bilinear2d_kernel_impl_float(output, input, align_corners, scales_h, scales_w);
}
}
void upsample_bilinear2d_aa_kernel_impl(
const Tensor& output,
const Tensor& input,
bool align_corners,
c10::optional<double> scales_h,
c10::optional<double> scales_w) {
#ifdef CPU_CAPABILITY_AVX2
if (input.dtype() == at::kByte && input.size(1) <= 4) {
upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(
input, output, align_corners, {scales_h, scales_w},
/*antialias=*/true);
} else {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
}
#else // CPU_CAPABILITY_AVX2
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
output, input, align_corners, {scales_h, scales_w});
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
#endif // CPU_CAPABILITY_AVX2
}
void upsample_trilinear3d_kernel_impl(
@@ -1424,7 +1814,8 @@ void upsample_bicubic2d_aa_kernel_impl(
c10::optional<double> scales_w) {
separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>(
output, input, align_corners, {scales_h, scales_w});
output, input, align_corners, {scales_h, scales_w},
/*antialias=*/true);
}
template <
@@ -1500,7 +1891,9 @@ void cpu_upsample_genNd_backward_aa(
interp_height,
filter_fn,
ymin,
ysize);
ysize,
/*antialias=*/true,
/*align_corners_delta=*/0.0);
for (const auto ow : c10::irange(output_width)) {
F::_compute_weights_aa(
@@ -1512,7 +1905,9 @@ void cpu_upsample_genNd_backward_aa(
interp_width,
filter_fn,
xmin,
xsize);
xsize,
/*antialias=*/true,
/*align_corners_delta=*/0.0);
for (const auto c : c10::irange(begin, end)) {
scalar_t grad_output_value =


@@ -0,0 +1,719 @@
/*
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2022 by Alex Clark and contributors
Like PIL, Pillow is licensed under the open source HPND License
*/
// This code is heavily inspired from PILLOW-SIMD's implementation:
// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c
#pragma once
#ifdef CPU_CAPABILITY_AVX2
// TODO: This file only supports AVX2. We could split the AVX kernels into
// smaller logical blocks in order to port them into the Vec.h logic. This would
// allow us to support other vectorization architectures and perhaps also support
// the non-vectorized fallback (we'd need to make sure it's not slower than the
// current fallback).
#include <ATen/core/Tensor.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/irange.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif
namespace {
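// Load one 32-bit (RGBA) pixel and zero-extend its 4 uint8 values into the 4
// int32 lanes of an __m128i.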
static __m128i inline mm_cvtepu8_epi32(const uint32_t* C10_RESTRICT ptr) {
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
}
// TODO: We may want to hard-code an unrolled version for the case where
// num_channels=3 to hint the compiler to vectorize this (look at the original
// PIL-SIMD code).
at::Tensor unpack_rgb(const at::Tensor& packed_tensor) {
// Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into
// RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded
// as 32 bits. This generalizes to num_channels <= 4 and also works for
// non-channels_last tensors.
const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr<uint8_t>();
auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
auto num_channels = packed_tensor.size(0);
constexpr int rgba_size = 4;
auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte));
uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr<uint8_t>();
auto stride_i = packed_tensor.stride(2);
auto stride_j = packed_tensor.stride(0);
for (const auto i : c10::irange(num_pixels)) {
for (const auto j : c10::irange(rgba_size)) {
unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0;
}
}
return unpacked_tensor;
}
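For reference, a rough Python equivalent of this unpacking step (illustrative only; `unpack_rgb_ref` is a made-up name, and the real kernel writes directly into the raw byte buffer):

```python
import torch

def unpack_rgb_ref(packed: torch.Tensor) -> torch.Tensor:
    # packed: (C, H, W) uint8 with C <= 4, any strides.
    C, H, W = packed.shape
    unpacked = torch.zeros(H * W, 4, dtype=torch.uint8)  # one 32-bit "pixel" per location
    unpacked[:, :C] = packed.reshape(C, -1).t()           # missing channels stay 0
    return unpacked  # the AVX kernels then treat this buffer as H*W uint32 values
```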
void pack_rgb(
const at::Tensor& unpacked_tensor, // IN
const at::Tensor& packed_tensor // OUT
) {
constexpr int rgba_size = 4;
uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr<uint8_t>();
uint8_t* packed = (uint8_t*)packed_tensor.data_ptr<uint8_t>();
auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
auto num_channels = packed_tensor.size(0);
auto packed_increment = packed_tensor.stride(2);
auto packed_stride = packed_tensor.stride(0);
for (const auto i C10_UNUSED : c10::irange(num_pixels)) {
for (const auto j : c10::irange(num_channels)) {
packed[j * packed_stride] = unpacked[j];
}
unpacked += rgba_size;
packed += packed_increment;
}
}
void ImagingResampleHorizontalConvolution8u4x(
uint32_t* C10_RESTRICT lineOut0,
uint32_t* C10_RESTRICT lineOut1,
uint32_t* C10_RESTRICT lineOut2,
uint32_t* C10_RESTRICT lineOut3,
const uint32_t* C10_RESTRICT lineIn0,
const uint32_t* C10_RESTRICT lineIn1,
const uint32_t* C10_RESTRICT lineIn2,
const uint32_t* C10_RESTRICT lineIn3,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision);
void ImagingResampleHorizontalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT lineIn,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision);
void ImagingResampleVerticalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT imIn,
int xmin,
int xmax,
int16_t* k,
int coefs_precision,
int xin);
void ImagingResampleHorizontal(
const at::Tensor & unpacked_output,
const at::Tensor & unpacked_input,
int ksize,
const std::vector<at::Tensor>& horiz_indices_weights,
unsigned int horiz_weights_precision) {
// TODO: we may want to merge this into the fallback code (currently called
// basic_loop_aa_horizontal<uint8_t>). Although this may not be needed if/when
// we port all of this code to use Vec.h, since that would potentially give us
// another fallback implementation.
int yy;
int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr<double>());
auto xout = unpacked_output.size(2);
auto yout = unpacked_output.size(1);
auto xin = unpacked_input.size(2);
std::vector<int> bounds_vec(2 * xout, 0);
int* bounds = bounds_vec.data();
int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr<int64_t>();
int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr<int64_t>();
for (int i = 0; i < xout; i++) {
bounds[2 * i + 0] = idx_ptr_xmin[i];
bounds[2 * i + 1] = idx_ptr_size[i];
}
uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
yy = 0;
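// Process output rows 4 at a time in the main loop; the tail loop below
// handles the remaining rows one by one.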
for (; yy < yout - 3; yy += 4) {
ImagingResampleHorizontalConvolution8u4x(
unpacked_output_p + yy * xout,
unpacked_output_p + (yy + 1) * xout,
unpacked_output_p + (yy + 2) * xout,
unpacked_output_p + (yy + 3) * xout,
unpacked_input_p + yy * xin,
unpacked_input_p + (yy + 1) * xin,
unpacked_input_p + (yy + 2) * xin,
unpacked_input_p + (yy + 3) * xin,
xout,
bounds,
kk,
ksize,
(int)horiz_weights_precision);
}
for (; yy < yout; yy++) {
ImagingResampleHorizontalConvolution8u(
unpacked_output_p + yy * xout,
unpacked_input_p + yy * xin,
xout,
bounds,
kk,
ksize,
(int)horiz_weights_precision);
}
}
void ImagingResampleVertical(
const at::Tensor & unpacked_output,
const at::Tensor & unpacked_input,
int ksize,
const std::vector<at::Tensor>& vert_indices_weights,
unsigned int vert_weights_precision) {
// TODO: we may want to merge this into the fallback code (currently called
// basic_loop_aa_vertical<uint8_t>). Although this may not be needed if/when
// we port all of this code to use Vec.h, since that would potentially give us
// another fallback implementation.
int ymin, ymax;
int16_t* k = nullptr;
int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr<double>());
int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr<int64_t>();
int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr<int64_t>();
uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
auto xout = unpacked_output.size(2);
auto yout = unpacked_output.size(1);
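// Each output row is a weighted combination of `ymax` input rows starting at
// `ymin`, computed column-wise by the vertical convolution kernel.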
for (const auto yy : c10::irange(yout)) {
k = &kk[yy * ksize];
ymin = idx_ptr_xmin[yy];
ymax = idx_ptr_size[yy];
ImagingResampleVerticalConvolution8u(
unpacked_output_p + yy * xout,
unpacked_input_p,
ymin,
ymax,
k,
(int)vert_weights_precision,
xout);
}
}
// This is the only public entry point in this file. It supports bilinear
// mode for uint8 dtype when C <= 4, with or without antialias. The
// implementation is based on PIL-SIMD.
// Its equivalent (fallback) implementation for when AVX isn't supported or
// when C > 4 is separable_upsample_generic_Nd_kernel_impl(). There are a bunch
// of future improvements that can be done: look for the TODOs in this file.
// For details on how the weights are computed and how the multiplications are
// run on int (instead of float) weights, see
// [ Weights computation for uint8_t and multiplication trick ]
// For details on how the AVX kernels are implemented, see
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
// See also [ Support for antialias=False as a subcase of antialias=True ] to
// learn more about how the antialias=False case is computed. The same holds
// here: all these kernels are general enough to handle an arbitrary number of
// weights, but when aa=False they could be optimized further.
template <typename scale_type, class F>
void upsample_avx_bilinear_uint8(
const at::Tensor& input,
const at::Tensor& output,
bool align_corners,
const scale_type& scales,
bool antialias) {
auto batch_size = input.size(0);
auto num_channels = input.size(1);
auto xin = input.size(3);
auto yin = input.size(2);
auto xout = output.size(3);
auto yout = output.size(2);
if (xin == xout && yin == yout) {
output.copy_(input);
return;
}
auto need_horizontal = xout != xin;
auto need_vertical = yout != yin;
int ksize_horiz, ksize_vert;
std::vector<at::Tensor> horiz_indices_weights, vert_indices_weights;
unsigned int horiz_weights_precision, vert_weights_precision;
if (need_horizontal) {
int interp_dim = 3;
std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
F::compute_indices_int16_weights_aa(
/*input_size=*/xin,
/*output_size=*/xout,
/*stride=*/1,
/*ndims=*/4,
/*reshape_dim=*/interp_dim,
/*align_corners=*/align_corners,
/*opt_scale=*/scales[interp_dim - 2],
/*antialias=*/antialias,
/*align_i32=*/true);
}
if (need_vertical) {
int interp_dim = 2;
std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
F::compute_indices_int16_weights_aa(
/*input_size=*/yin,
/*output_size=*/yout,
/*stride=*/1,
/*ndims=*/4,
/*reshape_dim=*/interp_dim,
/*align_corners=*/align_corners,
/*opt_scale=*/scales[interp_dim - 2],
/*antialias=*/antialias,
/*align_i32=*/true);
}
bool is_rgba = num_channels == 4 && input.is_contiguous(at::MemoryFormat::ChannelsLast);
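// When the input is already a 4-channel channels_last tensor, its memory
// layout already matches the unpacked RGBA format, so the unpack/pack copies
// below can be skipped and the kernels can read/write the tensors directly.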
at::Tensor buffer_horiz, buffer_vert;
if (need_horizontal && !(is_rgba && !need_vertical)) {
buffer_horiz = at::empty({4, yin, xout}, input.options());
}
if (need_vertical && !is_rgba) {
buffer_vert = at::empty({4, yout, xout}, input.options());
}
// TODO: The unpack / pack operations create a copy of the original input and
// output tensor. There should be a way to avoid these copies by instead
// modifying the low-level kernels. Or maybe at least avoid copying the entire
// tensors and just copy part of them (line by line).
for (const auto i : c10::irange(batch_size)) {
at::Tensor unpacked_input = (is_rgba) ? input[i] : unpack_rgb(input[i]);
at::Tensor unpacked_output;
if (need_horizontal) {
at::Tensor unpacked_output_temp = (is_rgba && !need_vertical) ? output[i] : buffer_horiz;
ImagingResampleHorizontal(
unpacked_output_temp,
unpacked_input,
ksize_horiz,
horiz_indices_weights,
horiz_weights_precision);
unpacked_output = unpacked_input = unpacked_output_temp;
}
if (need_vertical) {
unpacked_output = (is_rgba) ? output[i] : buffer_vert;
ImagingResampleVertical(
unpacked_output,
unpacked_input,
ksize_vert,
vert_indices_weights,
vert_weights_precision);
}
TORCH_INTERNAL_ASSERT(unpacked_output.defined());
if (!is_rgba) {
pack_rgb(unpacked_output, output[i]);
}
}
}
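To make the weights note referenced above concrete, here is a rough Python sketch of the int16 fixed-point trick (illustrative only, not the kernel code; `precision` plays the role of the weights precision returned by compute_indices_int16_weights_aa):

```python
import torch

def fixed_point_weighted_sum(pixels, weights, precision=8):
    # pixels: 1D uint8 taps; weights: 1D float weights (summing to ~1).
    int_w = torch.round(weights * (1 << precision)).to(torch.int32)  # int16-sized in the kernels
    acc = 1 << (precision - 1)  # rounding term, mirrors the `initial` vectors above
    acc += int((pixels.to(torch.int32) * int_w).sum())
    return max(0, min(255, acc >> precision))  # shift + clamp, like srai + packus

# Two bilinear taps weighted 0.25 / 0.75:
print(fixed_point_weighted_sum(torch.tensor([10, 200], dtype=torch.uint8),
                               torch.tensor([0.25, 0.75])))  # 153 (float result: 152.5)
```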
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u4x(
uint32_t* C10_RESTRICT lineOut0,
uint32_t* C10_RESTRICT lineOut1,
uint32_t* C10_RESTRICT lineOut2,
uint32_t* C10_RESTRICT lineOut3,
const uint32_t* C10_RESTRICT lineIn0,
const uint32_t* C10_RESTRICT lineIn1,
const uint32_t* C10_RESTRICT lineIn2,
const uint32_t* C10_RESTRICT lineIn3,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision) {
int xmin, xmax, x;
int16_t* k;
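// For each output pixel xx, accumulate the weighted sum of the input pixels in
// [xmin, xmin + xmax) for 4 rows at once, using 32-bit accumulators, then shift
// right by coefs_precision and saturate back down to uint8.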
for (const auto xx : c10::irange(xsize)) {
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;
__m256i sss0, sss1;
__m256i zero = _mm256_setzero_si256();
__m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
sss0 = initial;
sss1 = initial;
for (; x < xmax - 3; x += 4) {
__m256i pix, mmk0, mmk1, source;
mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
mmk1 = _mm256_set1_epi32(*(int32_t*)&k[x + 2]);
source = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn0[x + xmin])),
_mm_loadu_si128((__m128i*)&lineIn1[x + xmin]),
1);
// clang-format off
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));
source = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn2[x + xmin])),
_mm_loadu_si128((__m128i*)&lineIn3[x + xmin]),
1);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
}
for (; x < xmax - 1; x += 2) {
__m256i pix, mmk;
mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn0[x + xmin])),
_mm_loadl_epi64((__m128i*)&lineIn1[x + xmin]),
1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn2[x + xmin])),
_mm_loadl_epi64((__m128i*)&lineIn3[x + xmin]),
1);
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
// clang-format on
}
for (; x < xmax; x++) {
__m256i pix, mmk;
// [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
mmk = _mm256_set1_epi32(k[x]);
// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn0[x + xmin])),
mm_cvtepu8_epi32(&lineIn1[x + xmin]),
1);
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(
_mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn2[x + xmin])),
mm_cvtepu8_epi32(&lineIn3[x + xmin]),
1);
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
}
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss0 = _mm256_packs_epi32(sss0, zero);
sss1 = _mm256_packs_epi32(sss1, zero);
sss0 = _mm256_packus_epi16(sss0, zero);
sss1 = _mm256_packus_epi16(sss1, zero);
lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
}
}
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT lineIn,
int xsize,
int* xbounds,
int16_t* kk,
int kmax,
int coefs_precision) {
int xmin, xmax, x;
int16_t* k;
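// Same idea as the 4-row kernel above, but for a single row: the taps are
// consumed in blocks of 8, then 4, then 2, then 1.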
for (const auto xx : c10::irange(xsize)) {
__m128i sss;
xmin = xbounds[xx * 2 + 0];
xmax = xbounds[xx * 2 + 1];
k = &kk[xx * kmax];
x = 0;
if (xmax < 8) {
sss = _mm_set1_epi32(1 << (coefs_precision - 1));
} else {
// Lower part will be added to higher, use only half of the error
__m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2));
for (; x < xmax - 7; x += 8) {
__m256i pix, mmk, source;
__m128i tmp = _mm_loadu_si128((__m128i*)&k[x]);
__m256i ksource =
_mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
// clang-format off
source = _mm256_loadu_si256((__m256i*)&lineIn[x + xmin]);
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
// clang-format on
}
for (; x < xmax - 3; x += 4) {
__m256i pix, mmk, source;
__m128i tmp = _mm_loadl_epi64((__m128i*)&k[x]);
__m256i ksource =
_mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
tmp = _mm_loadu_si128((__m128i*)&lineIn[x + xmin]);
source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
// clang-format off
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
// clang-format on
}
sss = _mm_add_epi32(
_mm256_extracti128_si256(sss256, 0),
_mm256_extracti128_si256(sss256, 1));
}
for (; x < xmax - 1; x += 2) {
__m128i mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
__m128i source = _mm_loadl_epi64((__m128i*)&lineIn[x + xmin]);
__m128i pix = _mm_shuffle_epi8(
source,
_mm_set_epi8(-1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0));
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x++) {
__m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]);
__m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
sss = _mm_srai_epi32(sss, coefs_precision);
sss = _mm_packs_epi32(sss, sss);
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
}
}
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleVerticalConvolution8u(
uint32_t* C10_RESTRICT lineOut,
const uint32_t* C10_RESTRICT imIn,
int xmin,
int xmax,
int16_t* k,
int coefs_precision,
int xin) {
int x;
int xx = 0;
int xsize = xin;
__m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1));
__m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));
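// Walk along the output row: 8 pixels (32 bytes) at a time with AVX2, then 2
// pixels, then 1 at a time. Each output pixel is the weighted sum of the same
// column across `xmax` input rows starting at row `xmin`.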
for (; xx < xsize - 7; xx += 8) {
__m256i sss0 = initial_256;
__m256i sss1 = initial_256;
__m256i sss2 = initial_256;
__m256i sss3 = initial_256;
x = 0;
for (; x < xmax - 1; x += 2) {
__m256i source, source1, source2;
__m256i pix, mmk;
// Load two coefficients at once
mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);
// Load 2 lines
// (__m256i *) &imIn->image32[x + xmin][xx]
source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
// (__m256i *) &imIn->image32[x + 1 + xmin][xx]
source2 =
_mm256_loadu_si256((__m256i*)(imIn + (x + 1 + xmin) * xin + xx));
source = _mm256_unpacklo_epi8(source1, source2);
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
source = _mm256_unpackhi_epi8(source1, source2);
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
}
for (; x < xmax; x += 1) {
__m256i source, source1, pix, mmk;
mmk = _mm256_set1_epi32(k[x]);
// (__m256i *) &imIn->image32[x + xmin][xx])
source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
}
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
sss2 = _mm256_srai_epi32(sss2, coefs_precision);
sss3 = _mm256_srai_epi32(sss3, coefs_precision);
sss0 = _mm256_packs_epi32(sss0, sss1);
sss2 = _mm256_packs_epi32(sss2, sss3);
sss0 = _mm256_packus_epi16(sss0, sss2);
_mm256_storeu_si256((__m256i*)&lineOut[xx], sss0);
}
for (; xx < xsize - 1; xx += 2) {
__m128i sss0 = initial; // left row
__m128i sss1 = initial; // right row
x = 0;
for (; x < xmax - 1; x += 2) {
__m128i source, source1, source2;
__m128i pix, mmk;
// Load two coefficients at once
mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
// Load 2 lines
// (__m128i *) &imIn->image32[x + xmin][xx])
source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
// (__m128i *) &imIn->image32[x + 1 + xmin][xx]
source2 = _mm_loadl_epi64((__m128i*)(imIn + (x + 1 + xmin) * xin + xx));
source = _mm_unpacklo_epi8(source1, source2);
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x += 1) {
__m128i source, source1, pix, mmk;
mmk = _mm_set1_epi32(k[x]);
// (__m128i *) &imIn->image32[x + xmin][xx]);
source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
}
sss0 = _mm_srai_epi32(sss0, coefs_precision);
sss1 = _mm_srai_epi32(sss1, coefs_precision);
sss0 = _mm_packs_epi32(sss0, sss1);
sss0 = _mm_packus_epi16(sss0, sss0);
_mm_storel_epi64((__m128i*)&lineOut[xx], sss0);
}
for (; xx < xsize; xx++) {
__m128i sss = initial;
x = 0;
for (; x < xmax - 1; x += 2) {
__m128i source, source1, source2;
__m128i pix, mmk;
// Load two coefficients at once
mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
// Load 2 lines
// *(int *) &imIn->image32[x + xmin][xx]
source1 = _mm_cvtsi32_si128(*(int*)(imIn + (x + xmin) * xin + xx));
// *(int *) &imIn->image32[x + 1 + xmin][xx]
source2 = _mm_cvtsi32_si128(*(int*)(imIn + (x + 1 + xmin) * xin + xx));
source = _mm_unpacklo_epi8(source1, source2);
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
for (; x < xmax; x++) {
// &imIn->image32[x + xmin][xx]
__m128i pix = mm_cvtepu8_epi32(imIn + (x + xmin) * xin + xx);
__m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
}
sss = _mm_srai_epi32(sss, coefs_precision);
sss = _mm_packs_epi32(sss, sss);
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
}
}
} // anonymous namespace
#endif // CPU_CAPABILITY_AVX2


@ -9367,67 +9367,67 @@ class TestNNDeviceType(NNTestCase):
@parametrize_test("antialias", [True, False])
@parametrize_test("align_corners", [True, False])
@parametrize_test("mode", ["bilinear", "bicubic"])
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@onlyNativeDeviceTypes
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode):
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format):
# Forward AD does not support XLA because XLA tensors don't have storage
check_forward_ad = torch.device(device).type != 'xla'
kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias)
for memory_format in [torch.contiguous_format, torch.channels_last]:
# test float scale factor up & downsampling
for scale_factor in [0.5, 1.5, 2]:
in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
out_size = int(math.floor(in_t.shape[-1] * scale_factor))
with warnings.catch_warnings(record=True) as w:
out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
expected_out = torch.ones(2, 3, out_size, out_size, device=device)
self.assertEqual(expected_out, out_t)
# Assert that memory format is carried through to the output
self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
out_t.backward(torch.randn_like(out_t))
self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))
if torch.device(device).type == 'cuda':
# Bilinear backward is nondeterministic because of atomicAdd usage
nondet_tol = 1e-5
else:
nondet_tol = 0.0
input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
gradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
)
gradgradcheck(
lambda x: F.interpolate(x, out_size, **kwargs),
[input],
check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
)
# Assert that cpu and cuda give same results
if torch.device(device).type == 'cuda':
for shapes in [
(2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
]:
a_cuda = torch.randn(
*shapes, device=device
).contiguous(memory_format=memory_format).requires_grad_()
a_cpu = a_cuda.detach().cpu().requires_grad_()
with warnings.catch_warnings(record=True):
out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)
self.assertEqual(out_cpu, out_cuda.cpu())
g_cuda = torch.randn_like(out_cuda)
g_cpu = g_cuda.cpu()
out_cuda.backward(g_cuda)
out_cpu.backward(g_cpu)
self.assertEqual(a_cuda.grad, a_cpu.grad)
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
@ -9445,6 +9445,40 @@ class TestNNDeviceType(NNTestCase):
t_out = F.interpolate(t_in, size=(2, 2), mode="bilinear", align_corners=False, antialias=True)
self.assertEqual(expected_out, t_out)
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@parametrize_test("antialias", [True, False])
@parametrize_test("align_corners", [True, False])
@parametrize_test("num_channels", [3, 5])
@parametrize_test("output_size", [32, 600])
def test_upsamplingBiLinear2d_consistency(self, device, memory_format, antialias, align_corners, num_channels, output_size):
if torch.device(device).type == "cuda":
raise SkipTest("CUDA implementation is not yet supporting uint8")
mode = "bilinear"
# Check that the resized uint8 output stays close to the resized float output:
# small mean abs error, max abs error below ~1, and at most a few outlier pixels.
input_ui8 = torch.randint(0, 256, size=(1, num_channels, 400, 400), dtype=torch.uint8, device=device)
input_ui8 = input_ui8.contiguous(memory_format=memory_format)
input_f32 = input_ui8.float()
output_f32 = F.interpolate(
input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
)
output_ui8 = F.interpolate(
input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
)
mae_tol = 0.5
max_abs_err_tol = 1.0
num_wrong_pixels_tol = 5
abs_diff = torch.abs(output_f32.round() - output_ui8.float())
mae = torch.mean(abs_diff)
max_abs_err = torch.max(abs_diff)
num_wrong_pixels = (abs_diff > max_abs_err_tol).sum()
self.assertTrue(mae < mae_tol, msg=f"mae={mae}")
self.assertTrue(max_abs_err < max_abs_err_tol + 1e-5, msg=f"max ae={max_abs_err}")
self.assertTrue(num_wrong_pixels < num_wrong_pixels_tol, msg=f"num_wrong_pixels={num_wrong_pixels}")
def test_upsamplingBicubic2d_correctness(self, device):
# test output against known input: align_corners=False result must match opencv
in_t = torch.arange(8., device=device).view(1, 2, 2, 2)


@ -12118,7 +12118,7 @@ op_db: List[OpInfo] = [
supports_fwgrad_bwgrad=True,
supports_autograd=True,
supports_forward_ad=True,
dtypes=floating_types_and(torch.bfloat16),
dtypes=floating_types_and(torch.uint8, torch.bfloat16),
dtypesIfCUDA=floating_types_and(torch.half),
gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'),
@ -12184,7 +12184,7 @@ op_db: List[OpInfo] = [
supports_autograd=True,
supports_forward_ad=True,
supports_fwgrad_bwgrad=True,
dtypes=floating_types_and(torch.bfloat16),
dtypes=floating_types_and(torch.uint8, torch.bfloat16),
dtypesIfCUDA=floating_types_and(torch.half),
gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
sample_inputs_func=partial(sample_inputs_upsample, 'bilinear'),