[MPS] Move max_pool2d to Metal for stride != 1 (#157876)

This PR updates `max_pool2d` to use a Metal kernel instead of the old MPS graph impl. However, when the `stride` argument is 1 in all dimensions, the old implementation gives significantly better performance, so we fall back to it in that case. Below is a performance comparison of `max_pool2d` before and after this PR, obtained from this script: 2f02f2bf7a/max_pool_mps/perf.py

<details><summary>Click to expand</summary>

case | before PR | after PR | speedup |   | case info
-- | -- | -- | -- | -- | --
0 | 0.014264 | 0.004473 | 3.188911245 |   | (3, 2, 2), {'kernel_size': 2, 'return_indices': True}
1 | 0.010752 | 0.00421 | 2.55391924 |   | (3, 2, 2), {'kernel_size': 2, 'return_indices': False}
2 | 0.020777 | 0.006123 | 3.393271272 |   | (3, 10, 10), {'kernel_size': 5, 'return_indices': True}
3 | 0.011065 | 0.005759 | 1.921340511 |   | (3, 10, 10), {'kernel_size': 5, 'return_indices': False}
4 | 0.01452 | 0.007829 | 1.854642994 |   | (3, 100, 100), {'kernel_size': 5, 'return_indices': True}
5 | 0.009258 | 0.007075 | 1.308551237 |   | (3, 100, 100), {'kernel_size': 5, 'return_indices': False}
6 | 0.188137 | 0.168688 | 1.115295694 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True}
7 | 0.161362 | 0.154746 | 1.042753932 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False}
8 | 0.182883 | 0.16945 | 1.079274122 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True}
9 | 0.156875 | 0.163346 | 0.9603847049 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False}
10 | 0.193433 | 0.167396 | 1.155541351 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True}
11 | 0.158967 | 0.151246 | 1.051049284 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False}
12 | 0.931071 | 0.932883 | 0.9980576342 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True}
13 | 0.324496 | 0.3252 | 0.9978351784 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False}
14 | 0.944071 | 0.936246 | 1.008357846 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True}
15 | 0.322171 | 0.314854 | 1.023239343 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False}
16 | 0.894158 | 0.886408 | 1.008743152 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True}
17 | 0.309338 | 0.304146 | 1.017070749 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False}
18 | 0.606 | 0.260546 | 2.325884873 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True}
19 | 0.30445 | 0.231054 | 1.317657344 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False}
20 | 0.474708 | 0.261925 | 1.812381407 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True}
21 | 0.23175 | 0.231883 | 0.9994264349 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False}
22 | 0.434475 | 0.266246 | 1.631855502 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True}
23 | 0.236942 | 0.231792 | 1.022218196 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False}
24 | 0.202396 | 0.174888 | 1.157289237 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True}
25 | 0.160679 | 0.158246 | 1.015374796 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False}
26 | 0.200354 | 0.184133 | 1.088093932 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True}
27 | 0.160779 | 0.160679 | 1.000622359 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False}
28 | 0.199175 | 0.178625 | 1.115045486 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True}
29 | 0.159458 | 0.160883 | 0.9911426316 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False}
30 | 0.199021 | 0.165329 | 1.203787599 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True}
31 | 0.156337 | 0.158213 | 0.9881425673 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False}
32 | 0.180146 | 0.174483 | 1.032455884 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True}
33 | 0.156988 | 0.158167 | 0.9925458534 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False}
34 | 0.182133 | 0.176521 | 1.031792251 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True}
35 | 0.169042 | 0.156483 | 1.080257919 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False}
36 | 1.767821 | 1.766254 | 1.000887188 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True}
37 | 1.059346 | 1.058775 | 1.000539302 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False}
38 | 1.85755 | 1.859429 | 0.9989894747 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True}
39 | 1.100417 | 1.097683 | 1.002490701 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False}
40 | 1.843167 | 1.847558 | 0.9976233493 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True}
41 | 1.090142 | 1.093163 | 0.9972364597 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False}
42 | 0.480867 | 0.251733 | 1.910226311 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True}
43 | 0.319246 | 0.236479 | 1.349997251 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False}
44 | 0.49315 | 0.256408 | 1.923301925 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True}
45 | 0.316746 | 0.227854 | 1.390127011 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False}
46 | 0.4912 | 0.257762 | 1.905633879 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True}
47 | 0.324771 | 0.229371 | 1.41592006 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False}
48 | 0.152904 | 0.095079 | 1.608178462 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True}
49 | 0.102963 | 0.089217 | 1.154073775 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False}
50 | 0.155158 | 0.095429 | 1.625899884 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True}
51 | 0.104338 | 0.089979 | 1.15958168 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False}
52 | 0.153121 | 0.096429 | 1.587914424 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True}
53 | 0.103642 | 0.090254 | 1.148336916 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False}
54 | 0.191071 | 0.165125 | 1.157129447 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True}
55 | 0.153971 | 0.149021 | 1.033216795 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False}
56 | 0.193192 | 0.166892 | 1.157586942 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True}
57 | 0.156617 | 0.15215 | 1.029359185 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False}
58 | 0.178033 | 0.167308 | 1.06410333 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True}
59 | 0.157425 | 0.164404 | 0.9575496947 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False}
60 | 1.757638 | 1.750896 | 1.0038506 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True}
61 | 1.048471 | 1.047967 | 1.000480931 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False}
62 | 1.790708 | 1.789767 | 1.000525767 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True}
63 | 1.054575 | 1.054796 | 0.9997904808 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False}
64 | 1.785837 | 1.784192 | 1.000921986 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True}
65 | 1.054713 | 1.054492 | 1.00020958 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False}
66 | 0.478267 | 0.261017 | 1.832321266 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True}
67 | 0.32005 | 0.226654 | 1.412064204 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False}
68 | 0.484008 | 0.254721 | 1.900149575 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True}
69 | 0.321 | 0.218842 | 1.466811672 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False}
70 | 0.482087 | 0.248771 | 1.937874591 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True}
71 | 0.316558 | 0.230533 | 1.373156988 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False}
72 | 0.137842 | 0.085088 | 1.619993419 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True}
73 | 0.100671 | 0.0769 | 1.309115735 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False}
74 | 0.148321 | 0.086967 | 1.705485989 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True}
75 | 0.101392 | 0.075454 | 1.343759112 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False}
76 | 0.150208 | 0.083742 | 1.793699697 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True}
77 | 0.099587 | 0.075825 | 1.313379492 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False}
78 | 0.622546 | 0.602729 | 1.03287879 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True}
79 | 0.531696 | 0.5067 | 1.049330965 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False}
80 | 0.626646 | 0.617038 | 1.015571164 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True}
81 | 0.530354 | 0.525367 | 1.009492412 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False}
82 | 0.633933 | 0.577775 | 1.097197006 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True}
83 | 0.533067 | 0.526954 | 1.011600633 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False}
84 | 3.372867 | 3.386412 | 0.9960001914 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True}
85 | 1.155975 | 1.156604 | 0.9994561665 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False}
86 | 3.401921 | 3.39755 | 1.001286515 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True}
87 | 1.202829 | 1.192538 | 1.008629494 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False}
88 | 3.23675 | 3.220238 | 1.005127571 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True}
89 | 1.077067 | 1.085613 | 0.9921279498 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False}
90 | 1.572925 | 0.925625 | 1.699311276 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True}
91 | 0.791204 | 0.793454 | 0.9971642969 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False}
92 | 1.572742 | 0.922729 | 1.704446268 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True}
93 | 0.784292 | 0.788871 | 0.9941955022 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False}
94 | 1.526546 | 0.925708 | 1.649057802 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True}
95 | 0.769321 | 0.787675 | 0.9766985114 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False}
96 | 0.736033 | 0.612808 | 1.201082558 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True}
97 | 0.574625 | 0.530925 | 1.082309177 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False}
98 | 0.722021 | 0.614488 | 1.174996094 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True}
99 | 0.563171 | 0.533721 | 1.055178642 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False}
100 | 0.735725 | 0.613992 | 1.198264798 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True}
101 | 0.583487 | 0.532513 | 1.095723485 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False}
102 | 0.656383 | 0.575313 | 1.140914598 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True}
103 | 0.559796 | 0.509079 | 1.099625009 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False}
104 | 0.662046 | 0.572362 | 1.156691045 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True}
105 | 0.552633 | 0.508671 | 1.086425214 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False}
106 | 0.634108 | 0.574629 | 1.103508525 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True}
107 | 0.534013 | 0.510996 | 1.045043405 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False}
108 | 7.056642 | 7.066717 | 0.9985743026 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True}
109 | 4.144275 | 4.142658 | 1.000390329 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False}
110 | 7.172683 | 7.189867 | 0.9976099697 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True}
111 | 4.162538 | 4.158875 | 1.000880767 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False}
112 | 7.194233 | 7.181837 | 1.001726021 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True}
113 | 4.294083 | 4.196062 | 1.023360236 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False}
114 | 1.875692 | 0.891071 | 2.104986022 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True}
115 | 1.097479 | 0.781175 | 1.404907991 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False}
116 | 1.8883 | 0.89015 | 2.121327866 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True}
117 | 1.101329 | 0.778542 | 1.414604479 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False}
118 | 1.872833 | 0.893654 | 2.095702587 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True}
119 | 1.096712 | 0.784579 | 1.397835017 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False}
120 | 0.513029 | 0.374417 | 1.370207549 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True}
121 | 0.349546 | 0.305763 | 1.143192603 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False}
122 | 0.518929 | 0.377487 | 1.374693698 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True}
123 | 0.364662 | 0.3145 | 1.159497615 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False}
124 | 0.521275 | 0.375242 | 1.389170189 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True}
125 | 0.367488 | 0.308354 | 1.191773092 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False}
126 | 0.652342 | 0.569308 | 1.145850752 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True}
127 | 0.555696 | 0.506892 | 1.096280865 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False}
128 | 0.654333 | 0.570367 | 1.147213987 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True}
129 | 0.548925 | 0.505825 | 1.085207335 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False}
130 | 0.655908 | 0.571904 | 1.146884792 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True}
131 | 0.560808 | 0.508238 | 1.103435792 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False}
132 | 6.949462 | 6.949112 | 1.000050366 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True}
133 | 4.072913 | 4.065013 | 1.001943413 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False}
134 | 7.200896 | 7.197792 | 1.000431243 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True}
135 | 4.291367 | 4.218538 | 1.017264038 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False}
136 | 7.1823 | 7.306933 | 0.9829431856 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True}
137 | 4.151175 | 4.149592 | 1.000381483 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False}
138 | 1.781279 | 0.884288 | 2.014365229 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True}
139 | 1.050804 | 0.774362 | 1.356993241 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False}
140 | 1.860758 | 0.884637 | 2.103414169 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True}
141 | 1.099908 | 0.775887 | 1.417613647 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False}
142 | 1.857387 | 0.885738 | 2.096993693 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True}
143 | 1.105279 | 0.77365 | 1.428655077 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False}
144 | 0.489408 | 0.269583 | 1.815426047 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True}
145 | 0.322525 | 0.236979 | 1.360985573 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False}
146 | 0.515475 | 0.265813 | 1.93923924 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True}
147 | 0.315525 | 0.228146 | 1.382995976 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False}
148 | 0.503438 | 0.277204 | 1.816128194 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True}
149 | 0.335421 | 0.228275 | 1.469372467 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False}
150 | 5.72495 | 4.909554 | 1.166083518 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': True}
151 | 4.45215 | 4.251333 | 1.047236243 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': False}
152 | 29.953021 | 29.879879 | 1.002447868 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True}
153 | 9.854683 | 9.839517 | 1.001541336 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False}
154 | 6.178033 | 5.697375 | 1.084364817 |   | (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': True}
155 | 6.280317 | 5.712525 | 1.099394226 |   | (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': False}
156 | 10.256062 | 11.336527 | 0.9046917103 |   | (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': True}
157 | 9.469546 | 11.33705 | 0.8352742556 |   | (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': False}
158 | 0.119087 | 0.0797 | 1.494190715 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True}
159 | 0.098713 | 0.047173 | 2.092574142 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False}
160 | 0.960812 | 0.675762 | 1.421820108 |   | (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': True}
161 | 0.536546 | 0.485958 | 1.104099531 |   | (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': False}
162 | 2.555225 | 1.791567 | 1.426251432 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True}
163 | 1.419087 | 1.305137 | 1.087308842 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False}
164 | 5.182008 | 3.48085 | 1.488719135 |   | (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': True}
165 | 2.831779 | 2.498537 | 1.133374851 |   | (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': False}
166 | 8.546038 | 5.7783 | 1.478988284 |   | (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': True}
167 | 4.731004 | 4.161975 | 1.136720908 |   | (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': False}
168 | 0.084754 | 0.07435 | 1.139932751 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True}
169 | 0.057933 | 0.043096 | 1.344277891 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False}
170 | 2.568592 | 1.802117 | 1.425319222 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True}
171 | 1.433054 | 1.307342 | 1.096158465 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False}
172 | 10.3213 | 7.111604 | 1.451332217 |   | (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': True}
173 | 5.680525 | 5.168129 | 1.099145358 |   | (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': False}
174 | 1.02255 | 1.01375 | 1.008680641 |   | (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': False}
175 | 3.074233 | 3.094383 | 0.993488201 |   | (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': True}
176 | 1.016812 | 1.030575 | 0.9866453194 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False}
177 | 3.053658 | 3.089504 | 0.9883974903 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True}
178 | 1.025863 | 1.032088 | 0.9939685376 |   | (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': False}
179 | 3.798942 | 3.799213 | 0.9999286694 |   | (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': True}
180 | 4.492979 | 4.493421 | 0.999901634 |   | (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': False}
181 | 51.543363 | 51.266204 | 1.005406271 |   | (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': True}
182 | 1.018008 | 1.001587 | 1.016394981 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': False}
183 | 3.035404 | 3.003113 | 1.010752509 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': True}
184 | 0.610421 | 0.56 | 1.0900375 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': False}
185 | 1.138983 | 0.757296 | 1.504012962 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': True}
186 | 0.641558 | 0.557808 | 1.150141267 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': False}
187 | 1.181475 | 0.754725 | 1.565437742 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': True}
188 | 1.03045 | 1.026904 | 1.003453098 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': False}
189 | 3.041421 | 3.0263 | 1.00499653 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': True}
190 | 0.609929 | 0.572304 | 1.065743032 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': False}
191 | 1.146875 | 0.756446 | 1.516135983 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': True}
192 | 0.645187 | 0.561708 | 1.148616363 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': False}
193 | 1.181721 | 0.758054 | 1.558887625 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': True}
194 | 0.927654 | 0.925946 | 1.0018446 |   | (10, 1000, 1000), {'kernel_size': 1, 'return_indices': False}
195 | 2.749983 | 2.740354 | 1.00351378 |   | (10, 1000, 1000), {'kernel_size': 1, 'return_indices': True}

</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157876
Approved by: https://github.com/malfet
This commit is contained in:
Kurt Mohler
2025-07-31 12:22:20 -05:00
committed by PyTorch MergeBot
parent a4f69a5da0
commit 86eb65f7f0
2 changed files with 185 additions and 57 deletions

View File

@ -88,6 +88,53 @@ void max_pool_3d_input_iter(
}
}
template <typename T, bool return_indices>
void max_pool_2d_input_iter(
constant T* input,
device T* output,
device int64_t* indices,
constant int32_t* input_sizes,
constant int32_t* input_strides,
thread int32_t (&pooling_dim_indices)[3],
constant int32_t* kernel_size,
constant int32_t* stride,
constant int32_t* padding,
constant int32_t* dilation) {
auto bounds0 = get_input_iter_bounds<0>(
input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation);
auto bounds1 = get_input_iter_bounds<1>(
input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation);
auto d0 = dilation[0];
auto d1 = dilation[1];
T max_value = input
[input_strides[0] * bounds0.start + input_strides[1] * bounds1.start];
auto max_index = bounds0.start * input_sizes[1] + bounds1.start;
for (auto i0 = bounds0.start; i0 < bounds0.end; i0 += d0) {
auto offset0 = input_strides[0] * i0;
for (auto i1 = bounds1.start; i1 < bounds1.end; i1 += d1) {
auto offset1 = input_strides[1] * i1;
auto input_value = input[offset0 + offset1];
bool is_greater = input_value > max_value;
max_value = is_greater ? input_value : max_value;
if (return_indices) {
auto input_index = i0 * input_sizes[1] + i1;
max_index = is_greater ? input_index : max_index;
}
}
}
*output = max_value;
if (return_indices) {
*indices = max_index;
}
}
struct PoolOffsets {
int32_t output;
int32_t indices;
@ -212,7 +259,7 @@ kernel void max_pool(
PoolOffsets offsets = find_pool_offsets(
output_sizes,
output_strides,
indices_strides,
return_indices ? indices_strides : nullptr,
input_strides,
pooling_dim_indices,
dims,
@ -224,18 +271,47 @@ kernel void max_pool(
indices += offsets.indices;
input += offsets.input_leading;
max_pool_3d_input_iter<T>(
input,
output,
indices,
input_sizes + leading_dims,
input_strides + leading_dims,
pooling_dim_indices,
kernel_size,
stride,
padding,
dilation,
return_indices);
switch (pooling_dims) {
case 2:
if (return_indices) {
return max_pool_2d_input_iter<T, /*return_indices=*/true>(
input,
output,
indices,
input_sizes + leading_dims,
input_strides + leading_dims,
pooling_dim_indices,
kernel_size,
stride,
padding,
dilation);
} else {
return max_pool_2d_input_iter<T, /*return_indices=*/false>(
input,
output,
indices,
input_sizes + leading_dims,
input_strides + leading_dims,
pooling_dim_indices,
kernel_size,
stride,
padding,
dilation);
}
case 3:
return max_pool_3d_input_iter<T>(
input,
output,
indices,
input_sizes + leading_dims,
input_strides + leading_dims,
pooling_dim_indices,
kernel_size,
stride,
padding,
dilation,
return_indices);
}
}
// Finds the element in the grad input which corresponds to the index into the

View File

@ -297,13 +297,13 @@ static PoolSizes process_pool_sizes(const Tensor& input,
pooling_dims,
" ints");
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3,
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == pooling_dims,
op_name,
": stride must either be omitted, a single int, or a tuple of ",
pooling_dims,
" ints");
TORCH_CHECK(padding.size() == 1 || padding.size() == 3,
TORCH_CHECK(padding.size() == 1 || padding.size() == pooling_dims,
op_name,
": padding must either be a single int, or a tuple of ",
pooling_dims,
@ -333,6 +333,22 @@ static PoolSizes process_pool_sizes(const Tensor& input,
": pad should be at most half of effective kernel size");
}
if (pooling_dims == 2) {
const auto memory_format = input.suggest_memory_format();
bool valid_dims = input.size(1) != 0 && input.size(2) != 0;
if (memory_format == at::MemoryFormat::ChannelsLast) {
// Expect tensor in NHWC format and allow 0-dim only for N.
TORCH_CHECK((dims == 4 && valid_dims && input.size(3) != 0),
"Expected 4D (batch mode) tensor expected for input with channels_last layout"
" with optional 0 dim batch size for input, but got: ",
input.sizes());
} else {
TORCH_CHECK((dims == 3 && input.size(0) != 0 && valid_dims) || (dims == 4 && valid_dims && input.size(3) != 0),
"Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:",
input.sizes());
}
}
for (const auto dim : c10::irange(static_cast<int>(leading_dims == 2), dims)) {
TORCH_CHECK(input.size(dim) > 0, op_name, ": Expected input's non-batch dimensions to have positive length");
}
@ -786,6 +802,16 @@ static void avg_pool_backward_out_mps_template(const Tensor& grad_input,
} // namespace mps
// TODO: The MPS graph impl can sometimes give significantly better performance
// than the Metal impl for cases where the stride is 1 in all dimensions. There
// may be a code path in the graph kernel that specifically optimizes for that
// case. We should look into implementing a specialized case in Metal so we can
// avoid using the graph impl.
static bool use_graph_for_max_pool2d(IntArrayRef kernel_size, IntArrayRef stride_) {
IntArrayRef stride = stride_.empty() ? kernel_size : stride_;
return (stride[0] == 1) && (stride.size() == 1 || stride[1] == 1);
}
Tensor mps_max_pool2d(const Tensor& input,
IntArrayRef kernel_size,
IntArrayRef stride,
@ -793,24 +819,37 @@ Tensor mps_max_pool2d(const Tensor& input,
IntArrayRef dilation,
bool ceil_mode) {
Tensor output = at::empty({0}, input.options(), MemoryFormat::Contiguous);
mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
MPSGraph* mpsGraph = cachedGraph.graph();
return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil];
};
mps::pool2d_template(input,
output,
std::nullopt,
std::nullopt,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
false,
std::nullopt,
pooling_op_block,
"max_pool2d");
bool use_graph = use_graph_for_max_pool2d(kernel_size, stride);
if (use_graph) {
mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
MPSGraph* mpsGraph = cachedGraph.graph();
return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil];
};
mps::pool2d_template(input,
output,
std::nullopt,
std::nullopt,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
false,
std::nullopt,
pooling_op_block,
"max_pool2d");
} else {
mps::max_pool_with_indices_out_mps_template(output,
std::nullopt,
input,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
/*pooling_dims=*/2,
"max_pool2d");
}
return output;
}
@ -855,32 +894,45 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)
bool ceil_mode,
const Tensor& output,
const Tensor& indices) {
auto indices_memory_format = indices.suggest_memory_format();
bool use_graph = use_graph_for_max_pool2d(kernel_size, stride);
if (use_graph) {
auto indices_memory_format = indices.suggest_memory_format();
mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
MPSGraph* mpsGraph = cachedGraph.graph();
NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor
descriptor:desc
name:nil];
cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long);
return poolOutputs[0];
};
mps::pool2d_template(input,
output,
indices,
std::nullopt,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
false,
std::nullopt,
pooling_op_block,
"max_pool2d_indices");
mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
MPSGraph* mpsGraph = cachedGraph.graph();
NSArray<MPSGraphTensor*>* poolOutputs =
[mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil];
cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long);
return poolOutputs[0];
};
mps::pool2d_template(input,
output,
indices,
std::nullopt,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
false,
std::nullopt,
pooling_op_block,
"max_pool2d_indices");
if (indices_memory_format == MemoryFormat::ChannelsLast) {
const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
}
if (indices_memory_format == MemoryFormat::ChannelsLast) {
const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
} else {
mps::max_pool_with_indices_out_mps_template(output,
indices,
input,
kernel_size,
stride,
padding,
dilation,
ceil_mode,
/*pooling_dims=*/2,
"max_pool2d");
}
}