mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Ensure large tensor int32 -> int64 indexing is enabled (#157767)
Fixes: https://github.com/pytorch/pytorch/issues/157446. I think that this delta is worth the switch from block-ptrs, especially since they are deprecated. ## Perf Summary A is nightly and B is this diff, so `negative` means this diff improves perf. TOP 5 differences <img width="805" height="754" alt="Screenshot 2025-08-24 at 5 49 49 PM" src="https://github.com/user-attachments/assets/aa359cdf-ee9a-427d-be72-1b9aef6f3115" /> <details> <summary><strong>Full perf table (click to expand)</strong></summary> | attn_type | dtype | shape(B,Hq,M,Hkv,N,D) | TFlops Version A | TFlops Version B | | --- | --- | --- | --- | --- | | noop | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 258.38834144791923 | 258.6353685004612 | | causal | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 142.2192450677751 | 140.12393320464972 | | alibi | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 122.32683823617003 | 118.51603755647925 | | sliding_window | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 142.48556906165314 | 137.24259849208627 | | document_mask | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 86.59814488695922 | 84.59431398586257 | | noop | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 288.52679758135764 | 292.9174195871856 | | causal | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 172.25541683643277 | 172.94326459828508 | | alibi | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 164.40864610599826 | 165.035129576335 | | sliding_window | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 176.54876886433945 | 175.08057670028145 | | document_mask | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 125.22491679812626 | 121.06201152859151 | | noop | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 339.11952481874283 | 339.0132835601695 | | causal | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 227.58583240284406 | 228.21824999409597 | | alibi | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 185.98569659868966 | 182.32850843255093 | | sliding_window | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) 
| 188.9495725191772 | 180.31385312481657 | | document_mask | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 106.25789530994302 | 106.55084959448476 | | noop | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 357.6430536888533 | 363.30843452247274 | | causal | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 262.3241154406613 | 265.73250045488 | | alibi | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 249.30498953911416 | 249.35928192833785 | | sliding_window | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 224.74126243851808 | 223.71776504077988 | | document_mask | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 168.26977014013707 | 165.47991483333809 | | noop | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 382.8178701785897 | 384.34752965862685 | | causal | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 308.1449710013853 | 311.0653716044644 | | alibi | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 251.96365252505072 | 243.92283557225903 | | sliding_window | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 226.69316232745368 | 215.22769268913356 | | document_mask | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 153.34142545296405 | 151.9312673939401 | | noop | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 396.0998000753126 | 398.35036286102473 | | causal | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 333.5198415274966 | 344.6354466169716 | | alibi | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 310.5955933379696 | 305.66347819546 | | sliding_window | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 260.4012412689896 | 259.758666997307 | | document_mask | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 234.13034252182635 | 227.61676497283614 | | noop | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 396.17615538477196 | 401.1419104525502 | | causal | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 359.98648311998414 | 360.8285563463094 | | alibi | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 291.97720707257736 | 281.41694809965253 | | sliding_window | 
torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 250.1703628419691 | 238.556760291579 | | document_mask | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 199.50782826294306 | 191.52327358439223 | | noop | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 411.0632004785396 | 413.6362648405517 | | causal | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 382.9404387613185 | 397.74886235657607 | | alibi | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 357.0998545146633 | 350.5115200772392 | | sliding_window | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 281.8033924428203 | 281.98601309215843 | | document_mask | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 282.56595134222135 | 277.4565795466672 | | noop | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 408.89838018149516 | 405.14531386840076 | | causal | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 396.07662058160264 | 393.4598228299578 | | alibi | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 317.8822887267849 | 304.754931401036 | | sliding_window | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 265.8801304948243 | 254.22961974295112 | | document_mask | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 227.87390579965614 | 222.19481980110393 | | noop | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 427.36821778477025 | 431.3766620314935 | | causal | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 410.67994346825 | 423.4666944003808 | | alibi | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 381.1968748374038 | 381.77668006420424 | | sliding_window | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 292.5540046358546 | 296.5439130720502 | | document_mask | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 321.04573768858114 | 310.7423616656888 | | noop | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 427.46148866769903 | 426.162091037068 | | causal | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 419.75580537687347 | 421.88640120274334 | | alibi | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 
337.3208051798903 | 327.4912454675092 | | sliding_window | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 276.5638854539581 | 262.988360558083 | | document_mask | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 250.82791326036886 | 245.07367032501736 | | noop | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 435.8055824506086 | 441.8803729460534 | | causal | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 432.02638235921006 | 450.33161016596273 | | alibi | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 402.25525939224883 | 393.8564689669916 | | sliding_window | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 297.5337286675904 | 297.0131881135074 | | document_mask | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 343.8697037899545 | 329.8194073407783 | | noop | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 267.58912366821056 | 256.91606054118375 | | causal | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 150.81723692609629 | 146.32172267858743 | | alibi | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 129.51029293209245 | 122.72144394093334 | | sliding_window | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 147.627656359087 | 141.68956350566188 | | document_mask | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 87.55100546003591 | 84.91293287692788 | | noop | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 299.5931492743986 | 305.884253766691 | | causal | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 179.39026367843837 | 181.64741311605096 | | alibi | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 173.93547669282367 | 173.23972950980564 | | sliding_window | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 185.90234171599252 | 182.80844545446686 | | document_mask | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 128.08176696266082 | 123.27722685662111 | | noop | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 340.50674552770664 | 338.9071088484576 | | causal | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 225.4438318650432 | 230.22899884832975 | | alibi | 
torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 194.15123248528312 | 185.02793973094865 | | sliding_window | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 200.74289714108176 | 191.76606719670647 | | document_mask | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 107.03564946728423 | 106.82432377861258 | | noop | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 371.31799283918406 | 379.7555394732925 | | causal | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 275.97762744310455 | 276.71106853992995 | | alibi | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 261.6648679783462 | 259.4127232060398 | | sliding_window | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 237.03108223577615 | 233.92710216149527 | | document_mask | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 172.13926800371152 | 168.74390922407585 | | noop | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 381.50199487767276 | 383.9043681999597 | | causal | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 307.9748883093411 | 312.2403515462001 | | alibi | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 251.11319684705438 | 243.17870127827277 | | sliding_window | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 236.3253127246763 | 223.81250201769552 | | document_mask | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 154.55693991756874 | 153.11360584987685 | | noop | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 407.11400078586615 | 413.53709886086557 | | causal | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 348.1705797722622 | 360.09771155957367 | | alibi | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 321.8593280850388 | 318.2882327401255 | | sliding_window | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 270.089032013835 | 268.767323026064 | | document_mask | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 238.07324557907788 | 228.09842078362692 | | noop | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 399.8172853171901 | 401.0954526332136 | | causal | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 363.4387330438581 | 
364.13111024232677 | | alibi | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 294.1752429133857 | 283.7235663368415 | | sliding_window | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 256.8389394007649 | 246.91771015606483 | | document_mask | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 199.3378564292656 | 192.40439590901758 | | noop | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 425.5150965556111 | 430.8190098707553 | | causal | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 396.00437184073013 | 411.3873625655787 | | alibi | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 369.92803661607815 | 361.43244467343663 | | sliding_window | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 293.4277354412933 | 295.2529537595746 | | document_mask | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 288.0208673072841 | 281.51896404878863 | | noop | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 408.3005367220567 | 408.96116482298913 | | causal | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 396.90095962766304 | 396.87385456176486 | | alibi | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 319.0534576137999 | 302.50950358107764 | | sliding_window | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 270.3334977708081 | 258.8506349486557 | | document_mask | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 227.46824134365394 | 222.23759438128766 | | noop | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 438.24247309479694 | 437.7975163205371 | | causal | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 428.34012029699227 | 433.3215899950434 | | alibi | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 386.52672049728875 | 388.26216893354984 | | sliding_window | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 302.71976814728083 | 302.3574867306459 | | document_mask | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 327.39760662780986 | 308.6348428844912 | | noop | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 423.31308678262695 | 426.6306972137279 | | causal | torch.bfloat16 | (2, 16, 
32768, 4, 32768, 64) | 412.6983690923106 | 419.4961977664297 | | alibi | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 337.41003544742273 | 324.2155049126126 | | sliding_window | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 278.7755890910794 | 265.9194286636502 | | document_mask | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 251.55678254755364 | 244.8843180141462 | | noop | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 452.5930781172308 | 457.7117122300742 | | causal | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 445.05676260348116 | 463.9304535499636 | | alibi | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 415.78302138389415 | 406.29229555271456 | | sliding_window | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 308.0311067300895 | 304.91354721414314 | | document_mask | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 351.43943626809335 | 329.4476923070317 | | noop | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 295.1801525813241 | 291.36521287398904 | | causal | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 183.23250549178067 | 182.35421238887605 | | alibi | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 151.56832453117747 | 151.3422139154794 | | sliding_window | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 171.02111935180432 | 160.72516856727913 | | document_mask | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 74.05765122783826 | 74.5885345035243 | | noop | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 314.3587394591763 | 319.2938677773619 | | causal | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 224.57002084153177 | 225.48868542008177 | | alibi | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 216.00964804143052 | 215.39576159953486 | | sliding_window | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 216.1174237618258 | 214.28437413525663 | | document_mask | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 121.08920423648368 | 119.55813661872644 | | noop | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 362.2193857281911 | 
360.05005804275936 | | causal | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 279.8840217430121 | 279.5437918286659 | | alibi | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 227.76617121021982 | 222.8655938229316 | | sliding_window | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 215.43141176970562 | 207.71852284994702 | | document_mask | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 121.35588364218539 | 121.20636565046884 | | noop | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 365.1545280898012 | 373.37585444987326 | | causal | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 304.360119952975 | 309.1247297936263 | | alibi | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 287.2603904544586 | 289.25547903162595 | | sliding_window | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 257.9852675272418 | 257.59069234098115 | | document_mask | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 188.35158496670232 | 184.24683960154857 | | noop | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 389.9744911369211 | 388.43466897254166 | | causal | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 345.9228295166513 | 342.63034895210126 | | alibi | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 279.56334658247437 | 271.2724375402088 | | sliding_window | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 245.66477202810066 | 233.49688207371258 | | document_mask | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 170.3270720653187 | 166.23863845657382 | | noop | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 400.0041140827554 | 402.11182445396497 | | causal | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 363.64641830327434 | 375.9288663364792 | | alibi | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 341.5776139573363 | 335.1160003213424 | | sliding_window | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 281.1811770268521 | 280.21438270014005 | | document_mask | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 247.78716118997716 | 245.3269825179633 | | noop | torch.bfloat16 | (4, 16, 
8192, 16, 8192, 64) | 403.794126680488 | 405.2353919019577 | | causal | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 387.079178426863 | 385.1461762057035 | | alibi | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 309.7847188173431 | 298.0443968374749 | | sliding_window | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 262.4721750159666 | 250.81679725428586 | | document_mask | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 205.70866004479979 | 202.9620839129557 | | noop | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 413.380982988662 | 418.40270594263103 | | causal | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 398.450064800682 | 409.6794973994029 | | alibi | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 372.26297458194466 | 364.44415106552196 | | sliding_window | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 293.0818569905912 | 292.85172400643984 | | document_mask | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 296.46717085592087 | 285.76362010612763 | | noop | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 419.3186786037592 | 426.08801580934437 | | causal | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 408.1648467766632 | 409.4122254207817 | | alibi | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 329.24396020457345 | 313.5200995121138 | | sliding_window | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 274.61257504571876 | 255.7801815432177 | | document_mask | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 232.63806001220684 | 230.03020843492314 | | noop | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 435.0785891054788 | 440.39101804225345 | | causal | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 424.86925312752817 | 435.18898057396825 | | alibi | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 393.000417896268 | 395.11543361225256 | | sliding_window | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 297.7755459218185 | 300.7208114715287 | | document_mask | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 331.71570861760534 | 
318.07127352552885 | | noop | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 424.58602747137405 | 425.84897078470715 | | causal | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 422.66607285025725 | 423.5524945535485 | | alibi | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 344.8625760048626 | 331.6793888458635 | | sliding_window | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 282.0787281511649 | 263.7895634445868 | | document_mask | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 252.7301927385177 | 245.41844170037427 | | noop | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 437.0658069164588 | 442.9101960063628 | | causal | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 433.13788271434646 | 452.3873572709863 | | alibi | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 404.0959191546953 | 396.7077863894884 | | sliding_window | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 300.45502211883206 | 301.3439134717943 | | document_mask | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 344.11003202413934 | 330.8897663350314 | | noop | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 298.4364205341705 | 291.6793556507056 | | causal | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 187.6382133139633 | 191.05409897308772 | | alibi | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 156.55822078636112 | 154.178925976516 | | sliding_window | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 173.47765221825162 | 169.30862508068464 | | document_mask | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 74.5885345035243 | 74.52689061607104 | | noop | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 323.12233826013045 | 328.53889207933514 | | causal | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 236.75872140126316 | 235.8378325547398 | | alibi | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 227.17836523816675 | 226.75357076139966 | | sliding_window | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 224.07209453308036 | 224.07209453308036 | | document_mask | torch.bfloat16 | (4, 
16, 1024, 4, 1024, 128) | 122.85572156047981 | 121.11642183704716 | | noop | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 361.3123326658092 | 360.71014086458337 | | causal | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 281.5287983927017 | 281.94301754758345 | | alibi | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 232.7456696285686 | 226.50976826432776 | | sliding_window | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 221.5612361744038 | 214.96188822837055 | | document_mask | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 121.38311528944315 | 120.85441868178513 | | noop | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 380.2579019244734 | 389.2520157863988 | | causal | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 316.95230660496924 | 317.87597790618906 | | alibi | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 301.07968126657323 | 298.02424098422983 | | sliding_window | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 267.2240756921594 | 267.16353549228154 | | document_mask | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 189.82761622494257 | 186.736450261963 | | noop | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 389.88665375406805 | 387.9125133037077 | | causal | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 348.70619958684887 | 346.6750499749774 | | alibi | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 280.5472989906087 | 271.22300822012187 | | sliding_window | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 250.02397620165968 | 241.22532776331445 | | document_mask | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 171.67817496107645 | 166.95679280483972 | | noop | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 412.626880230807 | 417.60238657950777 | | causal | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 374.8829313933945 | 389.4448546468815 | | alibi | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 353.20410434172436 | 345.7072490717473 | | sliding_window | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 292.51045924209586 | 291.66621022138287 | | document_mask | 
torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 251.6264062063495 | 248.45110052911542 | | noop | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 404.0155784550126 | 401.90546837237514 | | causal | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 384.4389015599863 | 386.9684324594344 | | alibi | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 313.3731284132225 | 298.17074251037894 | | sliding_window | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 264.19199737284265 | 252.8982463999916 | | document_mask | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 207.03696315185684 | 202.86697323136772 | | noop | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 428.2436763312506 | 433.45005568619536 | | causal | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 411.8516531869893 | 428.2753623461049 | | alibi | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 384.9095037182509 | 372.90888743000744 | | sliding_window | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 303.2438915629836 | 302.05095952914337 | | document_mask | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 301.8689122735564 | 285.0363190513223 | | noop | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 423.13592231504805 | 420.3991500185611 | | causal | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 407.44527331585493 | 408.5064370765247 | | alibi | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 330.50050996167414 | 316.8763979925965 | | sliding_window | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 274.6833786307413 | 259.86098862141324 | | document_mask | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 232.24019584158367 | 226.52040268160232 | | noop | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 444.4596314237808 | 455.99558915752266 | | causal | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 437.4245561244369 | 455.98275147271966 | | alibi | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 397.3350686877605 | 397.88875599028063 | | sliding_window | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 308.53809114394545 | 
307.1359822042007 | | document_mask | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 331.32379843423774 | 316.85293191675646 | | noop | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 422.4622274366379 | 425.0407156418684 | | causal | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 420.9547052783101 | 430.33779243510276 | | alibi | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 345.50265346504085 | 332.094855328957 | | sliding_window | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 280.81715528243365 | 264.6543640282054 | | document_mask | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 252.25635200421783 | 245.46235499490305 | | noop | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 452.5524207341139 | 461.7512032176736 | | causal | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 445.2316469907137 | 464.4523799578466 | | alibi | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 416.87264016717023 | 409.17124592157046 | | sliding_window | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 309.42579489389846 | 307.9734464665731 | | document_mask | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 350.50782004300623 | 330.98959545427294 | </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157767 Approved by: https://github.com/Skylion007
This commit is contained in:
committed by
PyTorch MergeBot
parent
adecb0c9e8
commit
fc69c2bc67
@ -1514,17 +1514,21 @@ class TritonTemplate(KernelTemplate):
|
||||
|
||||
for name, val in kwargs.items():
|
||||
defines.write(f"{name} : tl.constexpr = {val}\n")
|
||||
defines = defines.getvalue()
|
||||
|
||||
fake_out = ir.Buffer(name="buf_out", layout=layout)
|
||||
kernel_name = f"triton_{self.name}"
|
||||
|
||||
numel = sympy_product(layout.size)
|
||||
buffers = itertools.chain(input_nodes, (fake_out,))
|
||||
if not TritonScheduling.can_use_32bit_indexing(numel, buffers):
|
||||
raise NotImplementedError(
|
||||
"64-bit indexing is not yet implemented for triton templates"
|
||||
)
|
||||
|
||||
if TritonScheduling.can_use_32bit_indexing(numel, buffers):
|
||||
index_dtype = "tl.int32"
|
||||
else:
|
||||
index_dtype = "tl.int64"
|
||||
|
||||
# Add index dtype to defines so it's available in the template
|
||||
defines.write(f"INDEX_DTYPE : tl.constexpr = {index_dtype}\n")
|
||||
defines = defines.getvalue()
|
||||
|
||||
kernel_options = {
|
||||
"input_nodes": input_nodes,
|
||||
|
Reference in New Issue
Block a user