Ensure large tensor int32 -> int64 indexing is enabled (#157767)

Fixes: #https://github.com/pytorch/pytorch/issues/157446

I think that this delta is worth the switch form block-ptrs especially since they are deprecated

## Perf Summary

A is nightly B is this diff, so `negative` means this diff improves perf

TOP 5 differences
<img width="805" height="754" alt="Screenshot 2025-08-24 at 5 49 49 PM" src="https://github.com/user-attachments/assets/aa359cdf-ee9a-427d-be72-1b9aef6f3115" />

<details>
  <summary><strong>Full perf table (click to expand)</strong></summary>

| attn_type | dtype | shape(B,Hq,M,Hkv,N,D) | TFlops Version A | TFlops Version B |
| --- | --- | --- | --- | --- |
| noop | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 258.38834144791923 | 258.6353685004612 |
| causal | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 142.2192450677751 | 140.12393320464972 |
| alibi | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 122.32683823617003 | 118.51603755647925 |
| sliding_window | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 142.48556906165314 | 137.24259849208627 |
| document_mask | torch.bfloat16 | (2, 16, 1024, 16, 1024, 64) | 86.59814488695922 | 84.59431398586257 |
| noop | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 288.52679758135764 | 292.9174195871856 |
| causal | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 172.25541683643277 | 172.94326459828508 |
| alibi | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 164.40864610599826 | 165.035129576335 |
| sliding_window | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 176.54876886433945 | 175.08057670028145 |
| document_mask | torch.bfloat16 | (2, 16, 1024, 16, 1024, 128) | 125.22491679812626 | 121.06201152859151 |
| noop | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 339.11952481874283 | 339.0132835601695 |
| causal | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 227.58583240284406 | 228.21824999409597 |
| alibi | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 185.98569659868966 | 182.32850843255093 |
| sliding_window | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 188.9495725191772 | 180.31385312481657 |
| document_mask | torch.bfloat16 | (2, 16, 2048, 16, 2048, 64) | 106.25789530994302 | 106.55084959448476 |
| noop | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 357.6430536888533 | 363.30843452247274 |
| causal | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 262.3241154406613 | 265.73250045488 |
| alibi | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 249.30498953911416 | 249.35928192833785 |
| sliding_window | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 224.74126243851808 | 223.71776504077988 |
| document_mask | torch.bfloat16 | (2, 16, 2048, 16, 2048, 128) | 168.26977014013707 | 165.47991483333809 |
| noop | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 382.8178701785897 | 384.34752965862685 |
| causal | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 308.1449710013853 | 311.0653716044644 |
| alibi | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 251.96365252505072 | 243.92283557225903 |
| sliding_window | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 226.69316232745368 | 215.22769268913356 |
| document_mask | torch.bfloat16 | (2, 16, 4096, 16, 4096, 64) | 153.34142545296405 | 151.9312673939401 |
| noop | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 396.0998000753126 | 398.35036286102473 |
| causal | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 333.5198415274966 | 344.6354466169716 |
| alibi | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 310.5955933379696 | 305.66347819546 |
| sliding_window | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 260.4012412689896 | 259.758666997307 |
| document_mask | torch.bfloat16 | (2, 16, 4096, 16, 4096, 128) | 234.13034252182635 | 227.61676497283614 |
| noop | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 396.17615538477196 | 401.1419104525502 |
| causal | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 359.98648311998414 | 360.8285563463094 |
| alibi | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 291.97720707257736 | 281.41694809965253 |
| sliding_window | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 250.1703628419691 | 238.556760291579 |
| document_mask | torch.bfloat16 | (2, 16, 8192, 16, 8192, 64) | 199.50782826294306 | 191.52327358439223 |
| noop | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 411.0632004785396 | 413.6362648405517 |
| causal | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 382.9404387613185 | 397.74886235657607 |
| alibi | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 357.0998545146633 | 350.5115200772392 |
| sliding_window | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 281.8033924428203 | 281.98601309215843 |
| document_mask | torch.bfloat16 | (2, 16, 8192, 16, 8192, 128) | 282.56595134222135 | 277.4565795466672 |
| noop | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 408.89838018149516 | 405.14531386840076 |
| causal | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 396.07662058160264 | 393.4598228299578 |
| alibi | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 317.8822887267849 | 304.754931401036 |
| sliding_window | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 265.8801304948243 | 254.22961974295112 |
| document_mask | torch.bfloat16 | (2, 16, 16384, 16, 16384, 64) | 227.87390579965614 | 222.19481980110393 |
| noop | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 427.36821778477025 | 431.3766620314935 |
| causal | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 410.67994346825 | 423.4666944003808 |
| alibi | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 381.1968748374038 | 381.77668006420424 |
| sliding_window | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 292.5540046358546 | 296.5439130720502 |
| document_mask | torch.bfloat16 | (2, 16, 16384, 16, 16384, 128) | 321.04573768858114 | 310.7423616656888 |
| noop | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 427.46148866769903 | 426.162091037068 |
| causal | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 419.75580537687347 | 421.88640120274334 |
| alibi | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 337.3208051798903 | 327.4912454675092 |
| sliding_window | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 276.5638854539581 | 262.988360558083 |
| document_mask | torch.bfloat16 | (2, 16, 32768, 16, 32768, 64) | 250.82791326036886 | 245.07367032501736 |
| noop | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 435.8055824506086 | 441.8803729460534 |
| causal | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 432.02638235921006 | 450.33161016596273 |
| alibi | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 402.25525939224883 | 393.8564689669916 |
| sliding_window | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 297.5337286675904 | 297.0131881135074 |
| document_mask | torch.bfloat16 | (2, 16, 32768, 16, 32768, 128) | 343.8697037899545 | 329.8194073407783 |
| noop | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 267.58912366821056 | 256.91606054118375 |
| causal | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 150.81723692609629 | 146.32172267858743 |
| alibi | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 129.51029293209245 | 122.72144394093334 |
| sliding_window | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 147.627656359087 | 141.68956350566188 |
| document_mask | torch.bfloat16 | (2, 16, 1024, 4, 1024, 64) | 87.55100546003591 | 84.91293287692788 |
| noop | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 299.5931492743986 | 305.884253766691 |
| causal | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 179.39026367843837 | 181.64741311605096 |
| alibi | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 173.93547669282367 | 173.23972950980564 |
| sliding_window | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 185.90234171599252 | 182.80844545446686 |
| document_mask | torch.bfloat16 | (2, 16, 1024, 4, 1024, 128) | 128.08176696266082 | 123.27722685662111 |
| noop | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 340.50674552770664 | 338.9071088484576 |
| causal | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 225.4438318650432 | 230.22899884832975 |
| alibi | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 194.15123248528312 | 185.02793973094865 |
| sliding_window | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 200.74289714108176 | 191.76606719670647 |
| document_mask | torch.bfloat16 | (2, 16, 2048, 4, 2048, 64) | 107.03564946728423 | 106.82432377861258 |
| noop | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 371.31799283918406 | 379.7555394732925 |
| causal | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 275.97762744310455 | 276.71106853992995 |
| alibi | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 261.6648679783462 | 259.4127232060398 |
| sliding_window | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 237.03108223577615 | 233.92710216149527 |
| document_mask | torch.bfloat16 | (2, 16, 2048, 4, 2048, 128) | 172.13926800371152 | 168.74390922407585 |
| noop | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 381.50199487767276 | 383.9043681999597 |
| causal | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 307.9748883093411 | 312.2403515462001 |
| alibi | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 251.11319684705438 | 243.17870127827277 |
| sliding_window | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 236.3253127246763 | 223.81250201769552 |
| document_mask | torch.bfloat16 | (2, 16, 4096, 4, 4096, 64) | 154.55693991756874 | 153.11360584987685 |
| noop | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 407.11400078586615 | 413.53709886086557 |
| causal | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 348.1705797722622 | 360.09771155957367 |
| alibi | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 321.8593280850388 | 318.2882327401255 |
| sliding_window | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 270.089032013835 | 268.767323026064 |
| document_mask | torch.bfloat16 | (2, 16, 4096, 4, 4096, 128) | 238.07324557907788 | 228.09842078362692 |
| noop | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 399.8172853171901 | 401.0954526332136 |
| causal | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 363.4387330438581 | 364.13111024232677 |
| alibi | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 294.1752429133857 | 283.7235663368415 |
| sliding_window | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 256.8389394007649 | 246.91771015606483 |
| document_mask | torch.bfloat16 | (2, 16, 8192, 4, 8192, 64) | 199.3378564292656 | 192.40439590901758 |
| noop | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 425.5150965556111 | 430.8190098707553 |
| causal | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 396.00437184073013 | 411.3873625655787 |
| alibi | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 369.92803661607815 | 361.43244467343663 |
| sliding_window | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 293.4277354412933 | 295.2529537595746 |
| document_mask | torch.bfloat16 | (2, 16, 8192, 4, 8192, 128) | 288.0208673072841 | 281.51896404878863 |
| noop | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 408.3005367220567 | 408.96116482298913 |
| causal | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 396.90095962766304 | 396.87385456176486 |
| alibi | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 319.0534576137999 | 302.50950358107764 |
| sliding_window | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 270.3334977708081 | 258.8506349486557 |
| document_mask | torch.bfloat16 | (2, 16, 16384, 4, 16384, 64) | 227.46824134365394 | 222.23759438128766 |
| noop | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 438.24247309479694 | 437.7975163205371 |
| causal | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 428.34012029699227 | 433.3215899950434 |
| alibi | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 386.52672049728875 | 388.26216893354984 |
| sliding_window | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 302.71976814728083 | 302.3574867306459 |
| document_mask | torch.bfloat16 | (2, 16, 16384, 4, 16384, 128) | 327.39760662780986 | 308.6348428844912 |
| noop | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 423.31308678262695 | 426.6306972137279 |
| causal | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 412.6983690923106 | 419.4961977664297 |
| alibi | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 337.41003544742273 | 324.2155049126126 |
| sliding_window | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 278.7755890910794 | 265.9194286636502 |
| document_mask | torch.bfloat16 | (2, 16, 32768, 4, 32768, 64) | 251.55678254755364 | 244.8843180141462 |
| noop | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 452.5930781172308 | 457.7117122300742 |
| causal | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 445.05676260348116 | 463.9304535499636 |
| alibi | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 415.78302138389415 | 406.29229555271456 |
| sliding_window | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 308.0311067300895 | 304.91354721414314 |
| document_mask | torch.bfloat16 | (2, 16, 32768, 4, 32768, 128) | 351.43943626809335 | 329.4476923070317 |
| noop | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 295.1801525813241 | 291.36521287398904 |
| causal | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 183.23250549178067 | 182.35421238887605 |
| alibi | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 151.56832453117747 | 151.3422139154794 |
| sliding_window | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 171.02111935180432 | 160.72516856727913 |
| document_mask | torch.bfloat16 | (4, 16, 1024, 16, 1024, 64) | 74.05765122783826 | 74.5885345035243 |
| noop | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 314.3587394591763 | 319.2938677773619 |
| causal | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 224.57002084153177 | 225.48868542008177 |
| alibi | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 216.00964804143052 | 215.39576159953486 |
| sliding_window | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 216.1174237618258 | 214.28437413525663 |
| document_mask | torch.bfloat16 | (4, 16, 1024, 16, 1024, 128) | 121.08920423648368 | 119.55813661872644 |
| noop | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 362.2193857281911 | 360.05005804275936 |
| causal | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 279.8840217430121 | 279.5437918286659 |
| alibi | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 227.76617121021982 | 222.8655938229316 |
| sliding_window | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 215.43141176970562 | 207.71852284994702 |
| document_mask | torch.bfloat16 | (4, 16, 2048, 16, 2048, 64) | 121.35588364218539 | 121.20636565046884 |
| noop | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 365.1545280898012 | 373.37585444987326 |
| causal | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 304.360119952975 | 309.1247297936263 |
| alibi | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 287.2603904544586 | 289.25547903162595 |
| sliding_window | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 257.9852675272418 | 257.59069234098115 |
| document_mask | torch.bfloat16 | (4, 16, 2048, 16, 2048, 128) | 188.35158496670232 | 184.24683960154857 |
| noop | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 389.9744911369211 | 388.43466897254166 |
| causal | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 345.9228295166513 | 342.63034895210126 |
| alibi | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 279.56334658247437 | 271.2724375402088 |
| sliding_window | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 245.66477202810066 | 233.49688207371258 |
| document_mask | torch.bfloat16 | (4, 16, 4096, 16, 4096, 64) | 170.3270720653187 | 166.23863845657382 |
| noop | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 400.0041140827554 | 402.11182445396497 |
| causal | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 363.64641830327434 | 375.9288663364792 |
| alibi | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 341.5776139573363 | 335.1160003213424 |
| sliding_window | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 281.1811770268521 | 280.21438270014005 |
| document_mask | torch.bfloat16 | (4, 16, 4096, 16, 4096, 128) | 247.78716118997716 | 245.3269825179633 |
| noop | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 403.794126680488 | 405.2353919019577 |
| causal | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 387.079178426863 | 385.1461762057035 |
| alibi | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 309.7847188173431 | 298.0443968374749 |
| sliding_window | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 262.4721750159666 | 250.81679725428586 |
| document_mask | torch.bfloat16 | (4, 16, 8192, 16, 8192, 64) | 205.70866004479979 | 202.9620839129557 |
| noop | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 413.380982988662 | 418.40270594263103 |
| causal | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 398.450064800682 | 409.6794973994029 |
| alibi | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 372.26297458194466 | 364.44415106552196 |
| sliding_window | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 293.0818569905912 | 292.85172400643984 |
| document_mask | torch.bfloat16 | (4, 16, 8192, 16, 8192, 128) | 296.46717085592087 | 285.76362010612763 |
| noop | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 419.3186786037592 | 426.08801580934437 |
| causal | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 408.1648467766632 | 409.4122254207817 |
| alibi | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 329.24396020457345 | 313.5200995121138 |
| sliding_window | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 274.61257504571876 | 255.7801815432177 |
| document_mask | torch.bfloat16 | (4, 16, 16384, 16, 16384, 64) | 232.63806001220684 | 230.03020843492314 |
| noop | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 435.0785891054788 | 440.39101804225345 |
| causal | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 424.86925312752817 | 435.18898057396825 |
| alibi | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 393.000417896268 | 395.11543361225256 |
| sliding_window | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 297.7755459218185 | 300.7208114715287 |
| document_mask | torch.bfloat16 | (4, 16, 16384, 16, 16384, 128) | 331.71570861760534 | 318.07127352552885 |
| noop | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 424.58602747137405 | 425.84897078470715 |
| causal | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 422.66607285025725 | 423.5524945535485 |
| alibi | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 344.8625760048626 | 331.6793888458635 |
| sliding_window | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 282.0787281511649 | 263.7895634445868 |
| document_mask | torch.bfloat16 | (4, 16, 32768, 16, 32768, 64) | 252.7301927385177 | 245.41844170037427 |
| noop | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 437.0658069164588 | 442.9101960063628 |
| causal | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 433.13788271434646 | 452.3873572709863 |
| alibi | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 404.0959191546953 | 396.7077863894884 |
| sliding_window | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 300.45502211883206 | 301.3439134717943 |
| document_mask | torch.bfloat16 | (4, 16, 32768, 16, 32768, 128) | 344.11003202413934 | 330.8897663350314 |
| noop | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 298.4364205341705 | 291.6793556507056 |
| causal | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 187.6382133139633 | 191.05409897308772 |
| alibi | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 156.55822078636112 | 154.178925976516 |
| sliding_window | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 173.47765221825162 | 169.30862508068464 |
| document_mask | torch.bfloat16 | (4, 16, 1024, 4, 1024, 64) | 74.5885345035243 | 74.52689061607104 |
| noop | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 323.12233826013045 | 328.53889207933514 |
| causal | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 236.75872140126316 | 235.8378325547398 |
| alibi | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 227.17836523816675 | 226.75357076139966 |
| sliding_window | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 224.07209453308036 | 224.07209453308036 |
| document_mask | torch.bfloat16 | (4, 16, 1024, 4, 1024, 128) | 122.85572156047981 | 121.11642183704716 |
| noop | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 361.3123326658092 | 360.71014086458337 |
| causal | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 281.5287983927017 | 281.94301754758345 |
| alibi | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 232.7456696285686 | 226.50976826432776 |
| sliding_window | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 221.5612361744038 | 214.96188822837055 |
| document_mask | torch.bfloat16 | (4, 16, 2048, 4, 2048, 64) | 121.38311528944315 | 120.85441868178513 |
| noop | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 380.2579019244734 | 389.2520157863988 |
| causal | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 316.95230660496924 | 317.87597790618906 |
| alibi | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 301.07968126657323 | 298.02424098422983 |
| sliding_window | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 267.2240756921594 | 267.16353549228154 |
| document_mask | torch.bfloat16 | (4, 16, 2048, 4, 2048, 128) | 189.82761622494257 | 186.736450261963 |
| noop | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 389.88665375406805 | 387.9125133037077 |
| causal | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 348.70619958684887 | 346.6750499749774 |
| alibi | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 280.5472989906087 | 271.22300822012187 |
| sliding_window | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 250.02397620165968 | 241.22532776331445 |
| document_mask | torch.bfloat16 | (4, 16, 4096, 4, 4096, 64) | 171.67817496107645 | 166.95679280483972 |
| noop | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 412.626880230807 | 417.60238657950777 |
| causal | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 374.8829313933945 | 389.4448546468815 |
| alibi | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 353.20410434172436 | 345.7072490717473 |
| sliding_window | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 292.51045924209586 | 291.66621022138287 |
| document_mask | torch.bfloat16 | (4, 16, 4096, 4, 4096, 128) | 251.6264062063495 | 248.45110052911542 |
| noop | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 404.0155784550126 | 401.90546837237514 |
| causal | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 384.4389015599863 | 386.9684324594344 |
| alibi | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 313.3731284132225 | 298.17074251037894 |
| sliding_window | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 264.19199737284265 | 252.8982463999916 |
| document_mask | torch.bfloat16 | (4, 16, 8192, 4, 8192, 64) | 207.03696315185684 | 202.86697323136772 |
| noop | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 428.2436763312506 | 433.45005568619536 |
| causal | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 411.8516531869893 | 428.2753623461049 |
| alibi | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 384.9095037182509 | 372.90888743000744 |
| sliding_window | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 303.2438915629836 | 302.05095952914337 |
| document_mask | torch.bfloat16 | (4, 16, 8192, 4, 8192, 128) | 301.8689122735564 | 285.0363190513223 |
| noop | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 423.13592231504805 | 420.3991500185611 |
| causal | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 407.44527331585493 | 408.5064370765247 |
| alibi | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 330.50050996167414 | 316.8763979925965 |
| sliding_window | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 274.6833786307413 | 259.86098862141324 |
| document_mask | torch.bfloat16 | (4, 16, 16384, 4, 16384, 64) | 232.24019584158367 | 226.52040268160232 |
| noop | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 444.4596314237808 | 455.99558915752266 |
| causal | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 437.4245561244369 | 455.98275147271966 |
| alibi | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 397.3350686877605 | 397.88875599028063 |
| sliding_window | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 308.53809114394545 | 307.1359822042007 |
| document_mask | torch.bfloat16 | (4, 16, 16384, 4, 16384, 128) | 331.32379843423774 | 316.85293191675646 |
| noop | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 422.4622274366379 | 425.0407156418684 |
| causal | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 420.9547052783101 | 430.33779243510276 |
| alibi | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 345.50265346504085 | 332.094855328957 |
| sliding_window | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 280.81715528243365 | 264.6543640282054 |
| document_mask | torch.bfloat16 | (4, 16, 32768, 4, 32768, 64) | 252.25635200421783 | 245.46235499490305 |
| noop | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 452.5524207341139 | 461.7512032176736 |
| causal | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 445.2316469907137 | 464.4523799578466 |
| alibi | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 416.87264016717023 | 409.17124592157046 |
| sliding_window | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 309.42579489389846 | 307.9734464665731 |
| document_mask | torch.bfloat16 | (4, 16, 32768, 4, 32768, 128) | 350.50782004300623 | 330.98959545427294 |

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157767
Approved by: https://github.com/Skylion007
This commit is contained in:
drisspg
2025-08-25 17:55:38 +00:00
committed by PyTorch MergeBot
parent adecb0c9e8
commit fc69c2bc67
6 changed files with 98 additions and 120 deletions

View File

@ -1514,17 +1514,21 @@ class TritonTemplate(KernelTemplate):
for name, val in kwargs.items():
defines.write(f"{name} : tl.constexpr = {val}\n")
defines = defines.getvalue()
fake_out = ir.Buffer(name="buf_out", layout=layout)
kernel_name = f"triton_{self.name}"
numel = sympy_product(layout.size)
buffers = itertools.chain(input_nodes, (fake_out,))
if not TritonScheduling.can_use_32bit_indexing(numel, buffers):
raise NotImplementedError(
"64-bit indexing is not yet implemented for triton templates"
)
if TritonScheduling.can_use_32bit_indexing(numel, buffers):
index_dtype = "tl.int32"
else:
index_dtype = "tl.int64"
# Add index dtype to defines so it's available in the template
defines.write(f"INDEX_DTYPE : tl.constexpr = {index_dtype}\n")
defines = defines.getvalue()
kernel_options = {
"input_nodes": input_nodes,