Compare commits

...

412 Commits

Author SHA1 Message Date
f063efb639 Update
[ghstack-poisoned]
2025-10-31 15:06:32 -07:00
cf37d81dc1 Update (base update)
[ghstack-poisoned]
2025-10-31 15:06:32 -07:00
2699f5410b Revert "[xpu][feature] Integrate OneDNN SDPA training forward/backward into XPU OVERRIDEABLE Backend (#162454)"
This reverts commit fd68d409ada709450ced3030bde89ec662a3f7b7.

Reverted https://github.com/pytorch/pytorch/pull/162454 on behalf of https://github.com/atalman due to internal build failure ([comment](https://github.com/pytorch/pytorch/pull/162454#issuecomment-3475009089))
2025-10-31 21:58:52 +00:00
9970fb97ff Fix Tril Triu SymInt (#166627)
Fixes #165613

### Summary:

- This PR fixes an issue where `torch.tril` and `torch.triu` with dynamic diagonal values cause torch.export to incorrectly infer unnecessary constraints between dynamic dimensions.
- Ensured proper SymInt type annotations for the diagonal parameter
-  Updated C++ implementation to correctly handle SymInt diagonal values.
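
A minimal illustrative repro of the scenario this guards against (hedged: the actual regression test and issue #165613 may differ in details):

```python
import torch
from torch.export import Dim, export


class M(torch.nn.Module):
    def forward(self, x, y):
        # diagonal derived from a dynamic dimension of another input
        return torch.tril(x, diagonal=y.shape[0])


x, y = torch.randn(8, 8), torch.randn(4)
dynamic_shapes = {"x": {0: Dim("a"), 1: Dim("b")}, "y": {0: Dim("c")}}
# Before the fix, export could tie the dims of `x` to the dim of `y`;
# after the fix the three dims stay independent.
ep = export(M(), (x, y), dynamic_shapes=dynamic_shapes)
print(ep.graph)
```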

### Impacts:
module: dynamic shapes

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166627
Approved by: https://github.com/ezyang, https://github.com/Skylion007
2025-10-31 21:53:20 +00:00
dfebdcab86 [GraphPartition] cache get_free_symbol_uses (#166338)
Graph partition relies on `get_free_symbol_uses()` to collect symbol inputs.
ee7434be82/torch/_inductor/scheduler.py (L4869-L4885)

I empirically observed that `get_free_symbol_uses()` becomes slower for larger graphs. Specifically, I tried an aten fallback for torchtitan, which results in 10k+ aten nodes. When processing the 600-th node, it takes seconds to run `get_free_symbol_uses()` for a single node.

Why? Because `get_free_symbol_uses()` may recursively call `get_free_symbol_uses()` on other nodes, so the same work can be repeated many times.
ee7434be82/torch/_inductor/ir.py (L4541-L4543)

This PR fixes the issue by caching the results of `get_free_symbol_uses()`. I validated on torchtitan that the issue is fixed.
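
A minimal sketch of the caching idea (illustrative only; names and signatures differ from the actual Inductor IR code):

```python
import sympy


class NodeSketch:
    """Toy IR node: free-symbol collection is recursive, so cache it per node."""

    def __init__(self, own_symbols=(), children=()):
        self.own_symbols = set(own_symbols)
        self.children = list(children)
        self._free_symbol_cache = None

    def get_free_symbol_uses(self):
        # Without the cache, large graphs recompute the same subtrees over and
        # over; with it, each node's symbols are collected exactly once.
        if self._free_symbol_cache is None:
            syms = set(self.own_symbols)
            for child in self.children:
                syms |= child.get_free_symbol_uses()
            self._free_symbol_cache = frozenset(syms)
        return self._free_symbol_cache


s0, s1 = sympy.symbols("s0 s1")
leaf = NodeSketch(own_symbols=[s0])
root = NodeSketch(own_symbols=[s1], children=[leaf, leaf])
print(root.get_free_symbol_uses())  # frozenset({s0, s1})
```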

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166338
Approved by: https://github.com/eellison
2025-10-31 21:24:05 +00:00
b09fb481e0 [CD] Upgrade GCC version to 13 for XPU build (#162474)
Follow #152426
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162474
Approved by: https://github.com/zxiiro, https://github.com/atalman
2025-10-31 21:15:37 +00:00
4e7232c5da [MPS] Fix smooth_l1_loss backward for fp16 (#166687)
And enable fp16 implementation for CPU, which simplifies OpInfo definitions for the op

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166687
Approved by: https://github.com/Skylion007
ghstack dependencies: #166214
2025-10-31 21:13:46 +00:00
93a70c717a Revert "Add CUDA MXFP4 scaled mm support via. FBGEMM (#166526)"
This reverts commit e3ae0594d16134632ff587c9ab400d4148c83e9f.

Reverted https://github.com/pytorch/pytorch/pull/166526 on behalf of https://github.com/atalman due to Failing internal test ([comment](https://github.com/pytorch/pytorch/pull/166526#issuecomment-3474907536))
2025-10-31 21:10:28 +00:00
d97144d31e [5/N] Remove unused loop variables in tests (#166716)
This PR removes unused loop variables in tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166716
Approved by: https://github.com/Lucaskabela, https://github.com/Skylion007
2025-10-31 20:47:57 +00:00
e4043884c7 [dynamo, 3.14] fix segfault due to improper create_call_function_ex (#166678)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166678
Approved by: https://github.com/malfet
2025-10-31 20:44:53 +00:00
4a7bc1d522 [BE][Typing][Dynamo] Type misc files in torch/_dynamo/variables/ (#166569)
Provides type coverage to ~3000 LOC and 200 methods in  `torch/_dynamo/variables/`

This is the first part of the final step to having 100% strict type coverage in dynamo - see previous comments in https://github.com/pytorch/pytorch/pull/166535 (combined into this one PR because ghstack was giving issues...)

### Coverage report:
```
mypy torch_dynamo/variables --linecount-report /tmp/coverage_log
```
Compare before to after - we go from 3826 to 7221 lines covered

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166569
Approved by: https://github.com/williamwen42, https://github.com/Skylion007
2025-10-31 20:42:27 +00:00
8209a0506b [Pytorch] Enable aarch64 convert autovec only on clang (#166739)
Summary: We've noted issues with modern GCC versions. Until further investigation is carried out, we'll leave the code enabled only on clang.

Test Plan: CI

Differential Revision: D85968395

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166739
Approved by: https://github.com/mcfi, https://github.com/Skylion007, https://github.com/robert-hardwick
2025-10-31 20:22:33 +00:00
70aeb49198 [dynamo] clarify graph break handling/logging in symbolic_convert (#166587)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166587
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166476, #166477, #166586
2025-10-31 20:13:16 +00:00
cf9a834f39 [BE] Move GreenContext implementation details to cpp (#166462)
- Remove all complex defines logic from the header
- Make GreenContext constructor private, as  it should only be created via the static method as singleton
- Delete unused `getContext` and `getGreenContext` methods
- Rename `CUDA_HAS_GREEN_CONTEXT` to `HAS_CUDA_GREEN_CONTEXT()`, which results in a compilation error if one accidentally makes a typo
- Suppress `-Wunused-private-field` if GreenContext is not available
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166462
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-31 20:11:02 +00:00
856a7a5298 Add missing device to namedtensor tests (#166717)
This PR passes unused `device` argument to tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166717
Approved by: https://github.com/Skylion007
2025-10-31 20:04:41 +00:00
ef8d97efcf fix broken nn_convolution test (#166666)
Summary: Broken by an OSS diff from a third-party contributor during oncall.

Test Plan: buck test 'fbcode//mode/dev-nosan' fbcode//caffe2/test:nn_convolution -- --run-disabled

Differential Revision: D85899891

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166666
Approved by: https://github.com/atalman, https://github.com/seemethere, https://github.com/Skylion007
2025-10-31 19:59:50 +00:00
d2be06f673 [cpu][fix] Update ACL version to fix crashes with tensor sizes > 2^31-1 (#165904)
----

- Updates Arm Compute Library (ACL) to v52.6.0
- v52.6.0 contains https://github.com/ARM-software/ComputeLibrary/pull/1201 which fixes crashes with tensors of sizes > 2^31-1

fixes: #165654

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165904
Approved by: https://github.com/malfet
2025-10-31 19:37:26 +00:00
08f4535378 Refactor AOTAutogradCacheEntry into AOTAutogradResult (#166656)
This PR refactors the name AOTAutogradCacheEntry into AOTAutogradResult, and BundledAOTAutogradCacheEntry into BundledAOTAutogradResult. It also moves all corresponding code to a new file, `aot_autograd_result`, which is analogous to `output_code.py` from Inductor.

Having all these be called cache entries made sense when all we used them for was caching. But with AOT compile using BundledAOTAutogradCacheEntry, we want a more generalized naming structure.

This is a no-op change,  and all existing tests should pass.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166656
Approved by: https://github.com/zhxchen17
ghstack dependencies: #166650
2025-10-31 18:54:09 +00:00
30157d30f0 Add regional aot eager support to AOTAutogradCacheEntry (#166650)
This PR does two things:

- It genericizes `BundledAOTAutogradCacheEntry` to support *any* OutputCode, not just `CompiledFxGraph`s
- It adds a brand new OutputCode for the `aot_eager_regional_inductor` backend, i.e. a graph module that has regional inductor components in it.

This allows BundledAOTAutogradCache to just integrate nicely with inductor out of the box, but more importantly, it allows the result of aot_autograd to be fully serializable when using `aot_eager_regional_inductor`. This will allow us to AOT precompile cases where we have an eager graph that has scooped up inductor bits.

It's a bit unfortunate that the naming makes BundledAOTAutogradCacheEntry sound like its primary use is for caching, but really the more common use is going to be as an AOTAutogradOutput. It may be worth revisiting how to refactor/rename these in a later PR:

- AOTAutogradCacheEntry -> AOTAutogradResult
- BundledAOTAutogradCacheEntry -> BundledAOTAutogradResult

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166650
Approved by: https://github.com/zhxchen17
2025-10-31 18:54:09 +00:00
b470e59c38 partitioner option to ignore partitioner_tag for abstract usage (#166725)
Partitioner functionality is appealing to use in different scenarios (e.g., Autoparallel).

We have special logic about "partitioner_tag" from meta that is only needed for the forward/backward split.

Adding an optional argument to skip it and do only a generic split based on inputs/outputs.

Potentially we want to expose `_extract_graph_with_inputs_outputs` without the underscore :)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166725
Approved by: https://github.com/bdhirsh
2025-10-31 18:50:02 +00:00
85b85f6c2c Revert "[pytree] add treespec_{leaf,tuple,dict} functions for args_spec modification (#160843)"
This reverts commit 108bb224f77842593009214ebf6258030b934642.

Reverted https://github.com/pytorch/pytorch/pull/160843 on behalf of https://github.com/atalman due to failing internal builds ([comment](https://github.com/pytorch/pytorch/pull/160843#issuecomment-3474354428))
2025-10-31 18:31:32 +00:00
b71966f67b [PyTorch] Improve aarch64 performance of bfloat16 ops - retry (#166028) (#166641)
Summary:

This PR allows the compiler to better optimize some bfloat16-based operations when run on NEON.

Retrying to land the code, after noting that these expressions became available in recent compiler versions.

The current CI benchmark `binary_test.py` will measure the affected codepaths.

Benchmarks show measurable improvements on clang-19, when targeting armv9-a+sve2:

Before:
bfloat16 add: 250.503us
bfloat16 sub: 245.674us
bfloat16 neg: 113.945us
bfloat16 abs: 115.953us
bfloat16 reciprocal: 262.602us

After:
bfloat16 add: 203.862us ---> 23% higher throughput
bfloat16 sub: 201.526us ---> 22% higher throughput
bfloat16 neg: 68.416us ---> 67% higher throughput
bfloat16 abs: 71.003us  ---> 63% higher throughput
bfloat16 reciprocal: 177.834us ---> 48% higher throughput
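
For reference, a rough local timing sketch along these lines (this is not the CI `binary_test.py` benchmark; absolute numbers depend on machine, build flags, and thread count):

```python
import time

import torch

x = torch.randn(1_000_000, dtype=torch.bfloat16)
y = torch.randn(1_000_000, dtype=torch.bfloat16)
ops = {
    "add": lambda: x + y,
    "sub": lambda: x - y,
    "neg": lambda: -x,
    "abs": lambda: x.abs(),
    "reciprocal": lambda: x.reciprocal(),
}
for name, fn in ops.items():
    fn()  # warm-up
    start = time.perf_counter()
    for _ in range(100):
        fn()
    per_iter_us = (time.perf_counter() - start) / 100 * 1e6
    print(f"bfloat16 {name}: {per_iter_us:.3f}us")
```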

Test Plan:
Correctness:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Performance:

buck2 run mode/opt //caffe2/benchmarks/operator_benchmark/fb:operator_benchmark_test

Reviewed By: mcfi

Differential Revision: D85809843

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166641
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-10-31 18:21:04 +00:00
0a06d8ea3b Update
[ghstack-poisoned]
2025-10-31 11:05:43 -07:00
dd0e7aa6bf Update (base update)
[ghstack-poisoned]
2025-10-31 11:05:43 -07:00
0947765eb9 Cache even more work for return_and_correct_aliasing (#166365)
Yet another pass found even more work we can move to be done only once. This seems to knock a few microseconds off the DTensor dispatch fast path.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166365
Approved by: https://github.com/bdhirsh
2025-10-31 18:03:05 +00:00
239e7b541a [ROCm][CI] upgrade nightly wheels to ROCm 7.1 (#166730)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166730
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-31 17:30:47 +00:00
ffaa6578b7 Revise deprecation warning for ONNX exporter (#166692)
Updated deprecation warning for ONNX export to reflect the current state.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166692
Approved by: https://github.com/titaiwangms
2025-10-31 17:23:55 +00:00
365ed62f61 Document LibTorch ABI more, add README to headeronly (#166661)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166661
Approved by: https://github.com/mikaylagawarecki, https://github.com/albanD
2025-10-31 17:18:13 +00:00
fcc1063566 Revert "[BE][Typing][Dynamo] Type misc files in torch/_dynamo/variables/ (#166569)"
This reverts commit aa9c96af041b26c9c55adac490f3449b98f27d06.

Reverted https://github.com/pytorch/pytorch/pull/166569 on behalf of https://github.com/Lucaskabela due to Lintrunner not fixed due to race condition at landing ([comment](https://github.com/pytorch/pytorch/pull/166569#issuecomment-3474012637))
2025-10-31 16:59:33 +00:00
121235956b update Node.is_impure check if subgraph contains impure ops (#166609)
Summary:
## Context
When `const_fold.split_const_subgraphs` sees a `call_module` node whose target is a GraphModule, the existing implementation can mark the node as const-foldable when it shouldn't.

For example, a parent graph contains a `call_module` to a subgraph that has no inputs but contains impure ops inside.
```
parent graph():
    %sub : [num_users=1] = call_module[target=sub](args = (), kwargs = {})
    %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%sub, slice(None, None, None)), kwargs = {})
    return (getitem,)

submodule graph():
    %randn : [num_users=1] = call_function[target=torch.ops.aten.randn.default](args = ([5, 10],), kwargs = {device: cpu, pin_memory: False})
    %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%randn, 1), kwargs = {})
    return (add,)
```
When the `submodule` graph is fed to `const_fold.split_const_subgraphs`, it comes out unmodified since `randn` is impure.

But if the `submodule` is called by a `parent` graph and the `parent` is fed to `const_fold.split_const_subgraphs`, it comes out folded.
```
parent after fold graph():
    %_fx_const_folded_attrs : [num_users=1] = get_attr[target=_FX_CONST_FOLDED_ATTRS]
    return (_fx_const_folded_attrs,)
```

This is because the `node.is_impure()` check inside `const_fold.split_const_subgraphs` falls through, leading the `call_module` node to be marked as pure.

## Fix

We can update the `fx.node.Node.is_impure` function to check for ops inside a `call_module` node with an additional `subgraph_has_impure_ops` check:
- if a `call_module` node calls a GraphModule,
- check whether any `call_function` nodes are impure ops
- recursively check any `call_module` nodes that call a GraphModule

If the `call_module` subgraph has impure ops, return True from `is_impure`.
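
A sketch of what that check can look like (hedged: names and the exact wiring into `Node.is_impure` are illustrative, not the code in this diff):

```python
import torch.fx as fx


def subgraph_has_impure_ops(gm: fx.GraphModule) -> bool:
    """Return True if any op in `gm`, or in a nested GraphModule it calls, is impure."""
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.is_impure():
            return True
        if node.op == "call_module":
            submod = gm.get_submodule(node.target)
            if isinstance(submod, fx.GraphModule) and subgraph_has_impure_ops(submod):
                return True
    return False
```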

Test Plan: added tests to test_fx_const_fold.py

Differential Revision: D85798483

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166609
Approved by: https://github.com/blaine-rister
2025-10-31 16:58:18 +00:00
aa9c96af04 [BE][Typing][Dynamo] Type misc files in torch/_dynamo/variables/ (#166569)
Provides type coverage to ~3000 LOC and 200 methods in  `torch/_dynamo/variables/`

This is the first part of the final step to having 100% strict type coverage in dynamo - see previous comments in https://github.com/pytorch/pytorch/pull/166535 (combined into this one PR because ghstack was giving issues...)

### Coverage report:
```
mypy torch_dynamo/variables --linecount-report /tmp/coverage_log
```
Compare before to after - we go from 3826 to 7221 lines covered

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166569
Approved by: https://github.com/williamwen42
2025-10-31 16:56:50 +00:00
c3b71d5499 [ROCm][CI] remove relaxed tolerance for tf32 tests (#166478)
Instead of relaxing tolerances for certain unit tests that exercise TF32 on MI300, skip the tests until hipblaslt accuracy is improved.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166478
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
2025-10-31 16:15:42 +00:00
1e3600b528 [MPS] Move logaddexp/logaddexp2 to Metal and support complex (#166670)
NOTE: Complex inputs are only supported in `logaddexp`. Since `logaddexp2` does not support complex inputs for CPU, it is not enabled for MPS in this PR either.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166670
Approved by: https://github.com/malfet
2025-10-31 16:15:02 +00:00
fee7624bd6 [PT2] set choice handler in config (#166607)
Summary:
We were setting the custom inductor choice using `torch._inductor.virtualized.V.set_choices_handler(CustomInductorChoices())`. However, this leads to inconsistent behaviors, even for jobs that are submitted back to back.

In this diff, we pass in the choice handler via an inductor config and overwrite the default behavior when the config is provided. This solves the inconsistent behavior.

Test Plan: see D85785892 (internal only)

Differential Revision: D85785879

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166607
Approved by: https://github.com/eellison
2025-10-31 15:40:05 +00:00
24e94e021a [ROCm][CI] create ROCm 7.1 magma tarball (#166693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166693
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-31 15:20:00 +00:00
69be99ee51 Remove manually synced arch versions in tools/nightly.py (#166616)
Discussed with @atalman offline. This reduces duplicate changes and the number of files to change when updating arch versions.

------

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166616
Approved by: https://github.com/ezyang
2025-10-31 15:11:28 +00:00
034e951b0c [CUDA][cuBLASLt] addmm -- extend bias fusions to cases with (1 by n) shapes (#166307)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166307
Approved by: https://github.com/eqy
2025-10-31 14:30:41 +00:00
160ab53dd5 Update weight tensor initialization in RMSNormalization (#166550)
Ensure the weight is a >1-D tensor for ORT compatibility.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166550
Approved by: https://github.com/titaiwangms
2025-10-31 14:29:27 +00:00
5bcfdae71d Revert "Make PT2 compile backprop through custom op without autograd key a hard error (#166367)"
This reverts commit 4acc66f1192ab7743abcc50383aefc5447447f9d.

Reverted https://github.com/pytorch/pytorch/pull/166367 on behalf of https://github.com/atalman due to internal build failures ([comment](https://github.com/pytorch/pytorch/pull/166367#issuecomment-3473150269))
2025-10-31 13:44:05 +00:00
4e8ba37ce3 Revert "[BE] Move GreenContext implementation details to cpp (#166462)"
This reverts commit 5d288bc3f73873887f681e15af83c5525e6a60bd.

Reverted https://github.com/pytorch/pytorch/pull/166462 on behalf of https://github.com/atalman due to Sorry, Reverting. Failure: test/test_matmul_cuda.py::TestMatmulCudaCUDA::test_greencontext_carveout_cuda [GH job link](https://github.com/pytorch/pytorch/actions/runs/18962393091/job/54154156892) [HUD commit link](85b035ca9c) ([comment](https://github.com/pytorch/pytorch/pull/166462#issuecomment-3473060299))
2025-10-31 13:20:48 +00:00
26534e9809 Revert "[GraphPartition] cache get_free_symbol_uses (#166338)"
This reverts commit a6b1ef17173f56ba93ac97ff4384fa4060b5e41e.

Reverted https://github.com/pytorch/pytorch/pull/166338 on behalf of https://github.com/atalman due to Failure: test/nn/test_convolution.py::TestConvolutionNN::test_conv3d_overflow_values [GH job link](https://github.com/pytorch/pytorch/actions/runs/18961173726/job/54149112920) [HUD commit link](a6b1ef1717) ([comment](https://github.com/pytorch/pytorch/pull/166338#issuecomment-3472980329))
2025-10-31 12:57:56 +00:00
657f8c3e21 Revert "Fix torch.full with dynamic tensor fill_value in torch.compile (#166554)"
This reverts commit 32066772b3dee643b1657b8957f32b5ac8b1390a.

Reverted https://github.com/pytorch/pytorch/pull/166554 on behalf of https://github.com/atalman due to Failure: test/nn/test_pooling.py::TestPoolingNNDeviceTypeCPU::test_max_pool_nan_inf_cpu_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/18959368975/job/54144148546) [HUD commit link](32066772b3) ([comment](https://github.com/pytorch/pytorch/pull/166554#issuecomment-3472976911))
2025-10-31 12:55:31 +00:00
b0831930ed [inductor] Mark / restrict tests that only work if ATen is used for matmul (#166518)
These tests only work if max_autotune=False (default), which for matmul means falling back to ATen. This PR just documents / makes that transparent.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166518
Approved by: https://github.com/eellison
2025-10-31 12:29:06 +00:00
c01636e1bc Fixes the sparse tensor issue (#163535)
Fixes #148324

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163535
Approved by: https://github.com/janeyx99
2025-10-31 11:48:31 +00:00
fd68d409ad [xpu][feature] Integrate OneDNN SDPA training forward/backward into XPU OVERRIDEABLE Backend (#162454)
This is the second PR split from https://github.com/pytorch/pytorch/pull/156272

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162454
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/drisspg
2025-10-31 11:20:38 +00:00
0d3a4f7155 [CD] Enable Inductor performance test for xpu (#166289)
Add Dynamo benchmark performance tests for XPU backend

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166289
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-10-31 10:52:07 +00:00
108bb224f7 [pytree] add treespec_{leaf,tuple,dict} functions for args_spec modification (#160843)
The goal of this PR is to provide a standard way to create simple treespec instances and hide the implementation details of the `PyTreeSpec` class.

Changes:

1. Add function `treespec_leaf()` to replace `LeafSpec()`.
2. Add function `treespec_tuple(...)` and `treespec_dict(...)` to create treespec for `tuple` / `dict` which is used for `*args` / `**kwargs`. This avoids direct modification to `treespec` instances that rely on the implementation details of the `PyTreeSpec` class.
3. Change `len(spec.children_specs)` to `spec.num_children`.
4. Change `isinstance(spec, LeafSpec)` to `spec.is_leaf()`.
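
An illustrative usage sketch of the new helpers (assuming they are exposed on `torch.utils._pytree` and that `treespec_tuple`/`treespec_dict` take the child specs, per the description above):

```python
import torch.utils._pytree as pytree

leaf = pytree.treespec_leaf()                    # instead of LeafSpec()
args_spec = pytree.treespec_tuple([leaf, leaf])  # spec for *args with two leaves
kwargs_spec = pytree.treespec_dict({"x": leaf})  # spec for **kwargs

print(leaf.is_leaf())          # True (instead of isinstance(spec, LeafSpec))
print(args_spec.num_children)  # 2 (instead of len(spec.children_specs))
```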

------

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160843
Approved by: https://github.com/mlazos
2025-10-31 10:33:16 +00:00
fc8ac1216c [4/N] Remove unused loop variables in tests (#166690)
This PR removes unused loop variables in tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166690
Approved by: https://github.com/justinchuby, https://github.com/mlazos
2025-10-31 10:20:48 +00:00
030de07aff [2/N] Use 'is' in callable comparisons (#166685)
It is generally advised to use `is/is not` for comparisons against torch functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166685
Approved by: https://github.com/xmfan, https://github.com/mlazos
2025-10-31 08:08:07 +00:00
9ec069e91b Update
[ghstack-poisoned]
2025-10-31 00:28:07 -07:00
7d67a41db4 make FXConverter.generate use V.fake_mode instead of _detect_fake_mode_from_gm (#166591)
Summary:
FXConverter configures `_node_metadata_hook`, passing in `fake_mode` explicitly, which is relevant for cases down the line like `_generate_triton_call`, which inserts a `triton_kernel_wrapper_mutation` node.

This `fake_mode` is obtained from `_detect_fake_mode_from_gm`, which can be different from inductor set `V.fake_mode`.

For example, while `V.fake_mode` is not None, `_detect_fake_mode_from_gm` can be **None** for a parent graph containing only a submodule which has no input args and only constants
```
parent graph():
    %sub : [num_users=1] = call_module[target=sub](args = (), kwargs = {})
    %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%sub, slice(None, None, None)), kwargs = {})
    return (getitem,)

submodule graph():
    %randn : [num_users=1] = call_function[target=torch.ops.aten.randn.default](args = ([5, 10],), kwargs = {device: cuda, pin_memory: False})
    %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%randn, 1), kwargs = {})
    return (add,)

```

This discrepancy is problematic: it makes `_node_metadata_hook` try running inputs in a different fake_mode, or no fake_mode, while the rest of lowering uses `V.fake_mode`. In some cases where the input is placed on a custom non-GPU device, it can even complain with "requires device to be started" or a tensor device mismatch.

So this diff updates `FXConverter.generate` to use `V.fake_mode`, which is properly populated by Inductor.

Test Plan:
added a test `test_const_folded_subgraph` in `test_fxir_backend.py`, this test:
- creates a graph module that calls a subgraph with no inputs and containing only const-foldable ops
- const fold the subgraph
- run FXConverter.generate, expect `fake_mode` used to code-generate is not None

With the prior implementation, which used `_detect_fake_mode_from_gm`, this test would fail because `fake_mode` would be `None`.

With this change, the test passes: `fake_mode` is properly collected from `V.fake_mode`, which is not None.

Differential Revision: D85767475

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166591
Approved by: https://github.com/blaine-rister, https://github.com/mlazos, https://github.com/eellison
2025-10-31 05:52:07 +00:00
1f668ef3c3 Update (base update)
[ghstack-poisoned]
2025-10-30 22:24:54 -07:00
248954f687 Update
[ghstack-poisoned]
2025-10-30 22:24:54 -07:00
5518fdfe90 Update (base update)
[ghstack-poisoned]
2025-10-30 22:18:42 -07:00
110c021241 Update
[ghstack-poisoned]
2025-10-30 22:18:42 -07:00
ab036276f1 Update (base update)
[ghstack-poisoned]
2025-10-30 22:04:13 -07:00
ab97de8c82 Update
[ghstack-poisoned]
2025-10-30 22:04:13 -07:00
85b035ca9c [nativert] Downcast triton double arguments to floats (#166620)
This diff tries to fix a limitation in the Sigmoid + Triton interaction, where float arguments are not correctly passed. NativeRT passes float arguments as double, while Triton kernels were reading them as float, resulting in wrong values.

---

## Limitations in (de)serialization

In triton, float arguments to a kernel are encoded as "fp32" ([code](https://github.com/triton-lang/triton-cpu/blob/main-merged/python/triton/runtime/jit.py#L310-L326)):
```
        elif isinstance(arg, float):
            return ("fp32", None)
```
But it seems that torch export serde uses double ([code](d2eff5d454/torch/_export/serde/export_schema.thrift (L149))) because Thrift only has the double type:
```
union Argument {
  10: bool as_none;
  20: TensorArgument as_tensor;
  30: list<TensorArgument> as_tensors;
  50: i64 as_int;
  70: list<i64> as_ints;
  80: double as_float;   ===> actually double
...
```
The `TritonKernel` constructor loads attributes from a node, where `Constant` represents the variant type, and it only has `double` ([code](d2eff5d454/torch/nativert/graph/Graph.h (L86))):
```
using Constant = std::variant<
    None,
    int64_t,
    std::vector<int64_t>,
    double,    ===> triton float is loaded as double
```

So, NativeRT passes float arguments (originally floats in Triton) as double to triton kernels. But all of the Triton backends (nvidia, amd and cpu) read them as float because the signature still says `fp32`.
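
To see why reading a double's memory as fp32 produces garbage rather than a slightly less precise value, here is a hedged, purely illustrative byte-level sketch (not NativeRT code):

```python
import struct

v = 0.1
double_bytes = struct.pack("<d", v)                      # the 8 bytes handed to the kernel
misread = struct.unpack("<f", double_bytes[:4])[0]       # kernel reads them as fp32
downcast = struct.unpack("<f", struct.pack("<f", v))[0]  # explicit double -> float cast (the fix)
print(misread)   # garbage on little-endian, roughly -1.6e-23
print(downcast)  # ~0.10000000149
```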

D84423898 was the current workaround: wrapping float arguments with tensors.

## The Fix

Fixing the Thrift definition isn't viable because Thrift only supports the double type. It's also possible to fix this on the Triton side, by downcasting from double to float there, but that would require fixing all backends.

Instead, I think this diff is the most effective way: when building `TritonKernel`, downcast to float right after loading the double arguments.

Test Plan:
```
buck test fbcode//mode/opt-amd-gpu fbcode//caffe2/test:test_export --
```

Differential Revision: D85747160

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166620
Approved by: https://github.com/XueningXu
2025-10-31 03:52:20 +00:00
267d0197bf [dynamo] fix error_on_graph_break bug where non-empty checkpoint results in unwanted graph break resumption (#166586)
Fixes https://github.com/pytorch/pytorch/issues/166589

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166586
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166476, #166477
2025-10-31 03:36:27 +00:00
1dec8a67a8 [dynamo, nested graph breaks] add disable_nested_graph_breaks decorator/context manager (#166477)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166477
Approved by: https://github.com/Lucaskabela, https://github.com/Skylion007
ghstack dependencies: #166476
2025-10-31 03:36:27 +00:00
797cd80b26 [dynamo, nested graph breaks] codegen dead nested cells correctly (#166476)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166476
Approved by: https://github.com/Lucaskabela
2025-10-31 03:36:27 +00:00
7d39401fa0 Revert "[BE][Typing][Dynamo] Type misc files in torch/_dynamo/variables/ (#166569)"
This reverts commit f1e4c42b6ef3d3cea08ab3babb693e3ce42cf08b.

Reverted https://github.com/pytorch/pytorch/pull/166569 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/166569#issuecomment-3471180280))
2025-10-31 03:31:01 +00:00
e3ae0594d1 Add CUDA MXFP4 scaled mm support via. FBGEMM (#166526)
Summary:

* Pull in `f4f4bf16` from FBGemm to provide MXFP4 support for CUDA
* Add testing

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166526
Approved by: https://github.com/drisspg, https://github.com/ngimel
2025-10-31 03:17:27 +00:00
f1e4c42b6e [BE][Typing][Dynamo] Type misc files in torch/_dynamo/variables/ (#166569)
Provides type coverage to ~3000 LOC and 200 methods in  `torch/_dynamo/variables/`

This is the first part of the final step to having 100% strict type coverage in dynamo - see previous comments in https://github.com/pytorch/pytorch/pull/166535 (combined into this one PR because ghstack was giving issues...)

### Coverage report:
```
mypy torch_dynamo/variables --linecount-report /tmp/coverage_log
```
Compare before to after - we go from 3826 to 7221 lines covered

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166569
Approved by: https://github.com/williamwen42
2025-10-31 02:57:59 +00:00
d3e511f07c [Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324)
**Summary:**
Support masked vectorization for the tail_loop for fp8 datatype.

**Example:**
```
import torch

def fn(
    x,
    scale,
    zero_point,
    quant_min,
    quant_max,
    dtype,
):
    x = torch.ops.quantized_decomposed.dequantize_per_tensor(
        x,
        scale,
        zero_point,
        quant_min,
        quant_max,
        dtype,
    )
    x = torch.relu(x)
    x = torch.ops.quantized_decomposed.quantize_per_tensor(
        x, scale, zero_point, quant_min, quant_max, dtype
    )
    return x

quant_min = -128
quant_max = 127
dtype = torch.float8_e4m3fn
x = torch.clamp(torch.randn((1, 7, 7, 9), dtype=torch.float32) * 100, quant_min, quant_max).to(dtype)
zero_point = 100
scale = 0.01

with torch.no_grad():
    compiled_fn = torch.compile(fn)
    compiled_fn(x, scale, zero_point, quant_min, quant_max, dtype)
```

**Generated code:**

- Before
```
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const at::Float8_e4m3fn* in_ptr0,
                       at::Float8_e4m3fn* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
                {
                    auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp2 = static_cast<float>(100.0);
                    auto tmp3 = at::vec::Vectorized<float>(tmp2);
                    auto tmp4 = tmp1 - tmp3;
                    auto tmp5 = static_cast<float>(0.01);
                    auto tmp6 = at::vec::Vectorized<float>(tmp5);
                    auto tmp7 = tmp4 * tmp6;
                    auto tmp8 = (tmp7);
                    auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
                    auto tmp10 = tmp9 * tmp3;
                    auto tmp11 = tmp10.round();
                    auto tmp12 = tmp11 + tmp3;
                    auto tmp13 = static_cast<float>(-128.0);
                    auto tmp14 = at::vec::Vectorized<float>(tmp13);
                    auto tmp15 = at::vec::maximum(tmp12, tmp14);
                    auto tmp16 = static_cast<float>(127.0);
                    auto tmp17 = at::vec::Vectorized<float>(tmp16);
                    auto tmp18 = at::vec::minimum(tmp15, tmp17);
                    auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
                    tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
                {
                    for (int64_t x0_tail = static_cast<int64_t>(432L);x0_tail < static_cast<int64_t>(441L); x0_tail++)
                    {
                        auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
                        auto tmp1 = c10::convert<float>(tmp0);
                        auto tmp2 = static_cast<float>(100.0);
                        auto tmp3 = float(tmp1 - tmp2);
                        auto tmp4 = static_cast<float>(0.01);
                        auto tmp5 = float(tmp3 * tmp4);
                        auto tmp6 = c10::convert<float>(tmp5);
                        auto tmp7 = std::max(tmp6, decltype(tmp6)(0));
                        auto tmp8 = float(tmp7 * tmp2);
                        auto tmp9 = std::nearbyint(tmp8);
                        auto tmp10 = float(tmp9 + tmp2);
                        auto tmp11 = static_cast<float>(-128.0);
                        auto tmp12 = max_propagate_nan(tmp10, tmp11);
                        auto tmp13 = static_cast<float>(127.0);
                        auto tmp14 = min_propagate_nan(tmp12, tmp13);
                        auto tmp15 = c10::convert<at::Float8_e4m3fn>(tmp14);
                        out_ptr0[static_cast<int64_t>(x0_tail)] = tmp15;
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
        buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
        # [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
        cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```
- After
```
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const at::Float8_e4m3fn* in_ptr0,
                       at::Float8_e4m3fn* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
                {
                    auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp2 = static_cast<float>(100.0);
                    auto tmp3 = at::vec::Vectorized<float>(tmp2);
                    auto tmp4 = tmp1 - tmp3;
                    auto tmp5 = static_cast<float>(0.01);
                    auto tmp6 = at::vec::Vectorized<float>(tmp5);
                    auto tmp7 = tmp4 * tmp6;
                    auto tmp8 = (tmp7);
                    auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
                    auto tmp10 = tmp9 * tmp3;
                    auto tmp11 = tmp10.round();
                    auto tmp12 = tmp11 + tmp3;
                    auto tmp13 = static_cast<float>(-128.0);
                    auto tmp14 = at::vec::Vectorized<float>(tmp13);
                    auto tmp15 = at::vec::maximum(tmp12, tmp14);
                    auto tmp16 = static_cast<float>(127.0);
                    auto tmp17 = at::vec::Vectorized<float>(tmp16);
                    auto tmp18 = at::vec::minimum(tmp15, tmp17);
                    auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
                    tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
                {
                    auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp2 = static_cast<float>(100.0);
                    auto tmp3 = at::vec::Vectorized<float>(tmp2);
                    auto tmp4 = tmp1 - tmp3;
                    auto tmp5 = static_cast<float>(0.01);
                    auto tmp6 = at::vec::Vectorized<float>(tmp5);
                    auto tmp7 = tmp4 * tmp6;
                    auto tmp8 = (tmp7);
                    auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
                    auto tmp10 = tmp9 * tmp3;
                    auto tmp11 = tmp10.round();
                    auto tmp12 = tmp11 + tmp3;
                    auto tmp13 = static_cast<float>(-128.0);
                    auto tmp14 = at::vec::Vectorized<float>(tmp13);
                    auto tmp15 = at::vec::maximum(tmp12, tmp14);
                    auto tmp16 = static_cast<float>(127.0);
                    auto tmp17 = at::vec::Vectorized<float>(tmp16);
                    auto tmp18 = at::vec::minimum(tmp15, tmp17);
                    auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
                    tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
        buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
        # [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
        cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163324
Approved by: https://github.com/Xia-Weiwen, https://github.com/mingfeima, https://github.com/jansel
2025-10-31 02:53:56 +00:00
d3be06cbdc [MTIAGraph][Pytorch][2/n] Add binding for Python to C++, and hook for Pytorch to Fbcode (#165963)
Summary:
This diff is the binding and hook layer for MTIA Graph, including
1. binding between Python and C++
2. hook between Pytorch and mtia fbcode
<img width="1780" height="754" alt="image" src="https://github.com/user-attachments/assets/31e24e5b-8324-42d8-8d3b-59536bc18340" />

[Doc](https://docs.google.com/document/d/1Q3xdZAIqhBvuy2HxGDfJyXVmxYXUEeYSZSwsp7bcJF8/edit?tab=t.osb46a42t6wb#heading=h.ayp9tkk08x00)

Test Plan: Will be tested in the python implementation which will use the binding and hook

Differential Revision: D84457757

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165963
Approved by: https://github.com/malfet, https://github.com/albanD
2025-10-31 02:52:51 +00:00
1129605415 [ROCm][CI] create ROCm 7.1 images for binary builds (#166665)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166665
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-31 02:52:37 +00:00
a6b1ef1717 [GraphPartition] cache get_free_symbol_uses (#166338)
Graph partition relies on `get_free_symbol_uses()` to collect symbol inputs.
ee7434be82/torch/_inductor/scheduler.py (L4869-L4885)

I empirically observed that `get_free_symbol_uses()` becomes slower for larger graphs. Specifically, I tried an aten fallback for torchtitan, which results in 10k+ aten nodes. When processing the 600-th node, it takes seconds to run `get_free_symbol_uses()` for a single node.

Why? Because `get_free_symbol_uses()` may recursively call `get_free_symbol_uses()` on other nodes, so the same work can be repeated many times.
ee7434be82/torch/_inductor/ir.py (L4541-L4543)

This PR fixes the issue by caching the results of `get_free_symbol_uses()`. I validated on torchtitan that the issue is fixed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166338
Approved by: https://github.com/eellison
2025-10-31 02:50:10 +00:00
12577064dd [MPS] Fix crash when max/min ops called for complex types (#166214)
Raise an exception, as the operation is meaningless and otherwise results in a segfault:
```
% python -c "import torch;torch.rand(10, dtype=torch.cfloat, device='mps').amax()"
(mpsFileLoc): /AppleInternal/Library/BuildRoots/4~B6shugDBannYeMBGCfhw7wjvNJOfy4BrawZ7TdI/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:176:0: error: 'mps.reduction_max' op operand #0 must be tensor of mps native type values, but got 'tensor<10xcomplex<f32>>'
(mpsFileLoc): /AppleInternal/Library/BuildRoots/4~B6shugDBannYeMBGCfhw7wjvNJOfy4BrawZ7TdI/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:176:0: note: see current operation: %2 = "mps.reduction_max"(%arg0, %1) <{keep_dims, propagate_nans}> : (tensor<10xcomplex<f32>>, tensor<1xsi32>) -> tensor<1xcomplex<f32>>
(mpsFileLoc): /AppleInternal/Library/BuildRoots/4~B6shugDBannYeMBGCfhw7wjvNJOfy4BrawZ7TdI/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:176:0: error: 'mps.reduction_max' op operand #0 must be tensor of mps native type values, but got 'tensor<10xcomplex<f32>>'
(mpsFileLoc): /AppleInternal/Library/BuildRoots/4~B6shugDBannYeMBGCfhw7wjvNJOfy4BrawZ7TdI/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:176:0: note: see current operation: %2 = "mps.reduction_max"(%arg0, %1) <{keep_dims, propagate_nans}> : (tensor<10xcomplex<f32>>, tensor<1xsi32>) -> tensor<1xcomplex<f32>>
/AppleInternal/Library/BuildRoots/4~B6shugDBannYeMBGCfhw7wjvNJOfy4BrawZ7TdI/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:1347: failed assertion `original module failed verification'
zsh: abort      python -c
```

To be tested by `test_ops.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166214
Approved by: https://github.com/dcci, https://github.com/kulinseth, https://github.com/Skylion007
ghstack dependencies: #166272
2025-10-31 02:37:20 +00:00
24b6eb7727 [Inductor] Enable Custom op Autotune Decompositions and Parameter Tuning (#164212)
This PR introduces CustomOp autotuning. It allows users to provide a CustomOpConfig:
(1) to register (optional) multiple decomposition implementations for custom operations and
(2) to register parameter tuning knobs and the values they want to tune for the decompositions
so that Inductor automatically selects the best-performing variant through its autotune benchmarking.

Example:
```python
 register_custom_op_autotuning(
            custom_op=my_attention_op,
            configs=[
                CustomOpConfig(attention_impl, head_dim=32, method='chunked'),
                CustomOpConfig(attention_impl, head_dim=64, method='tiled'),
                CustomOpConfig(head_dim=128), # no decompositions
            ],
            input_gen_fns={
                "query": lambda fake: torch.randn_like(fake, device='cuda'),
                "key": lambda fake: torch.randn_like(fake, device='cuda'),
                "value": lambda fake: torch.randn_like(fake, device='cuda'),
            }
    )
```

**CustomOpConfig**: Each CustomOpConfig defines exactly one autotuning variant with specific parameter values and an optional decomposition implemented with PyTorch aten ops. Users can register their own tuning knobs and optional decomposition functions for the same custom operation. The system automatically benchmarks all variants and selects the best-performing one. If no decomposition is provided in the config, the CustomOp's default implementation will be used.

**Custom Input Generation**: Users can provide custom input generators via an optional `input_gen_fns` to control how synthetic inputs are created during benchmarking. This enables more realistic performance testing by generating inputs that match expected data distributions and characteristics for each tensor argument.

**More Examples with autotune logs:**:
1. Allow user to register customOp decompositions with tuning parameters for autotuning. Example usage:
```python
from torch._inductor.kernel.custom_op import CustomOpConfig, register_custom_op_autotuning

def decompose_k_implementation(a: torch.Tensor, b: torch.Tensor, k_splits: int = 4) -> torch.Tensor:
    """Matrix multiply with k-way decomposition."""
         # Implementation...with k_splits

@torch.library.custom_op("my_lib::decompose_k", mutates_args=())
def test_decompose_k_op(
        a: torch.Tensor, b: torch.Tensor, k_splits: int
    ) -> torch.Tensor:
        return decompose_k_implementation(a, b, k_splits)

# Register autotuning with different k_splits values
register_custom_op_autotuning(
    custom_op=test_decompose_k_op,
    configs=[
        CustomOpConfig(decompose_k_implementation, k_splits=2),
        CustomOpConfig(decompose_k_implementation, k_splits=32),
        CustomOpConfig(decompose_k_implementation, k_splits=64),
        CustomOpConfig(k_splits=128), # can make decomposition optional, then use default impl test_decompose_k_op
        CustomOpConfig(k_splits=256)
    ],
    input_gen_fns={
        "a": lambda fake: torch.randn_like(fake, device='cuda') * 0.1,
        "b": lambda fake: torch.randn_like(fake, device='cuda') * 0.1,
    }
)
```

Example result:
```
{"num_choices": 6, "num_triton_choices": 0, "best_kernel": "test_decompose_k_autotuned_fallback_default", "best_time": 0.09980800002813339}
AUTOTUNE test_decompose_k_autotuned(256x65536, 65536x1024)
strides: [65536, 1], [1024, 1]
dtypes: torch.float16, torch.float16
  test_decompose_k_autotuned_fallback_default 0.0998 ms 100.0%
  test_decompose_k_autotuned_decompose_k_implementation_k_splits_2_0 0.1096 ms 91.0% CustomOp decompose_k_implementation_k_splits_2
  test_decompose_k_autotuned_decompose_k_implementation_k_splits_32_1 0.1277 ms 78.2% CustomOp decompose_k_implementation_k_splits_32
  test_decompose_k_autotuned_decompose_k_implementation_k_splits_64_2 0.1454 ms 68.6% CustomOp decompose_k_implementation_k_splits_64
  test_decompose_k_autotuned_decompose_k_implementation_k_splits_128_3 0.1536 ms 65.0% CustomOp decompose_k_implementation_k_splits_128
  test_decompose_k_autotuned_decompose_k_implementation_k_splits_256_4 0.2084 ms 47.9% CustomOp decompose_k_implementation_k_splits_256
```

2. Allow user to tune parameter knob by passing the parameter and values in the CustomOpConfig.
**Example**
```python
def mlp_variants(input_tensor, gate_weight, up_weight, down_weight, method):
    """MLP implementation with different computational approaches."""
    if method == 0:
        # Standard separate matmuls
        # ... implementation
    elif method == 1:
        # Batched approach with torch.mm
        # ... implementation
    elif method == 2:
        # Fused weights approach
        # ... implementation

@torch.library.custom_op("my_lib::mlp_op", mutates_args=())
        def mlp_op(
            input_tensor: torch.Tensor,
            gate_weight: torch.Tensor,
            up_weight: torch.Tensor,
            down_weight: torch.Tensor,
            method: int,
        ) -> torch.Tensor:
            return mlp_variants(
                input_tensor, gate_weight, up_weight, down_weight, method=method
            )

register_custom_op_autotuning(
    custom_op=mlp_op,
    configs=[
        CustomOpConfig(method=0),
        CustomOpConfig(method=1),
        CustomOpConfig(method=2),
        # method=0 is the default fallback in the original op
    ],
    input_gen_fns={
        "input_tensor": lambda fake: torch.randn_like(fake, device='cuda') * 0.1,
        "gate_weight": lambda fake: torch.randn_like(fake, device='cuda') * 0.05,
        # ... other input generators
    }
)

```

Example result:
```
AUTOTUNE test_mlp_autotuned(4x32x512, 512x1024, 512x1024, 1024x256)
  test_mlp_autotuned_mlp_variants_method_2 0.0181 ms 100.0% CustomOp mlp_variants_method_2
  test_mlp_autotuned_mlp_variants_method_1 0.0185 ms 97.8% CustomOp mlp_variants_method_1
  test_mlp_autotuned_mlp_default_fallback_method_0 0.0198 ms 91.4% CustomOp fallback
```

### Test Suite (`test/inductor/test_custom_op_autotune.py`)

*   **RMSNorm autotuning**: Tests different RMSNorm implementations with dynamic input shapes
*   **MLP autotuning**: Tests different MLP decomposition and tuning "method" parameter
*   **DecomposeK**: Tests different k_splits values for matrix multiplication decomposition with k dim split
*   **Multi-parameter tuning**: Tests configs with multiple tuning parameters (scale_mode, chunk_size)

### Next Step:
- Enable Max-autotune with user passed in max-autotune config. https://github.com/pytorch/pytorch/pull/165526/files
- Support inline epilogue fusion for selected best customop decomposition with surrounding elementwise ops. https://github.com/pytorch/pytorch/pull/165952/files
- Support customop autotune considering fusion with multiTemplateBuffer. WIP

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164212
Approved by: https://github.com/zou3519
2025-10-31 02:28:00 +00:00
32066772b3 Fix torch.full with dynamic tensor fill_value in torch.compile (#166554)
Fixes #166253

## Summary
When `torch.full` is called with a 0-D tensor as `fill_value` inside a `torch.compile`'d function, the value was being incorrectly cached, causing subsequent calls with different values to return the first value.

## Root Cause
The Dynamo handler for `torch.full` was calling `aten._local_scalar_dense` to convert tensor fill_values to Python scalars at compile time, which baked the value into the compiled graph as a constant.

## Solution
Modified the Dynamo handler to decompose `torch.full(size, tensor_fill_value)` into `empty(size).fill_(tensor_fill_value)` when `fill_value` is a `TensorVariable`, keeping the fill value dynamic in the compiled graph.
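
A minimal repro sketch of the symptom (hedged: the test added in this PR may differ):

```python
import torch


@torch.compile
def make_full(fill):
    # 0-D tensor fill_value; previously baked into the graph as a constant
    return torch.full((2, 2), fill)


a = make_full(torch.tensor(1.0))
b = make_full(torch.tensor(2.0))
# Before the fix the second call could return 1.0 everywhere; after the fix
# each call reflects its own fill_value.
print(a[0, 0].item(), b[0, 0].item())  # expected: 1.0 2.0
```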

## Testing
Added test case that verifies torch.full works correctly with dynamic tensor fill_values across multiple calls and dtypes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166554
Approved by: https://github.com/Lucaskabela
2025-10-31 00:56:02 +00:00
47f0024310 [CI][BE] Factor out repeated test code (#166481)
Into `_run_single_arg_fwd`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166481
Approved by: https://github.com/Skylion007
2025-10-31 00:52:50 +00:00
98d640bb11 Remove AT_USE_HIPSPARSE_GENERIC_API (#166393)
This macro is not used in OSS anymore.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166393
Approved by: https://github.com/ezyang
2025-10-31 00:49:09 +00:00
5d288bc3f7 [BE] Move GreenContext implementation details to cpp (#166462)
- Remove all complex defines logic from the header
- Make GreenContext constructor private, as  it should only be created via the static method as singleton
- Delete unused `getContext` and `getGreenContext` methods
- Rename `CUDA_HAS_GREEN_CONTEXT` to `HAS_CUDA_GREEN_CONTEXT()`, which results in compilation error if one accidentally makes a typo
- Suppress `-Wunused-private-field` is GreenContext is not available
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166462
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-31 00:48:01 +00:00
bfb47ec50e [dynamo] support tracing new typing union syntax X | Y (#166599)
To do in a followup - I think there's an approach to reconstruct typing variables.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166599
Approved by: https://github.com/SherlockNoMad, https://github.com/anijain2305, https://github.com/Skylion007
2025-10-30 23:59:27 +00:00
7a0cd8ed09 [ROCm] Disable __builtin_amdgcn_rcpf for gfx90a (#166454)
Improves accuracy for some failing tests.

test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py::TestClipGradNormWorldSize4::test_clip_grad_norm_2d [GH job link](https://github.com/pytorch/pytorch/actions/runs/18930221123/job/54046876467) [HUD commit link](f20bf77874)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166454
Approved by: https://github.com/jerrymannil, https://github.com/jeffdaily
2025-10-30 23:39:00 +00:00
984e64b2cd [inductor] Fix constant folder (#166655)
Fixes https://fb.workplace.com/groups/1028545332188949/permalink/1351999569843522/ where the resulting graph of the constant folder uses a sym node that is only created later. Graph diff: https://www.internalfb.com/intern/diffing/?paste_number=2014609054

Before:
```
    %full_65 : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([%sym_size_int_47, 768], 1), kwargs = {dtype: torch.int64, layout: torch.strided, device: cuda:0, pin_memory: False})
    %select_18 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%full_65, 1, 0), kwargs = {})
    %mul_2792 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%select_18, 0), kwargs = {})
    %embedding_4 : [num_users=1] = call_function[target=torch.ops.aten.embedding.default](args = (%_uv__surface_embeddings_weight, %mul_2792), kwargs = {})
```

After:
```
    %full_65 : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([%sym_size_int_47, 768], 1), kwargs = {dtype: torch.int64, layout: torch.strided, device: cuda:0, pin_memory: False})
    %full_default_1 : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([%sym_size_int_150], 0), kwargs = {dtype: torch.int64, layout: torch.strided, device: cuda:0, pin_memory: False})
    %embedding_4 : [num_users=1] = call_function[target=torch.ops.aten.embedding.default](args = (%_uv__surface_embeddings_weight, %full_default_1), kwargs = {})
    ...
    %sym_size_int_150 : [num_users=7] = call_function[target=torch.ops.aten.sym_size.int](args = (%view_193, 0), kwargs = {})
```

I couldn't figure out a small repro for this :/

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166655
Approved by: https://github.com/eellison
2025-10-30 22:51:28 +00:00
b9bcb37f40 [DebugMode] store stringify args by default (#166347)
DebugMode currently stores dispatch call args & kwargs, which includes all intermediate tensors and more. This quickly OOMed on GPU when trying to debug some torchtitan / llama 8b models.

This PR defaults to storing the stringified version and adds a flag, `DebugMode(store_original_args=True)`, for users who want to store the original args as-is (and for BC).
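
A usage sketch (hedged: the import path and `debug_string()` accessor are assumptions; check where `DebugMode` lives in your build):

```python
import torch
from torch.utils._debug_mode import DebugMode  # assumed location

x = torch.randn(8, 8)
with DebugMode() as dm:  # new default: only stringified args are kept
    torch.mm(x, x)
print(dm.debug_string())  # assumed accessor for the recorded calls

with DebugMode(store_original_args=True) as dm:  # opt back into storing real args
    torch.mm(x, x)
```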

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166347
Approved by: https://github.com/yushangdi
2025-10-30 22:12:23 +00:00
7e3b9d105e [CP][BE][2/2] Refactor the code structure (#166501)
Our CP codebase now contains several files and we are adding more. This PR refactors the code to consolidate the files into a context_parallel folder while keeping the existing imports so that current users of CP are not affected.

Unfortunately, we have to split this work into two PRs: the PyTorch infra cannot accept a PR with a 3000+ LoC change, and git cannot recognize that _context_parallel/_attention.py is moved from _attention.py because we want to keep BC.

This is the second PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166501
Approved by: https://github.com/Skylion007
ghstack dependencies: #166456
2025-10-30 22:07:07 +00:00
45c3f02d69 [ROCm] moved gfx1100 back to experimental status for AOTriton (#166397)
According to next commit to AOTriton:
8625c4faee

These changes were missed in the 0.11b release:
https://github.com/pytorch/pytorch/pull/161754

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166397
Approved by: https://github.com/jeffdaily
2025-10-30 21:43:01 +00:00
f5543e3741 [wip] fix searchsorted non dense (#165064)
Fix for https://github.com/pytorch/pytorch/issues/163528

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165064
Approved by: https://github.com/benjaminglass1, https://github.com/mlazos
2025-10-30 21:21:24 +00:00
5fc2c7a2a1 [ROCm][inductor] More configs for pointwise kernels. (#166470)
This config improves performance by 250% on some kernels that contain `t1.atomic_add(...)`. Again, we conditionalize for ROCm/HIP, so there is no impact on NVIDIA.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166470
Approved by: https://github.com/PaulZhang12, https://github.com/mlazos, https://github.com/eellison, https://github.com/jansel
2025-10-30 21:20:12 +00:00
7692fa09cd [Code Clean] Clean asserts in torch/ao/quantization/fx/* (#165420)
Replace assert statements with explicit if/raise patterns in:

- torch/ao/quantization/fx/* (177 errors)

Partially fixes #164878.
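A minimal sketch of the replacement pattern, with an illustrative helper and message (the concrete exception type and wording vary per call site):

```python
def _check_qconfig(qconfig):
    # Before (silently stripped when Python runs with -O):
    #     assert qconfig is not None, "qconfig must be provided"
    # After (always enforced at runtime):
    if qconfig is None:
        raise AssertionError("qconfig must be provided")
    return qconfig

_check_qconfig("per_channel_qconfig")  # passes; None would raise AssertionError
```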

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165420
Approved by: https://github.com/RohitRathore1, https://github.com/fffrog, https://github.com/albanD
2025-10-30 20:53:36 +00:00
df71b70727 [cuDNN][conv] Re-enable cuDNN for 3D convolutions (fixed in 9.15+) (#166480)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166480
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-10-30 20:47:20 +00:00
80ba6e458f Add warning when users have incomplete setup for type checking (#166603)
Looking for feedback on this approach.
We received user reports of spurious pyrefly errors from users using hg instead of git. I think this is because, when using a venv and git, `make setup-env` installs requirements and pulls a nightly torch wheel, which pyrefly needs to type check properly.

Initial documentation for `make setup-env` I found here: https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#developing-pytorch

Testing:
```
hg clone --git ssh://git@github.com/pytorch/pytorch.git
conda create -n pytorch_env python=3.10 # (or manually create venv instead of using script)
cd pytorch
pip install -r requirements.txt
pip install -r requirements-build.txt
lintrunner init
# check how many pyrefly errors - 15,709 errors (11,693 ignored)
lintrunner # confirm error message / warning appears
>>> General linter failure:
  Warning (PYREFLY) nightly-wheel-not-run
    pytorch-nightly.pth not found. You may need to run make setup-env or make
    setup-env-conda to install nightly binaries and type stubs.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166603
Approved by: https://github.com/aorenste
2025-10-30 20:37:44 +00:00
71a1b16912 Update
[ghstack-poisoned]
2025-10-30 13:33:54 -07:00
0d50e5d8d4 [3/N] Fix unused loop variables (#166509)
This PR removes unused loop variables in tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166509
Approved by: https://github.com/Lucaskabela, https://github.com/Skylion007
2025-10-30 20:13:51 +00:00
99b05d1b78 Better 1x128, 128x128 error handling on non-Hopper (#166639)
Summary:

Blockwise 1x128 and 128x128 scaling is only available on CUDA >= 12.9
and only on Hopper GPUs. Attempting to run on B200 would give a
hard-to-debug `CUBLAS_STATUS_NOT_SUPPORTED`.

Add a more helpful `NotImplementedError` to catch this case.

Also more explicitly disable ROCm builds for relevant methods, based on
lack of support per [hipBLASlt
docs](https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/reference/datatypes.html#_CPPv4N28hipblasLtMatmulMatrixScale_t40HIPBLASLT_MATMUL_MATRIX_SCALE_VEC128_32FE).

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166639
Approved by: https://github.com/drisspg
2025-10-30 20:13:06 +00:00
8a9de56600 Update (base update)
[ghstack-poisoned]
2025-10-30 13:07:07 -07:00
10d93b9adb Update
[ghstack-poisoned]
2025-10-30 13:07:07 -07:00
f911d64750 [CUDA] xFail max-autotune grouped gemm tests on devices with insufficient SM count (#165921)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165921
Approved by: https://github.com/ngimel
2025-10-30 20:05:07 +00:00
52db60170d Enable verify_dynamo on Python 3.13 (#166497)
Dynamo now supports Python 3.13.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166497
Approved by: https://github.com/Lucaskabela, https://github.com/williamwen42
2025-10-30 19:52:32 +00:00
56838bad5f [CP][BE][1/2] Refactor the code structure (#166456)
Our CP codebase now contains several files and we are adding more. This PR refactors the code to consolidate the files into a context_parallel folder while keeping the existing imports so that current users of CP are not affected.

Unfortunately, we have to split this work into two PRs: the PyTorch infra cannot accept a PR with a 3000+ LoC change, and git cannot recognize that _context_parallel/_attention.py is moved from _attention.py because we want to keep BC.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166456
Approved by: https://github.com/Skylion007
2025-10-30 19:46:49 +00:00
ad3a56ab98 Add a compile-time flag to trigger verbose logging for device-side asserts (#166171)
Summary:
Using `CUDA_KERNEL_ASSERT_PRINTF` inside kernels allows us to log invalid values to the console (that can be in turn used to surface _hopefully_ more clearer error messages).

This does have an impact in the number of registers needed for the values being logged (I confirmed via diffing PTX that there is no other impact relative to using `__assert_fail`)

To avoid causing perf bottlenecks, this change adds a compile-time switch to enable more verbose errors in some of the common kernels that cause DSAs. There is also a Buck config that can be used to configure this switch more conveniently.

## Alternatives considered
I considered making the behavior of `CUDA_KERNEL_ASSERT_PRINTF` controllable via a compile-time macro instead of writing another wrapper for it but there are kernels where the extra register pressure is not as severe and in those cases, having more useful error messages by default is pretty useful.

Test Plan:
## Simple Python Driver:
```
# scatter_errors.py
import torch
def main() -> None:
    a = torch.rand(128, device="cuda:0")
    idx = torch.randint(0, 128, (100,), device="cuda:0")
    idx[0] = 9999
    b = torch.scatter(a, 0, idx, 555.0)
    print(b)
```

When running normally via:
```
$ buck2 run @//mode/opt  :scatter_errors
```
we see the following DSA message:
```
fbcode/caffe2/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:410: operator(): block: [0,0,0], thread: [0,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
```

Running via:
```
$  buck2 run @//mode/opt -c fbcode.c10_enable_verbose_assert=1 :scatter_errors
```
however produces:
```
[CUDA_KERNEL_ASSERT] fbcode/caffe2/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:410: operator(): block: [0,0,0], thread: [0,0,0]: Assertion failed: `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"`: Expected 0 <= idx_dim < index_size (128), but got idx_dim = 9999
```

Differential Revision: D85185987

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166171
Approved by: https://github.com/ngimel
2025-10-30 19:43:46 +00:00
a7fd0b4001 [ROCm][CI] fix disk space message (#166645)
Fixes the disk space message so it reports that the machine does not have the required space available (difference = 100 - diskspace_cutoff_int).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166645
Approved by: https://github.com/jeffdaily
2025-10-30 19:38:34 +00:00
181ee3bd42 fix: Add missing signals_to_handle to launcher logging (#166631)
Fixes #166630

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166631
Approved by: https://github.com/Skylion007

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
2025-10-30 19:31:25 +00:00
0ec0549823 Introduce a new API torch.xpu.get_per_process_memory_fraction (#165511)
# Motivation
To align with other backends, this PR introduces a new API, torch.xpu.get_per_process_memory_fraction, to allow users to retrieve the allowed memory fraction for a single process.
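A minimal usage sketch (the function name comes from this PR; whether it accepts an optional device argument is an assumption left out here):

```python
import torch

if torch.xpu.is_available():
    frac = torch.xpu.get_per_process_memory_fraction()
    print(f"allowed memory fraction for this process: {frac}")
```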

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165511
Approved by: https://github.com/EikanWang, https://github.com/ezyang
ghstack dependencies: #165508, #165509, #165510
2025-10-30 19:30:09 +00:00
8221ee6db9 [xpu] Fix type annotation for ProcessGroupXCCL (#166418)
After #163049, this PR fixes the type annotations to match the actual implementation for ProcessGroupXCCL::Options.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166418
Approved by: https://github.com/guangyey, https://github.com/ezyang
2025-10-30 19:29:06 +00:00
b939de26d1 Avoid writing temporary modules to disk (#157713)
In some cases the warning from #147744 still gets emitted because [atexit hooks aren't called](https://github.com/python/cpython/pull/114279).

Even in those cases, if the atexit hooks _were_ called you could end up with issues due to the directory being deleted in one process, but still being used elsewhere.

It's better all round to load these modules entirely in-memory.
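A minimal sketch of the in-memory loading idea, not the actual implementation in this PR:

```python
import types

source = "def hello():\n    return 'hi'\n"

# Build a module object directly from source, without ever touching the filesystem.
module = types.ModuleType("generated_module")
exec(compile(source, "<generated_module>", "exec"), module.__dict__)

print(module.hello())  # hi
```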

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157713
Approved by: https://github.com/xush6528
2025-10-30 19:11:16 +00:00
694db5f549 Use 'is' in callable comparisons (#166624)
Just like we use `is/is not` for class comparisons, it is generally advised to use `is/is not` for comparisons against torch functions.
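A minimal sketch of the pattern (illustrative function, not taken from the PR):

```python
import torch

def describe(fn):
    # Identity comparison: "is this exactly torch.add?"
    if fn is torch.add:
        return "add"
    # `fn == torch.add` is the style this PR moves away from for callables
    return "other"

print(describe(torch.add), describe(torch.mul))  # add other
```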

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166624
Approved by: https://github.com/Lucaskabela, https://github.com/Skylion007
2025-10-30 19:00:09 +00:00
639a0b1239 Remove torch.distributed.tensor.OpSchema.has_symints (#163667)
It appears to be unused based on `cd torch; rg has_symints`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163667
Approved by: https://github.com/xmfan, https://github.com/azahed98, https://github.com/albanD
ghstack dependencies: #162990
2025-10-30 18:57:17 +00:00
398775a43e [CodeClean] Replace std::runtime_error with TORCH_CHECK (#165119)
As the title stated.

**Changes**:
- torch/csrc/inductor(Part 2)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165119
Approved by: https://github.com/janeyx99
ghstack dependencies: #165139
2025-10-30 18:43:58 +00:00
fcd5f8c352 [CodeClean] Remove the Unused MACRO for AOT Inductor Runtime (#165139)
As the title stated.

- AOTI_TORCH_CHECK depends on TORCH_CHECK_MSG, which is located in c10/util/Exception.h and may break BC
- AOTI_TORCH_CHECK is not used anywhere
- STD_TORCH_CHECK has ABI check tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165139
Approved by: https://github.com/Skylion007, https://github.com/janeyx99
2025-10-30 18:43:58 +00:00
4acc66f119 Make PT2 compile backprop through custom op without autograd key a hard error (#166367)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166367
Approved by: https://github.com/bdhirsh
2025-10-30 18:43:07 +00:00
8f40a0c634 Revert "address DDE in matmul decomp (#166541)"
This reverts commit 90519402c2006237f891289a0afdec804515aa73.

Reverted https://github.com/pytorch/pytorch/pull/166541 on behalf of https://github.com/atalman due to breaks internal test ([comment](https://github.com/pytorch/pytorch/pull/166541#issuecomment-3469382334))
2025-10-30 18:11:33 +00:00
a5c3c08d10 [Pytorch] Use exp_u20 for aarch64's erf (#166594)
Summary:
After a precision study, we concluded it is OK to use ACL's exp function in f32's erf().
We can keep erf inline this way.

Benchmarks show about 91% higher throughput when processing a tensor of 1M elements, compiling with clang-19:

Before:
f32 erf: 2539.179us
After:
f32 erf: 1329.063us

Test Plan:
Correctness:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Performance:

buck2 run mode/opt //caffe2/benchmarks/operator_benchmark/fb:operator_benchmark_test

Differential Revision: D85730452

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166594
Approved by: https://github.com/mcfi, https://github.com/fadara01
2025-10-30 18:09:05 +00:00
a553ea9ea4 Fix missing symbol when printing guards (#165723)
Fixes #165177

When converting guards to sources, if we are unable to get the expected symbol from symbol_to_source, we now try to get it from var_to_sources.

I was unable to make a simpler repro than what was described in the issue (which relies on llama3 - so inappropriate for a unit test).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165723
Approved by: https://github.com/bobrenjc93
2025-10-30 18:03:51 +00:00
ba71e9ca9a [DeviceMesh] Isolate pg creation logic in Device Mesh into a separate func _init_one_process_group (#166614)
To make the pg cache change easier and to modularize the code, we isolate the process group creation logic into a separate function named `_init_one_process_group`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166614
Approved by: https://github.com/lw
2025-10-30 17:57:41 +00:00
694d205143 Revert "shrink_group implementation to expose ncclCommShrink API (#164518)"
This reverts commit 311ea0dec0c50f395e6dac7b3875e81ee243fceb.

Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/atalman due to breaks internal builds Error: from logging_utils import ( ModuleNotFoundError: No module named 'logging_utils' ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3469308568))
2025-10-30 17:52:29 +00:00
629293f568 bucket all reduce (#166528)
Bucket all_reduce in the bucketer, building on @IvanKobzarev's earlier PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166528
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #166527
2025-10-30 17:12:34 +00:00
c37802a8c4 use multi-dtype bucketing (#166527)
Make the bucketer use multi-dtype bucketing for all gathers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166527
Approved by: https://github.com/IvanKobzarev, https://github.com/ezyang
2025-10-30 16:54:49 +00:00
0a3ac47c0a Revert "[user-streams] Fix stream graph output semantics (#164819)"
This reverts commit f5cb9a4c68d9271c58ef4d3257210984b8e85099.

Reverted https://github.com/pytorch/pytorch/pull/164819 on behalf of https://github.com/atalman due to breaks CI ([comment](https://github.com/pytorch/pytorch/pull/164819#issuecomment-3469018283))
2025-10-30 16:53:32 +00:00
e83be7042e Fix pyrefly errors on main (#166548)
Fixes existing errors to keep noise from lintrunner to a minimum

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166548
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
2025-10-30 16:47:27 +00:00
fb545fb068 Add MXFP4 grouped gemm support via. FBGEMM kernels (#166530)
Summary:

* Extend `_scaled_grouped_mm_v2` to include MXFP4 support
* Add testing to existing grouped routines

Test Plan:

```
pytest -svv -k "mxfp4 and group" test/test_scaled_matmul_cuda.py
```

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166530
Approved by: https://github.com/drisspg
2025-10-30 16:46:11 +00:00
2df2c316e2 [devx] Fix invalid symbol definition emitted in fx_graph_runnable.py (#166529)
Summary: When emitting symbolic variable definitions in fx_graph_runnable.py, we need to check whether a SymNode is actually an expression, so that we won't generate something like "s27*s53**2 = 36".
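A minimal sketch of the kind of check described, using sympy directly (illustrative; not the exact code in the fx_graph_runnable generation):

```python
import sympy

s27, s53 = sympy.symbols("s27 s53")

def can_emit_definition(sym) -> bool:
    # Only a bare Symbol can appear on the left-hand side of "name = value";
    # a composite expression like s27*s53**2 must not be emitted as a definition.
    return isinstance(sym, sympy.Symbol)

print(can_emit_definition(s27))            # True  -> ok to emit "s27 = 36"
print(can_emit_definition(s27 * s53**2))   # False -> skip
```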

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166529
Approved by: https://github.com/mlazos
ghstack dependencies: #166432
2025-10-30 16:40:12 +00:00
08b0a8f11a [Inductor] Fix an inductor_provenance bug (#166432)
Summary: Fix an inductor_provenance related error seen when running TORCH_COMPILE_DEBUG generated fx_graph_runnable.py.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166432
Approved by: https://github.com/mlazos
2025-10-30 16:40:12 +00:00
3f1824742c Revert "Fix comparing inductor actual strides vs bw graph for activations should not throw DDE. (#166277)"
This reverts commit b2a0f90501dd3a16a6ccaf4c49e1c10f6df4ce1d.

Reverted https://github.com/pytorch/pytorch/pull/166277 on behalf of https://github.com/atalman due to Breaks internal executorch tests ([comment](https://github.com/pytorch/pytorch/pull/166277#issuecomment-3468696623))
2025-10-30 15:49:23 +00:00
bbb7d2270b [inductor] print 0.0 as 0 for triton (#164291)
Fixes https://github.com/pytorch/pytorch/issues/164157
Fixes https://github.com/pytorch/pytorch/issues/164086

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164291
Approved by: https://github.com/bobrenjc93, https://github.com/mlazos
2025-10-30 15:15:25 +00:00
6a5a436624 DTensor: C++ compute_global_tensor_info (#162990)
compute_global_tensor_info is on the hot path for DTensor.{from,to}_local. More incremental progress toward C++.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162990
Approved by: https://github.com/ezyang
2025-10-30 15:10:54 +00:00
ad559072db [triton][sigmoid] Fix kernel cache and serialization issue for triton sigmoid + CUDA kernel bug (#166568)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166568
Approved by: https://github.com/minjang
2025-10-30 14:54:54 +00:00
ad02bd13df Revert "[user-streams] Add current stream source (#165211)"
This reverts commit 79aee77381b21d41c77148e5ff84c4b351aaf144.

Reverted https://github.com/pytorch/pytorch/pull/165211 on behalf of https://github.com/atalman due to failure: test/test_python_dispatch.py::TestPythonDispatch::test_return_stream [GH job link](https://github.com/pytorch/pytorch/actions/runs/18942517662/job/54086481693) [HUD commit link](7563f61cc8) ([comment](https://github.com/pytorch/pytorch/pull/165211#issuecomment-3468332362))
2025-10-30 14:34:43 +00:00
7563f61cc8 Make bucketing aware of collective LIFO semantics (#166324)
In the initial pr for overlapping preserving bucketing, for a graph like:

```
def foo(...):
     ag = all_gather(...)
     hiding_compute = mm(...)
     wait(ag)
```

We would add dependencies from mm -> ag and from wait -> hiding_compute, to prevent bucketing from reordering these collectives in a way that removes the overlap. However, there is an additional way for bucketing to prevent overlap.

If we were to reorder another collective so the graph looked like:

```
def foo(...):
     ag = all_gather(...)
     ar = all_reduce(...)
     wait(ar)
     hiding_compute = mm(...)
     wait(ag)
```

Overlap would not occur, because the wait for the all reduce would also force realization of every collective enqueued on the same stream prior to the all reduce. NCCL uses a single stream per process group.

To model this, we set a strict ordering of all collective starts, waits, and hiding compute when bucketing begins. Then, when trying to add a collective to a bucket, we check whether we interfere with overlap for all of the following possible bucketings:

[move collective start to bucket start, move bucket start to collective start] x [move collective wait to bucket wait, move bucket wait to collective wait].

For any of these positions, we check if overlap would have been interfered with because of stream queue semantics. Then, if not, we remove the moving start and wait from the constrained ordering of collectives, and see if it's topologically valid to merge the nodes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166324
Approved by: https://github.com/IvanKobzarev
ghstack dependencies: #166309
2025-10-30 13:37:00 +00:00
fa8e073a4e Revert "[triton][sigmoid] Fix kernel cache and serialization issue for triton sigmoid + CUDA kernel bug (#166568)"
This reverts commit d46d8d6f54b15ded4f2483c7bde31be124281ab8.

Reverted https://github.com/pytorch/pytorch/pull/166568 on behalf of https://github.com/atalman due to Failed test/test_extension_utils.py::TestExtensionUtils::test_external_module_register_with_renamed_backend [GH job link](https://github.com/pytorch/pytorch/actions/runs/18931754443/job/54050880312) [HUD commit link](d46d8d6f54) ([comment](https://github.com/pytorch/pytorch/pull/166568#issuecomment-3468008894))
2025-10-30 13:31:47 +00:00
95b5534773 Revert "[user-streams] Track symbolic current stream (#165212)"
This reverts commit a5335263d32b5be2b2647661334d81225c3cc3fc.

Reverted https://github.com/pytorch/pytorch/pull/165212 on behalf of https://github.com/atalman due to test/test_rename_privateuse1_to_existing_device.py::TestRenamePrivateuseoneToExistingBackend::test_external_module_register_with_existing_backend [GH job link](https://github.com/pytorch/pytorch/actions/runs/18930365446/job/54046768884) [HUD commit link](a5335263d3) ([comment](https://github.com/pytorch/pytorch/pull/165212#issuecomment-3467968796))
2025-10-30 13:24:56 +00:00
9ee1afbf66 Revert "[user-streams] Handle returning the current stream with/without device index (#165356)"
This reverts commit f1af679270392c83e03808c8af5e2cbe3cdf16ce.

Reverted https://github.com/pytorch/pytorch/pull/165356 on behalf of https://github.com/atalman due to test/test_rename_privateuse1_to_existing_device.py::TestRenamePrivateuseoneToExistingBackend::test_external_module_register_with_existing_backend [GH job link](https://github.com/pytorch/pytorch/actions/runs/18930365446/job/54046768884) [HUD commit link](a5335263d3) ([comment](https://github.com/pytorch/pytorch/pull/165356#issuecomment-3467967061))
2025-10-30 13:22:24 +00:00
f60751024e Revert "[2/N] Add strict parameter to Python zip calls (#166257)"
This reverts commit 39e5cdddf7e57881c52473d1288a66f0222527e1.

Reverted https://github.com/pytorch/pytorch/pull/166257 on behalf of https://github.com/atalman due to Failing: test/distributed/fsdp/test_fsdp_mixed_precision.py::TestFSDPTrainEval::test_train_ema_eval_flow [GH job link](https://github.com/pytorch/pytorch/actions/runs/18934047991/job/54057218160) [HUD commit link](39e5cdddf7) ([comment](https://github.com/pytorch/pytorch/pull/166257#issuecomment-3467955332))
2025-10-30 13:20:00 +00:00
2de4cf2102 [1/N] Remove unused loop variables (#166258)
This PR removes unused loop variables.
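A minimal sketch of the cleanup pattern (illustrative code, not from the PR):

```python
def run_once():
    return 1

# Before: `i` is declared but never used in the body
total_before = sum(run_once() for i in range(3))

# After: `_` signals an intentionally unused loop variable
total_after = sum(run_once() for _ in range(3))

print(total_before, total_after)  # 3 3
```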

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166258
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
2025-10-30 12:22:25 +00:00
369f2d6951 [3/N] fix typo in other folders (#166606)
fix typo in other folders

#166374
#166126

_typos.toml
```bash
[files]
extend-exclude = ["tools/linter/dictionary.txt"]
[default.extend-words]
nd = "nd"
arange = "arange"
Nd = "Nd"
GLOBALs = "GLOBALs"
hte = "hte"
iy = "iy"
PN = "PN"
Dout = "Dout"
optin = "optin"
gam = "gam"
PTD = "PTD"
Sur = "Sur"
nin = "nin"
tme = "tme"
inpt = "inpt"
mis = "mis"
Raison = "Raison"
ouput = "ouput"
nto = "nto"
Onwer = "Onwer"
callibrate = "callibrate"
ser = "ser"
Metdata = "Metdata"
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166606
Approved by: https://github.com/ezyang
2025-10-30 10:30:40 +00:00
32920926f0 [xpu][fix] [Inductor] Avoid using tl.sqrt_rn on XPU before triton is ready (#165740)
Fixes #165738

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165740
Approved by: https://github.com/etaf, https://github.com/EikanWang, https://github.com/chuanqi129, https://github.com/desertfire
2025-10-30 09:24:24 +00:00
39e5cdddf7 [2/N] Add strict parameter to Python zip calls (#166257)
This PR adds `strict=True/False` to zip calls in test utils. strict=True is passed when possible.
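A minimal sketch of what the flag changes (illustrative data; requires Python 3.10+):

```python
values = [1, 2, 3]
labels = ["x", "y", "z"]

# strict=True raises ValueError if the iterables have different lengths,
# instead of silently truncating to the shorter one.
for value, label in zip(values, labels, strict=True):
    print(value, label)
```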

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166257
Approved by: https://github.com/janeyx99
2025-10-30 08:10:10 +00:00
2829d48bd1 [xpu][test][1/N] Port 3 fsdp distributed test cases to Intel GPU (#161476)
For https://github.com/pytorch/pytorch/issues/114850, we port 3 distributed tests to Intel GPU.
We enable Intel GPU with the following methods while keeping the original code style as much as possible:

- use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- use "requires_accelerator_dist_backend" to enable "xccl"
- enable XPU for some test paths
- skip some test cases that Intel GPU does not support

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161476
Approved by: https://github.com/weifengpy, https://github.com/guangyey
2025-10-30 07:30:04 +00:00
f1af679270 [user-streams] Handle returning the current stream with/without device index (#165356)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165356
Approved by: https://github.com/anijain2305
ghstack dependencies: #164304, #164522, #164819, #165211, #165212
2025-10-30 07:20:25 +00:00
d46d8d6f54 [triton][sigmoid] Fix kernel cache and serialization issue for triton sigmoid + CUDA kernel bug (#166568)
Differential Revision: D85792537

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166568
Approved by: https://github.com/minjang
2025-10-30 06:17:39 +00:00
a5335263d3 [user-streams] Track symbolic current stream (#165212)
merge into stream tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165212
Approved by: https://github.com/anijain2305
ghstack dependencies: #164304, #164522, #164819, #165211
2025-10-30 04:58:53 +00:00
79aee77381 [user-streams] Add current stream source (#165211)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165211
Approved by: https://github.com/anijain2305
ghstack dependencies: #164304, #164522, #164819
2025-10-30 04:58:53 +00:00
f5cb9a4c68 [user-streams] Fix stream graph output semantics (#164819)
Previously, we would stash a single stream value we constructed at trace time in a global and return the same value from repeated calls to the graph.

With this PR, we construct the stream value in advance, reference the constructed value in the graph via the lookup table, and if that value is returned as an output, read the value from the lookup table and return it (in bytecode, not as a graph output, since we don't support arbitrary stream outputs).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164819
Approved by: https://github.com/anijain2305
ghstack dependencies: #164304, #164522
2025-10-30 04:58:46 +00:00
f20bf77874 [audio hash update] update the pinned audio hash (#166597)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166597
Approved by: https://github.com/pytorchbot
2025-10-30 04:28:30 +00:00
75f798e05b [inductor][mi350] add tech specs for MI350 (#166576)
Summary:
While digging through matmul padding for other work, I noticed that the compute-bound check won't work on MI350 since we haven't supplied the tech specs yet.

I added MI350 specs following the predefined format.

Test Plan: CI

Differential Revision: D85804980

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166576
Approved by: https://github.com/leitian
2025-10-30 03:46:52 +00:00
476b149a00 bwd pass (#164504)
**Summary**
This implements the backward pass for the Varlen API and registers `_varlen_attn()` as a custom op.

**Benchmarking**

To benchmark, we compare runtime and TFLOPs against the current SDPA approach with padding.

Settings:

- 1 H100 machine
- `batch_size=8`, `max_seq_len=2048`, `embed_dim=1024`, `num_heads=16`
- dtype `torch.bfloat16`
- `is_causal=False`
- for variable length, we set sequences to be random multiples of 64 up to `max_seq_len`
- 100 runs

|        | Variable Length API | SDPA     |
|--------|--------------------|----------|
| Runtime | 0.8189142608642578 ms       | 3.263883056640625 ms  |
| TFLOPs | 268.652       | 158.731  |

We can see that the Varlen runtime is >3x faster.

**Testing**

Run `python test/test_varlen_attention.py` for unit tests where we verify basic functionality and confirm numerical match between varlen gradients vs SDPA.

For custom op testing, `test_custom_op_registration` uses logging mode to verify that `_varlen_attn()` was called and tests with `torch.compile`. `test_custom_op_compliances` uses `torch.library.opcheck()` to verify.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164504
Approved by: https://github.com/drisspg
2025-10-30 03:46:37 +00:00
845da9c817 [ONNX] Ignore pyrefly errors in torchlib (#166588)
Fixes #166475

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166588
Approved by: https://github.com/titaiwangms
2025-10-30 03:43:52 +00:00
0918bf321c [xpu][test] Reuse native_mm and mix_order_reduction for Intel GPU. (#166384)
This PR reuses native_mm and mix_order_reduction for Intel GPU and enables the corresponding test.
Fixes #165370

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166384
Approved by: https://github.com/jansel
2025-10-30 03:38:35 +00:00
90519402c2 address DDE in matmul decomp (#166541)
Address https://github.com/pytorch/pytorch/issues/165081
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166541
Approved by: https://github.com/mlazos
2025-10-30 03:19:29 +00:00
791ca80d3a Enable local tensor mode for DTensor attention and convolution tests (#166406)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166406
Approved by: https://github.com/ezyang
2025-10-30 02:48:02 +00:00
5cbdade914 Fix a syntactic error in test_indexing.py (#166390)
This PR fixes a syntactic error in test_indexing.py caused by a misplaced `if else` expression.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166390
Approved by: https://github.com/jerryzh168
2025-10-30 02:28:01 +00:00
0187db88d4 [ROCm][CI] Create periodic-rocm-mi200.yml (#166544)
* We are separating out the rocm jobs of the periodic workflow
* We are introducing a new label `ciflow/periodic-rocm-mi200` to allow us to run distributed tests only on ROCm runners, without triggering many other jobs on the `periodic.yml` workflow (via `ciflow/periodic`)
* This new workflow will also be triggered via the `ciflow/periodic`, thus maintaining the old status quo.
* We are reverting to the `linux.rocm.gpu.4` label since it targets a lot more CI nodes at this point than the K8s/ARC-based `linux.rocm.gpu.mi250.4` label, as that is still having some network/scaling issues.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166544
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-30 02:08:07 +00:00
311ea0dec0 shrink_group implementation to expose ncclCommShrink API (#164518)
Closes #164529

To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch.

This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization.

For more info:  [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518
Approved by: https://github.com/kwen2501
2025-10-30 01:50:54 +00:00
cf7756da38 Bump uv from 0.9.5 to 0.9.6 in /.ci/lumen_cli (#166578)
Bumps [uv](https://github.com/astral-sh/uv) from 0.9.5 to 0.9.6.
- [Release notes](https://github.com/astral-sh/uv/releases)
- [Changelog](https://github.com/astral-sh/uv/blob/main/CHANGELOG.md)
- [Commits](https://github.com/astral-sh/uv/compare/0.9.5...0.9.6)

---
updated-dependencies:
- dependency-name: uv
  dependency-version: 0.9.6
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-10-29 18:28:14 -07:00
e380028a51 [inductor][choices] lookup table choices 1/3 (#164978)
# why

- enable users to control which choices get used on which inputs
- reduce lowering time, and pin kernel selection, by selecting
  them for the inputs

# what

- a new InductorChoices subclass that implements a lookup table
- a README explaining the usage
- corresponding testing

- currently only supports templates that go through
  `V.choices.get_template_configs`

# testing

```
python3 -bb -m pytest test/inductor/test_lookup_table.py -v
```

Differential Revision: [D85685743](https://our.internmc.facebook.com/intern/diff/D85685743)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164978
Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/mlazos
2025-10-30 01:28:01 +00:00
b4403bfc62 Add waitcounters for torch.compile subprocess pool (#164527)
Summary:
This adds a waitcounter for whether or not the pool is running, as well as whether
we are running jobs.

This also adds waitcounters for the first job within a pool. The first-job and running counters are working correctly; the job waitcounter seems to either be detecting a leaked job, or is subtly broken.

Test Plan:
We've tested this internally and see valid ods metrics.

Note that we may be leaking jobs, or the job logic may not be handling an exception correctly.

Differential Revision: D83705931

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164527
Approved by: https://github.com/masnesral
2025-10-30 01:15:26 +00:00
12c12466b0 [ROCm][CI] remove amdgpu from install_rocm.sh (#166575)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166575
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-30 01:08:33 +00:00
f4d05feb7a Repro dynamo issue for union typed annotation (#166443)
When a nested function has a type annotation using `|`, it fails.

It works fine with `Union[torch.Tensor, DTensor]`, though.
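A minimal sketch of the failing shape (illustrative; the original repro annotates with `torch.Tensor | DTensor`, which is omitted here to keep the snippet self-contained, and the later union-syntax PR may already address it):

```python
import torch

@torch.compile(fullgraph=True)
def outer(x):
    def inner(t: torch.Tensor | None):  # new-style `|` annotation on a nested function
        return t + 1
    return inner(x)

out = outer(torch.ones(3))
```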

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166443
Approved by: https://github.com/anijain2305
2025-10-30 01:05:15 +00:00
7481622237 [symbolic shapes] remove maybe_guard_rel warning (#166553)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166553
Approved by: https://github.com/laithsakka
2025-10-30 00:57:28 +00:00
b2a0f90501 Fix comparing inductor actual strides vs bw graph for activations should not throw DDE. (#166277)
Fix https://github.com/pytorch/pytorch/issues/163894

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166277
Approved by: https://github.com/Lucaskabela
2025-10-30 00:34:05 +00:00
cc8d8ae726 Update
[ghstack-poisoned]
2025-10-29 17:30:05 -07:00
a9a4667a82 Update (base update)
[ghstack-poisoned]
2025-10-29 17:11:31 -07:00
ec867a6636 Update
[ghstack-poisoned]
2025-10-29 17:11:31 -07:00
14d4a77495 disable current modes instead of no dispatch in estimation (#166571)
Otherwise, the custom estimation's TorchDispatchModes will be disabled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166571
Approved by: https://github.com/SherlockNoMad, https://github.com/bdhirsh
2025-10-29 23:24:41 +00:00
3d4ca228be Remove METADATA.bzl files (#166574)
(meta-internal, should not be synced)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166574
Approved by: https://github.com/bigfootjon
2025-10-29 23:17:41 +00:00
c3d205d598 helper function for replacing nodes in aug graph (#166309)
When we do bucketing, we replace starts and waits with new nodes. This PR adds a helper to transfer the augmented graph's additional deps.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166309
Approved by: https://github.com/IvanKobzarev
2025-10-29 23:08:33 +00:00
c54e2c5b41 [User-streams] Make torch.Event weakref compatible (#164522)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164522
Approved by: https://github.com/williamwen42
ghstack dependencies: #164304
2025-10-29 23:06:31 +00:00
c3047938a0 [user-streams] Make device-agnostic streams weakref compatible (#164304)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164304
Approved by: https://github.com/williamwen42, https://github.com/colesbury
2025-10-29 23:06:31 +00:00
d2eff5d454 Add python stack trace to AOTI generated code (#160539)
Summary:
We add a thread_local KernelContext object so Strobelight (and other potential profilers) can read the stack trace information of the running kernel.

This will bring extra overhead, so we guard this behind the `cpp.enable_kernel_profile` flag.

Example output code:

```cpp
#include <torch/csrc/inductor/aoti_runtime/kernel_context_tls.h>
namespace torch::aot_inductor {
thread_local KernelContext* tls_kernel_context = nullptr;
}
// Other code .....
void AOTInductorModel::run_impl(
    AtenTensorHandle*
        input_handles, // array of input AtenTensorHandle; handles
                        // are stolen; the array itself is borrowed
    AtenTensorHandle*
        output_handles, // array for writing output AtenTensorHandle; handles
                        // will be stolen by the caller; the array itself is
                        // borrowed
    DeviceStreamType stream,
    AOTIProxyExecutorHandle proxy_executor
) {
    __check_inputs_outputs(input_handles, output_handles);
    auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 4);
    auto arg2_1 = std::move(inputs[0]);
    auto arg3_1 = std::move(inputs[1]);
    auto arg4_1 = std::move(inputs[2]);
    auto arg5_1 = std::move(inputs[3]);
    [[maybe_unused]] auto& fc1_weight = constants_->at(0);
    [[maybe_unused]] auto& fc1_bias = constants_->at(1);
    inputs.clear();
    [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
    static constexpr int64_t int_array_0[] = {8L, 16L};
    static constexpr int64_t int_array_1[] = {16L, 1L};
    AtenTensorHandle buf0_handle;
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf0_handle));
    RAIIAtenTensorHandle buf0(buf0_handle);
    // Topologically Sorted Source Nodes: [linear], Original ATen: [aten.t, aten.addmm]
    // [Provenance debug handles] aoti_torch_cpu_addmm_out:1
    static constexpr int64_t int_array_2[] = {10L, 16L};
    static constexpr int64_t int_array_3[] = {1L, 10L};
    {
    KernelContextGuard _ctx("aoti_torch_cpu_addmm_out", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 829, in forward
    x = self.fc1(x)
  File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/linear.py", line 134, in forward
    return F.linear(input, self.weight, self.bias)
)");
    RAIIAtenRecordFunctionHandle record_aoti_torch_cpu_addmm_out_("aoti_torch_cpu_addmm_out", nullptr);
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cpu_addmm_out(buf0, fc1_bias, arg2_1, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(fc1_weight, 2, int_array_2, int_array_3, 0L)), 1L, 1L));
    }
    arg2_1.reset();
    auto buf1 = std::move(buf0);  // reuse
    static constexpr int64_t int_array_4[] = {10L, 20L};
    static constexpr int64_t int_array_5[] = {20L, 1L};
    AtenTensorHandle buf2_handle;
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf2_handle));
    RAIIAtenTensorHandle buf2(buf2_handle);
    // [Provenance debug handles] cpp_fused_mul_relu_sigmoid_0:2
    {
    KernelContextGuard _ctx("cpp_fused_mul_relu_sigmoid_0", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 831, in forward
    x = self.sigmoid(x)
  File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/activation.py", line 359, in forward
    return torch.sigmoid(input)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 830, in forward
    x = self.relu(x)
  File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/activation.py", line 144, in forward
    return F.relu(input, inplace=self.inplace)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 832, in forward
    d = a * 3.14
)");
    cpp_fused_mul_relu_sigmoid_0((float*)(buf1.data_ptr()), (const float*)(arg3_1.data_ptr()), (float*)(buf2.data_ptr()));
    }
    arg3_1.reset();
    static constexpr int64_t int_array_6[] = {10L, 30L};
    static constexpr int64_t int_array_7[] = {30L, 1L};
    AtenTensorHandle buf3_handle;
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_6, int_array_7, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf3_handle));
    RAIIAtenTensorHandle buf3(buf3_handle);
    // Topologically Sorted Source Nodes: [mul, addmm], Original ATen: [aten.mul, aten.addmm]
    // [Provenance debug handles] aoti_torch_cpu_addmm_out:3
    {
    KernelContextGuard _ctx("aoti_torch_cpu_addmm_out", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 833, in forward
    y = torch.addmm(c, d, b)
)");
    RAIIAtenRecordFunctionHandle record_aoti_torch_cpu_addmm_out_("aoti_torch_cpu_addmm_out", nullptr);
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cpu_addmm_out(buf3, arg5_1, buf2, arg4_1, 1L, 1L));
    }
    arg4_1.reset();
    arg5_1.reset();
    buf2.reset();
    auto buf4 = std::move(buf3);  // reuse
    // [Provenance debug handles] cpp_fused_gelu_1:4
    {
    KernelContextGuard _ctx("cpp_fused_gelu_1", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 834, in forward
    z = torch.nn.functional.gelu(y)
)");
    cpp_fused_gelu_1((float*)(buf4.data_ptr()));
    }
    output_handles[0] = buf1.release();
    output_handles[1] = buf4.release();
} // AOTInductorModel::run_impl
```

Test Plan:
```
buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r  stack_traces
```

Rollback Plan:

Differential Revision: D78436007

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160539
Approved by: https://github.com/yiming0416
2025-10-29 22:47:52 +00:00
972030fe2e Revert "[pytree] add treespec_{leaf,tuple,dict} functions for args_spec modification (#160843)"
This reverts commit 284716a691580cf0508a7c5a4f9f7306a32092ad.

Reverted https://github.com/pytorch/pytorch/pull/160843 on behalf of https://github.com/atalman due to failing internal torchrec test' ([comment](https://github.com/pytorch/pytorch/pull/160843#issuecomment-3464647878))
2025-10-29 22:46:48 +00:00
d401e4e70a [ROCm][CUDA] add unit test utility busy_wait_for_flag (#166218)
torch.cuda._busy_wait_for_flag() will launch a kernel that spins until a flag is set by a corresponding torch.cuda._clear_flag(). These **must** be run on separate streams or it will deadlock.

When used correctly, these kernels put work on the GPU that is more predictable than torch.cuda._sleep() in cases where the unit test depends on the GPU being busy.
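A minimal usage sketch based on the description above (these are private helpers and the exact signatures are assumptions):

```python
import torch

if torch.cuda.is_available():
    wait_stream = torch.cuda.Stream()
    clear_stream = torch.cuda.Stream()

    with torch.cuda.stream(wait_stream):
        torch.cuda._busy_wait_for_flag()   # kernel spins until the flag is set
    with torch.cuda.stream(clear_stream):  # must be a different stream, or it deadlocks
        torch.cuda._clear_flag()

    torch.cuda.synchronize()
```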

Fixes #120318.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166218
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-29 22:40:23 +00:00
f1a3440715 FC/BC policy for libtorch stable ABI (#163991)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163991
Approved by: https://github.com/janeyx99
ghstack dependencies: #163899
2025-10-29 22:35:36 +00:00
82ff07c788 Add py 3.14 CI docker build pytorch-linux-jammy-py3.14-clang12 (#164791)
Related to https://github.com/pytorch/pytorch/issues/156856
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164791
Approved by: https://github.com/huydhn, https://github.com/malfet, https://github.com/albanD
2025-10-29 22:21:22 +00:00
737b510015 Update (base update)
[ghstack-poisoned]
2025-10-29 15:06:45 -07:00
e8a8a6aa0d Update
[ghstack-poisoned]
2025-10-29 15:06:45 -07:00
e0604d3170 [dynamo] Fix ListIterator tracking mutations to original list (#166350)
Currently ListIteratorVariable copies the underlying list, which prevents it
from seeing mutations to the original list. Remove the copy to match CPython behavior.
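A small illustration of the CPython behavior being matched:

```python
lst = [1, 2, 3]
it = iter(lst)
print(next(it))   # 1
lst[1] = 20       # mutate the list after the iterator was created
print(next(it))   # 20 -- a CPython list iterator sees the mutation
```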

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166350
Approved by: https://github.com/guilhermeleobas
ghstack dependencies: #166349, #162768
2025-10-29 21:54:37 +00:00
8101fd46d4 [dynamo] Implement iter with a polyfill (#162768)
Currently most variable trackers implement `iter` via `_call_iter_tuple_list`.
This makes it difficult to customize the behavior of `iter` for different
variable types.  Instead, implement `iter` via a polyfill, which will delegate
to the appropriate `__iter__` method.

While this method is more flexible, it increases the overhead of dynamo tracing.
For example, `iter(x)` will generate 9x more instructions than the current
implementation for common iterable types.  Microbenchmarking shows a ~6x
slowdown for this operation.  I suspect this would be much less for realistic
workloads, but more work would be needed to get specific numbers.  If the
performance is a concern we could also consider adding a fast path for types
that are known to correctly implement `__iter__`.
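A rough sketch of what an `iter` polyfill can look like (illustrative only; not the actual torch._dynamo polyfill):

```python
_MISSING = object()

def iter_polyfill(obj, sentinel=_MISSING):
    if sentinel is _MISSING:
        # One-argument form: delegate to the type's own __iter__, so each
        # variable type customizes iteration in a single place.
        return obj.__iter__()
    # Two-argument form: call `obj` repeatedly until it returns `sentinel`.
    def _gen():
        while True:
            value = obj()
            if value == sentinel:
                return
            yield value
    return _gen()

print(list(iter_polyfill([1, 2, 3])))  # [1, 2, 3]
```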
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162768
Approved by: https://github.com/guilhermeleobas
ghstack dependencies: #166349
2025-10-29 21:54:37 +00:00
3d4a2d8a93 [dynamo] Add __iter__ for iterable VariableTrackers (#166349)
This is in preparation for implementing iter with a polyfill

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166349
Approved by: https://github.com/guilhermeleobas
2025-10-29 21:54:37 +00:00
59ddfb69a7 [cpu/gpu split] (#165696)
Summary: CPU/GPU split. CUDA is the default due to some downstream target configurations.

Test Plan: test in CI

Differential Revision: D80712802

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165696
Approved by: https://github.com/jeffdaily, https://github.com/malfet, https://github.com/atalman
2025-10-29 21:44:52 +00:00
bebabd7fce [Graph Partition] move custom rules to inductor config (#166458)
This PR adds `custom_should_partition_ops: list[str]` to specify the names of custom ops at which graph partition happens. It works with the cache since it is a `list[str]` in the config file. The op name should be of the format "mylib::baz".

Close: #165341

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166458
Approved by: https://github.com/ProExpertProg, https://github.com/eellison, https://github.com/zou3519
2025-10-29 21:43:58 +00:00
56a809aa07 [DTensor] Fix torch.all() using incorrect reduction operator (#165924)
Fixes #165923
Corrects the reduction operation to be product.

Enables "all" in the boolean tensor tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165924
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-10-29 20:58:35 +00:00
b33762bd2f Fix incomplete test_memory_plots_metadata (#166508)
The different context cases were not fully tested before this PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166508
Approved by: https://github.com/Skylion007
2025-10-29 20:55:00 +00:00
f02708c2be [DeviceMesh] Remove slicing submesh warning messages and clean up in fsdp params (#166466)
Differential Revision: [D85735294](https://our.internmc.facebook.com/intern/diff/D85735294)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166466
Approved by: https://github.com/fegin
2025-10-29 20:52:49 +00:00
a186aa8d6c [ONNX] Change stacklevel in warning message for export (#166558)
Change the stacklevel to 3 so that the warning shows the user callsite (i.e., where the user calls torch.onnx.export).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166558
Approved by: https://github.com/titaiwangms
2025-10-29 20:45:25 +00:00
48c3b71ecc transform fr traces for ft (#166149)
Summary:
- the ranks in the default pg config are local ranks
- however fr trace analysis requires them to be global ranks
- so, based on a CLI flag, we transform the local ranks to global ranks before the analysis kicks in

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166149
Approved by: https://github.com/fduwjj
2025-10-29 20:44:48 +00:00
2c9f877fa7 Revert "[PyTorch] Improve aarch64 performance of bfloat16 ops (#166028)"
This reverts commit 3e77a2b478f596a8a0aef0af502f6bb1a247aa85.

Otherwise it fails the ARM build with older compilers, with errors that look
as follows:
```
vec128_bfloat16_neon.h:666:12: error: operation not permitted on type ‘bfloat16_t’
  666 |   return (-x) * y - z;
```

For more self-contained example see https://godbolt.org/z/bbY4xWh45
(that compiles the same code using clang-15 and clang-19)
2025-10-29 13:35:59 -07:00
fc540cefd4 set pg name based on ranks (#166182)
Summary:
- in torchft we have multiple default pg's, 1 for each task group
- for flight recorder to work, each of these need to have a different name, so entries can be matched
- change the `init_process_group` API to optionally take a list of ranks; if provided, we use the hash of the ranks as the name of the pg. For torchft, we'll pass global ranks here so the default pg has a different name on each task group (a minimal naming sketch follows below)
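A minimal sketch of the naming idea (illustrative only; the actual hashing in the PR may differ):

```python
import hashlib

def pg_name_from_ranks(ranks):
    # Derive a deterministic name from the (global) ranks so each task group's
    # default pg gets a distinct, stable name that flight recorder can match on.
    payload = ",".join(str(r) for r in sorted(ranks)).encode()
    return hashlib.sha1(payload).hexdigest()[:16]

print(pg_name_from_ranks([0, 8, 16, 24]))
print(pg_name_from_ranks([1, 9, 17, 25]))
```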

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166182
Approved by: https://github.com/fduwjj
2025-10-29 20:13:48 +00:00
d1a6e006e0 Fix syntax for pyrefly errors (#166496)
Last one! This ensures all existing suppressions match the syntax expected and will silence only one error code

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166496
Approved by: https://github.com/Skylion007, https://github.com/mlazos
2025-10-29 20:00:25 +00:00
e349670c32 Update (base update)
[ghstack-poisoned]
2025-10-29 11:57:21 -07:00
685b332015 Update
[ghstack-poisoned]
2025-10-29 11:57:21 -07:00
fa560e1158 [ao][pruning] Replace assert statements with AssertionError exceptions (#164926)
Replace assert statements with explicit ValueError exceptions to ensure the validation check is not removed when Python runs with the optimization flag (-O).

This is a draft PR to confirm the process.

Fixes partially #164878.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164926
Approved by: https://github.com/fffrog, https://github.com/albanD

Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>
2025-10-29 17:46:46 +00:00
a3fe1825aa Fix incomplete torch.cdist tests (#166507)
Because the `p` value is not used.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166507
Approved by: https://github.com/Skylion007
2025-10-29 17:11:07 +00:00
deb776319b [ROCm] Reduce duplication in bfloat16_support_literal definition (#166147)
This PR refactors the bfloat16_support_literal constant in the PyTorch build logic to eliminate duplicated ROCm-specific code.

Previously, there were two nearly identical branches for ROCM_VERSION < 70000 and ROCM_VERSION >= 70000, differing only by a single typedef. These have been unified into one conditional block with a minimal version guard inside. (https://github.com/ROCm/pytorch/pull/2502)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166147
Approved by: https://github.com/jerrymannil, https://github.com/jeffdaily
2025-10-29 16:59:03 +00:00
d7040e6d75 Revert "[dynamo][guards] 1/N Guard selectively for DTensor (#165824)"
This reverts commit ee7434be822cf6e75b4566d8159f550ee233d8ae.

Reverted https://github.com/pytorch/pytorch/pull/165824 on behalf of https://github.com/anijain2305 due to internal job failed ([comment](https://github.com/pytorch/pytorch/pull/165824#issuecomment-3462667536))
2025-10-29 16:52:31 +00:00
35f3572fa4 Revert "[ROCm] Enable group gemm through CK (#166334)"
This reverts commit 1fa520ea654f5fc0b3c65ce6e056dd73442dd65d.

Reverted https://github.com/pytorch/pytorch/pull/166334 on behalf of https://github.com/atalman due to Internal build failures ([comment](https://github.com/pytorch/pytorch/pull/166334#issuecomment-3462640668))
2025-10-29 16:45:02 +00:00
bc5111cd8d [Inductor] Prevent kernel fusion with too many unique inputs and outputs (#166275)
MTIA Triton currently has a limitation: it can't support cases where there are too many input/output buffers. This PR adds the limit to prevent large fusions with many input/output buffers.

Differential Revision: [D85509351](https://our.internmc.facebook.com/intern/diff/D85509351/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166275
Approved by: https://github.com/eellison
ghstack dependencies: #166274
2025-10-29 16:41:34 +00:00
398fdd32bb [Inductor] Lower fallback nodes annotated with "should_fallback" (#166339)
Summary:
This PR introduces an inductor-level fallback mechanism that gives users control over which operations or subgraphs Inductor should lower and which should fall back to preexisting kernels. This has similar motivation as #164776 in providing flexibility to selectively disable Inductor lowering for specific nodes.

The implementation simply adds a check for the `"should_fallback"` metadata annotation on FX graph nodes. If this is set to `True`, the lowerer falls back before attempting the normal lowering path. Note that since these are user-directed fallbacks dependent upon specific, customized conditions, use `add_to_fallback_set=False` to avoid permanent overwrites of inductor's lowering/fallback rules.

Simple example marking nodes for fallback based on custom predicates:

```
def should_fallback_predicate(node: torch.fx.Node, pred: Callable[[torch.fx.Node], bool]):
    # Apply the predicate and mark the node for fallback if needed
    if pred(node):
        node.meta["should_fallback"] = True
```

Test Plan: added a CI test

Differential Revision: D85347587

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166339
Approved by: https://github.com/blaine-rister, https://github.com/eellison
2025-10-29 16:33:55 +00:00
5fd1d41e62 Revert "[user-streams] Make device-agnostic streams weakref compatible (#164304)"
This reverts commit bfc2050db975e589795cd3eceaed2e83bf89ad35.

Reverted https://github.com/pytorch/pytorch/pull/164304 on behalf of https://github.com/atalman due to Breaks periodic: test/dynamo/test_streams.py::TestStreams::test_stream_weakref [GH job link](https://github.com/pytorch/pytorch/actions/runs/18909552619/job/53979171605) [HUD commit link](cde81e92b9) ([comment](https://github.com/pytorch/pytorch/pull/164304#issuecomment-3462489278))
2025-10-29 16:09:54 +00:00
c594950e86 Revert "nn.Linear: nD contiguous input + bias -- dispatch to addmm also when weight is sparse (#166071)"
This reverts commit 467c21ad9ae4133c20a3c098a0355e9ac20d48aa.

Reverted https://github.com/pytorch/pytorch/pull/166071 on behalf of https://github.com/atalman due to Multiple CI breakages: test/profiler/test_profiler_tree.py::TestProfilerTree::test_profiler_experimental_tree_with_stack_and_modules [GH job link](https://github.com/pytorch/pytorch/actions/runs/18909087335/job/53976915830) [HUD commit link](467c21ad9a) ([comment](https://github.com/pytorch/pytorch/pull/166071#issuecomment-3462458968))
2025-10-29 16:05:30 +00:00
14102fb1f3 add new line in log (#164240)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164240
Approved by: https://github.com/ColinPeppler, https://github.com/Skylion007, https://github.com/ezyang
ghstack dependencies: #164075
2025-10-29 16:03:32 +00:00
5cdbcb5233 Revert "[User-streams] Make torch.Event weakref compatible (#164522)"
This reverts commit cde81e92b95eee9af2879c9c75f7b03699ca72ad.

Reverted https://github.com/pytorch/pytorch/pull/164522 on behalf of https://github.com/atalman due to Breaks periodic: test/dynamo/test_streams.py::TestStreams::test_stream_weakref [GH job link](https://github.com/pytorch/pytorch/actions/runs/18909552619/job/53979171605) [HUD commit link](cde81e92b9) ([comment](https://github.com/pytorch/pytorch/pull/164522#issuecomment-3462450571))
2025-10-29 16:03:03 +00:00
eae701cad0 Add scaffolding for StableIValue FC/BC (no PoC) (#164332)
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type); #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land

**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164332
Approved by: https://github.com/janeyx99
ghstack dependencies: #164356, #166373, #163683
2025-10-29 15:41:45 +00:00
8f51556daa Add scaffolding for aoti_torch_call_dispatcher BC with native ops (#163683)
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_build_version`
- Allow registration of StableIValue stack --> IValue stack adapters for schema changes

#### Note: This PR does not include a linter that tells the user to add the upgrader if the schema changes, which is an important piece that will be added in a separate PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163683
Approved by: https://github.com/janeyx99
ghstack dependencies: #164356, #166373
2025-10-29 15:41:45 +00:00
c0bbda37e8 Move static from_ivalue/to_ivalue to new shim_common.cpp (#166373)
Move `from_ivalue` and `to_ivalue` and their dependents `StableIValueBoxedKernel`, `aoti_torch_library_impl`, and `aoti_torch_call_dispatcher` into a new, non-AOTI `shim_common.cpp`.

This is in prep for the PRs above, where I add version-aware v2s (`torch_call_dispatcher` and `torch_library_impl`).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166373
Approved by: https://github.com/janeyx99
ghstack dependencies: #164356
2025-10-29 15:41:36 +00:00
fefb546b91 Add TORCH_TARGET_VERSION for stable ABI (#164356)
And update it so comparisons can be done by the preprocessor

**Note: We also need to gate in shim.h and figure out how to enforce this**

Differential Revision: [D85683549](https://our.internmc.facebook.com/intern/diff/D85683549)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164356
Approved by: https://github.com/janeyx99
2025-10-29 15:41:28 +00:00
d6d6fa26f5 Revert "bwd pass (#164504)"
This reverts commit f36f372acc28062e0988d84699c62689b0d89a6e.

Reverted https://github.com/pytorch/pytorch/pull/164504 on behalf of https://github.com/jeffdaily due to CI had been clean for both cuda and rocm before merge, broke post merge? ([comment](https://github.com/pytorch/pytorch/pull/164504#issuecomment-3462116676))
2025-10-29 15:10:40 +00:00
467c21ad9a nn.Linear: nD contiguous input + bias -- dispatch to addmm also when weight is sparse (#166071)
As per title.

It seems safe to generalize to arbitrary contiguous inputs, since `at::matmul` is likely to do the flattening to avoid `baddbmm`.

Additionally, we guard that the bias is 1D and contiguous, which guarantees it is fused with no copies.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166071
Approved by: https://github.com/ngimel
2025-10-29 13:13:40 +00:00
4a94591321 filter out alloc-free pairs from trace plot (#165752)
Summary:
When dealing with a large memory trace, the resulting plot can be challenging to interpret and analyze.
This commit introduces a feature that enables filtering of allocations that have already been freed, providing a more focused view.
The remaining events in the plot often warrant closer examination, as they may be indicative of potential out-of-memory (OOM) issues.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165752
Approved by: https://github.com/zdevito
2025-10-29 12:44:54 +00:00
5e7272b60a Revert "[BE] Move GreenContext implementation details to cpp (#166462)"
This reverts commit afaaaa314cc9358a10e9b1986642d49c00773560.

Reverted https://github.com/pytorch/pytorch/pull/166462 on behalf of https://github.com/atalman due to multiple internal build failures ([comment](https://github.com/pytorch/pytorch/pull/166462#issuecomment-3461145801))
2025-10-29 11:59:41 +00:00
1dd6b76914 Revert "[1/N] Remove unused loop variables (#166258)"
This reverts commit 76b2c37045e52540ec51e967aa7b6436a6b9b174.

Reverted https://github.com/pytorch/pytorch/pull/166258 on behalf of https://github.com/atalman due to breaks test/distributed/test_serialization.py::TestSerialization::test_weights_only [GH job link](https://github.com/pytorch/pytorch/actions/runs/18894311802/job/53929321703) [HUD commit link](76b2c37045) ([comment](https://github.com/pytorch/pytorch/pull/166258#issuecomment-3460964612))
2025-10-29 11:10:37 +00:00
284716a691 [pytree] add treespec_{leaf,tuple,dict} functions for args_spec modification (#160843)
The goal of this PR is to provide a standard way to create simple treespec instances and hide the implementation details of the `PyTreeSpec` class.

Changes:

1. Add function `treespec_leaf()` to replace `LeafSpec()`.
2. Add function `treespec_tuple(...)` and `treespec_dict(...)` to create treespec for `tuple` / `dict` which is used for `*args` / `**kwargs`. This avoids direct modification to `treespec` instances that rely on the implementation details of the `PyTreeSpec` class.
3. Change `len(spec.children_specs)` to `spec.num_children`.
4. Change `isinstance(spec, LeafSpec)` to `spec.is_leaf()`.
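A short sketch of the new helpers in use (a hedged illustration based on the list above; exact signatures may differ):

```python
import torch.utils._pytree as pytree

leaf = pytree.treespec_leaf()                     # instead of pytree.LeafSpec()
args_spec = pytree.treespec_tuple([leaf, leaf])   # spec for two positional args
kwargs_spec = pytree.treespec_dict({"x": leaf})   # spec for one keyword arg

_, spec = pytree.tree_flatten(((1, 2), {"x": 3}))
print(spec.num_children)  # instead of len(spec.children_specs)
print(spec.is_leaf())     # instead of isinstance(spec, LeafSpec)
```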

------

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160843
Approved by: https://github.com/mlazos
2025-10-29 09:16:24 +00:00
8b188647cf [2/N] Fix unused loop variables (#166500)
This PR removes unused loop variables.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166500
Approved by: https://github.com/mlazos
2025-10-29 08:30:35 +00:00
96b61844a7 [BE]: Update nvshmem to 3.4.5 (#164046)
Release notes can be found here: https://docs.nvidia.com/nvshmem/release-notes-install-guide/release-notes/release-3405.html main difference is the addition of a CPU assisted IBGDA fallback which should allow NVSHMEM IBGDA to work on way more systems without admin intervention and without using GDRCopy.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164046
Approved by: https://github.com/ezyang, https://github.com/kwen2501
2025-10-29 07:32:05 +00:00
1b655a87ef [xpu][test] Enable more UTs for Intel GPU. (#166047)
This PR enables additional Inductor unit tests for Intel GPU. Due to the increased number of test cases, the number of runners has been increased from 8 to 12 to prevent CI timeouts.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166047
Approved by: https://github.com/jansel

Co-authored-by: Deng, Daisy <daisy.deng@intel.com>
Co-authored-by: Jason Ansel <jansel@jansel.net>
2025-10-29 06:25:36 +00:00
cb6966704c Add merge rule for PrivateUse1 Module (#166394)
Add merge rights for the following people:
- albanD
- fffrog
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166394
Approved by: https://github.com/ezyang
2025-10-29 06:13:44 +00:00
982686a3d9 Update (base update)
[ghstack-poisoned]
2025-10-28 22:15:09 -07:00
1abaea6382 Update
[ghstack-poisoned]
2025-10-28 22:15:09 -07:00
17d5aa4767 disable jiterator for complex tan and tanh (#165250)
Fixes #100842

Disable jiterator for complex tan and tanh kernels due to accuracy issues, matching the existing approach used for acos, acosh, asin, and asinh. Reverts to thrust implementation which provides better numerical accuracy.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165250
Approved by: https://github.com/ezyang
2025-10-29 04:59:01 +00:00
cde81e92b9 [User-streams] Make torch.Event weakref compatible (#164522)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164522
Approved by: https://github.com/williamwen42
ghstack dependencies: #162903, #164343, #164344, #164507, #162901, #164304
2025-10-29 04:57:23 +00:00
bfc2050db9 [user-streams] Make device-agnostic streams weakref compatible (#164304)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164304
Approved by: https://github.com/williamwen42, https://github.com/colesbury
ghstack dependencies: #162903, #164343, #164344, #164507, #162901
2025-10-29 04:57:23 +00:00
c5701d0ab5 [ONNX] Create fake implementations for onnx ops; fix boolean mask in attention (#165780)
Previously we relied on the concrete implementation to generate the fake implementation. This made the fake implementation overly complicated and broke in some cases with dynamic shapes.

This PR updates onnx op registration to instead take a dedicated fake implementation.

**Also fixed: when a boolean mask was supplied to torch SDPA, it was previously negated, which is incorrect.**
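For reference, a minimal sketch of the PyTorch-side convention the fix aligns with: a boolean `attn_mask` marks positions to keep (True = attend), not positions to drop.

```python
import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 2, 4, 8)
# True entries participate in attention; the bug was negating this mask
# when exporting to ONNX.
mask = torch.ones(4, 4, dtype=torch.bool).tril()
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
```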

Fix https://github.com/pytorch/pytorch/issues/164909 Also taken changes from https://github.com/pytorch/pytorch/pull/156635

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165780
Approved by: https://github.com/titaiwangms
2025-10-29 04:51:49 +00:00
23669d02a6 [user-cuda-streams] Add cuda streams test suite (#162901)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162901
Approved by: https://github.com/williamwen42
ghstack dependencies: #162903, #164343, #164344, #164507
2025-10-29 04:46:08 +00:00
e8d887ae3f [user-streams] Support streams as contexts (#164507)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164507
Approved by: https://github.com/williamwen42
ghstack dependencies: #162903, #164343, #164344
2025-10-29 04:46:08 +00:00
774abb018e [ptd] Fix test config in destroy_pg (#166463)
Summary: When device_type is CPU, we will not use the device id from CUDA, which was enabled in https://github.com/pytorch/pytorch/pull/161015. However, we should not exclude the case where the accelerator itself is CPU. This PR fixes that.

Test Plan: UT

Differential Revision: D85714901

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166463
Approved by: https://github.com/mori360, https://github.com/fegin
2025-10-29 04:35:04 +00:00
0e19561e23 Add back Windows and macOS to tensorboard tests (#166389)
This PR adds back tensorboard tests on Windows and macOS because the dependency issue is resolved.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166389
Approved by: https://github.com/Skylion007
2025-10-29 04:34:57 +00:00
1fa520ea65 [ROCm] Enable group gemm through CK (#166334)
Fixes #161366
All 4 dimension combinations are supported:
2d-2d, 2d-3d, 3d-3d, 3d-2d. The corresponding test cases in test_matmul_cuda pass
for both the forward and backward pass.
The CK path is enabled for gfx942 and gfx950.
ToDo: enable support on gfx90a; the CK kernel used in this commit produces a GPU error there
and may require a different CK kernel config, based on the profiler results on gfx90a.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166334
Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony
2025-10-29 04:32:38 +00:00
c2e3cc7aed [Inductor] No longer throw error in bmm out_dtype lowering due to template heuristics (#166457)
Fixes https://github.com/pytorch/pytorch/issues/165892

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166457
Approved by: https://github.com/coconutruben
2025-10-29 04:27:13 +00:00
5849eea129 [vision hash update] update the pinned vision hash (#166356)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166356
Approved by: https://github.com/pytorchbot
2025-10-29 04:14:16 +00:00
924482a6f6 Replace NUMA inheritance approach (#166026)
# Context
Previously, we would modify the parent process's NUMA bindings in order to force child processes to inherit them.

However, this would not work correctly if `start_method="forkserver"`, because the subprocesses would actually inherit their bindings from the forkserver middleman process. In this case, the inherited affinity would actually be incorrect for all but the first subprocess (because the forkserver process would get created lazily, and hence inherit and then stick with the bindings intended for the first subprocess).

# This PR
* `str` entrypoints: Use `numactl` CLI
* `Callable` entrypoints: Wrap the `Callable` entrypoint and call `os.sched_setaffinity` inside it (see the sketch below).

Hopefully this will be the last necessary iteration.
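A minimal sketch of the `Callable`-wrapping idea (the helper name is hypothetical; `os.sched_setaffinity` is Linux-only):

```python
import os
from typing import Callable

def _wrap_with_affinity(entrypoint: Callable, logical_cpus: set) -> Callable:
    # Pin the current process (pid 0) to the given CPUs, then run the
    # user's entrypoint. Because the pinning happens inside the child
    # itself, it is correct under fork, spawn, and forkserver alike.
    def wrapped(*args, **kwargs):
        os.sched_setaffinity(0, logical_cpus)
        return entrypoint(*args, **kwargs)
    return wrapped
```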

# Test Plan
## Automated
`$ pytest test/test_numa_binding.py`

## Manual
Verified flops/sec and memory locality wins on several different types of jobs
* `Callable` with forkserver
* `str` entrypoint with spawn
* `Callable` entrypoint with spawn

More details in [this doc (Meta-only).](https://docs.google.com/document/d/1vxD-OKYBTT27jbBwtW9iz9g0tNM0u-i0tiTJg_ieQA8/edit?tab=t.scjv58yswi64)

# Later PR
Update all the documentation when we're confident this has stabilized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166026
Approved by: https://github.com/d4l3k

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
2025-10-29 03:58:44 +00:00
20be077085 [Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316)
**Summary:**
Support masked vectorization for the tail_loop for the float64 dtype.

**Example:**
```
import torch

def fn(x):
    return x * x

x = torch.randn((22, 22), dtype=torch.double)
with torch.no_grad():
    compiled_fn = torch.compile(fn)
    compiled_fn(x)
```

**Generated code:**

- Before
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const double* in_ptr0,
                       double* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
                {
                    auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = tmp0 * tmp0;
                    tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
                {
                    for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++)
                    {
                        auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
                        auto tmp1 = double(tmp0 * tmp0);
                        out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1;
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (22, 22), (22, 1))
        buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
        # [Provenance debug handles] cpp_fused_mul_0:1
        cpp_fused_mul_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```
- After
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const double* in_ptr0,
                       double* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
                {
                    auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = tmp0 * tmp0;
                    tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
                {
                    auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
                    auto tmp1 = tmp0 * tmp0;
                    tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (22, 22), (22, 1))
        buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
        # [Provenance debug handles] cpp_fused_mul_0:1
        cpp_fused_mul_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316
Approved by: https://github.com/mingfeima, https://github.com/jansel
2025-10-29 03:30:38 +00:00
94eaeb9cb8 [Conv1d] Check overflow before we compute padding size. (#162363)
Fixes https://github.com/pytorch/pytorch/issues/161877
also fixes https://github.com/pytorch/pytorch/issues/161875

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162363
Approved by: https://github.com/jbschlosser
2025-10-29 03:27:20 +00:00
753d9bd806 Introduce a new API torch.xpu.set_per_process_memory_fraction (#165510)
# Motivation
Aligned with other backends, this PR introduces a new API `torch.xpu.set_per_process_memory_fraction` to allow users to customize the allowed memory for a single process.
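Usage sketch, assuming the signature mirrors `torch.cuda.set_per_process_memory_fraction`:

```python
import torch

if torch.xpu.is_available():
    # Cap this process at 50% of device 0's total XPU memory.
    torch.xpu.set_per_process_memory_fraction(0.5, device=0)
```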

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165510
Approved by: https://github.com/EikanWang, https://github.com/ezyang
ghstack dependencies: #165508, #165509
2025-10-29 03:24:52 +00:00
dd1fe7c22f Remove clang-tidy type conversion suppressions (#166398)
This PR fixes and removes type conversion suppressions of clang-tidy.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166398
Approved by: https://github.com/Skylion007
2025-10-29 03:21:16 +00:00
695cb0d342 [2/N][Fix] Fix typo in test folder (#166374)
Fix typo in test folder.

_typos.toml
```bash
[default.extend-words]
nd = "nd"
arange = "arange"
Nd = "Nd"
GLOBALs = "GLOBALs"
hte = "hte"
iy = "iy"
PN = "PN"
Dout = "Dout"
optin = "optin"
gam = "gam"
PTD = "PTD"
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166374
Approved by: https://github.com/cyyever, https://github.com/ezyang
2025-10-29 03:02:07 +00:00
1764f3a9c8 [Fix] fix grammar errors in PyTorch docs (#166158)
Fix several grammar errors in PyTorch docs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166158
Approved by: https://github.com/yewentao256, https://github.com/cyyever, https://github.com/ezyang
2025-10-29 03:01:07 +00:00
c9eabadc5e Suppress std::hardware_destructive_interference_size warning on GCC 13+ (#166297)
# Motivation
In https://github.com/pytorch/pytorch/pull/145591, `std::hardware_destructive_interference_size` was introduced in CUDACachingAllocator. Later, https://github.com/pytorch/pytorch/pull/160067 moved it to `c10/core/alignment.h` for code reuse.
However, on **GCC 13+** using `std::hardware_destructive_interference_size` triggers the following warning:
```bash
warning: use of ‘std::hardware_destructive_interference_size’ [-Winterference-size]
/home/pt-gpu/4T-4652/guangyey/stock-pytorch/aten/src/ATen/core/CachingHostAllocator.h:42:16: note: its value can vary between compiler versions or with different ‘-mtune’ or ‘-mcpu’ flags
/home/pt-gpu/4T-4652/guangyey/stock-pytorch/aten/src/ATen/core/CachingHostAllocator.h:42:16: note: if this use is part of a public ABI, change it to instead use a constant variable you define
/home/pt-gpu/4T-4652/guangyey/stock-pytorch/aten/src/ATen/core/CachingHostAllocator.h:42:16: note: the default value for the current CPU tuning is 64 bytes
/home/pt-gpu/4T-4652/guangyey/stock-pytorch/aten/src/ATen/core/CachingHostAllocator.h:42:16: note: you can stabilize this value with ‘--param hardware_destructive_interference_size=64’, or disable this warning with ‘-Wno-interference-size’
```

# Solution
- Solution 1: Replace `c10::hardware_destructive_interference_size` with a constant 64.
```cpp
constexpr std::size_t hardware_destructive_interference_size = 64;
```

- Solution 2: Add `-Wno-interference-size` to 8d4e48831e/cmake/public/utils.cmake (L386) to suppress the warning.

# Additional Context
The current implementation uses the second approach. If the reviewers prefer the first approach, I am happy to update it accordingly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166297
Approved by: https://github.com/ezyang
2025-10-29 02:57:46 +00:00
c201a1cab1 [OpenReg] Update Installation in README.md (#166235)
It is recommended to use `python -m pip install --no-build-isolation .` instead of `pip3 install --no-build-isolation .` because most of us use a virtual environment, and the latter probably resolves to the system `pip3` rather than the one from conda or uv. We need to make it consistent with the Python we use, and it is also consistent with how `torch` is installed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166235
Approved by: https://github.com/fffrog, https://github.com/ezyang
2025-10-29 02:57:26 +00:00
e105a47575 [user-streams] Have StreamVariable inherit from StreamContextVariable (#164344)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164344
Approved by: https://github.com/williamwen42
ghstack dependencies: #162903, #164343
2025-10-29 02:49:54 +00:00
aab27b051a [user-streams] Move StreamContextVariable into streams module (#164343)
finish moving

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164343
Approved by: https://github.com/williamwen42, https://github.com/fxdawnn
ghstack dependencies: #162903
2025-10-29 02:49:54 +00:00
f8b4c00294 intfs + unit tests (#164723)
Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Differential Revision: D83727222

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164723
Approved by: https://github.com/aorenste
2025-10-29 02:32:19 +00:00
877f126e35 [MPS] Improve index_select error checking (#166468)
Just copy-n-paste overlap checks from
0d4992c170/aten/src/ATen/native/TensorAdvancedIndexing.cpp (L1620-L1622)

Very similar to https://github.com/pytorch/pytorch/pull/166425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166468
Approved by: https://github.com/dcci, https://github.com/Skylion007
2025-10-29 02:23:12 +00:00
4fada51ada Fix existing Pyrefly errors (#166439)
Trying to keep main as clean of type errors as possible until we are able to switch to just one checker.

This adds suppressions for existing type errors on main.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166439
Approved by: https://github.com/Skylion007
2025-10-29 02:08:02 +00:00
76b2c37045 [1/N] Remove unused loop variables (#166258)
This PR removes unused loop variables.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166258
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
2025-10-29 01:34:15 +00:00
adedf26e21 Support python slicing with tensor inputs. (#165074)
When the slice bound is a tensor, we decompose it to an `.item()` call and pass the resulting unbacked symbol to the slice to avoid a data-dependent error (DDE).
The diff also fixes an existing bug in `codegen_dynamic_slice_size` in the cpp wrapper: a `+1` should be a `-1`, making it match the
Python codegen.
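A hedged sketch of the user-visible pattern this enables (`fullgraph=True` just makes any graph break observable):

```python
import torch

@torch.compile(fullgraph=True)
def f(x, end):
    # `end` is a 0-d tensor; its value is read via end.item(), and the
    # resulting unbacked symbol is used as the slice bound, avoiding a DDE.
    return x[:end]

print(f(torch.arange(8), torch.tensor(5)))  # tensor([0, 1, 2, 3, 4])
```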

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165074
Approved by: https://github.com/Lucaskabela
2025-10-29 01:18:45 +00:00
bea89d6060 [PyTorch] Improve conversion from/to bool on aarch64+sve (#166330)
Summary:
We are adding autovec routines to convert to/from boolean values

We observed the following performance improvements when compiling for armv9-a+sve2+fp16+bf16

before:

bool->uint8->bool ===> 447.854us
bool->int8->bool ===> 445.609us
bool->int16->bool ===> 312.425us
bool->int32->bool ===> 324.368us
bool->float->bool ===> 320.929us
bool->float16->bool ===> 290.825us
bool->bfloat16->bool ===> 437.250us

after

bool->uint8->bool ===> 78.988us ----> 467% higher throughput
bool->int8->bool ===> 78.494us -----> 468% higher throughput
bool->int16->bool ===> 107.993us ----> 189% higher throughput
bool->int32->bool ===> 186.887us -----> 74% higher throughput
bool->float->bool ===> 188.048us ------> 71% higher throughput
bool->float16->bool ===> 102.789us --> 183% higher throughput
bool->bfloat16->bool ===> 105.809us -> 313% higher throughput
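The round-trip being measured is essentially the following (a sketch of one of the pairs above):

```python
import torch

b = torch.randint(0, 2, (1 << 20,), dtype=torch.bool)
h = b.to(torch.float16)   # bool -> float16
b2 = h.to(torch.bool)     # float16 -> bool
assert torch.equal(b, b2)
```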

Test Plan:
Correctness:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Performance:

buck2 run mode/opt //caffe2/benchmarks/operator_benchmark/fb:operator_benchmark_test

Reviewed By: mcfi

Differential Revision: D85533284

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166330
Approved by: https://github.com/mcfi
2025-10-29 01:09:34 +00:00
48e672d149 [dcp][state_dict] Make _flatten_optim_state_dict and _unflatten_optim_state_dict handle arbitrary-level of nested optim dictionaries by recursion (#165071)
Summary:
This updates the internal helper functions `_flatten_optim_state_dict` and `_unflatten_optim_state_dict` to handle arbitrary levels of nested dictionaries. With this, they can handle optimizers like Shampoo that have multiple levels of nested dictionaries. We parametrized `shampoo_checkpoint_test.py` to test both `flatten_optimizer_state_dict=True` and `False`.

Example shampoo nested dictionary:
```
{
    "state": {
        0: {
            "block_0": {
                "shampoo": {
                    "factor_matrices": {
                        0: torch.tensor([[0.0, 0.0], [0.0, 0.0]]),
                        1: torch.tensor([[0.0, 0.0], [0.0, 0.0]]),
                    },
                    "factor_matrix_indices": {},
                    "inv_factor_matrices": {
                        0: torch.tensor([[1.0, 0.0], [0.0, 1.0]]),
                        1: torch.tensor([[1.0, 0.0], [0.0, 1.0]]),
                    },
                },
            },
        },
    },
    "param_groups": [
        {
            "lr": 0.01,
            "betas": (0.9, 1.0),
            "beta3": 0.9,
            "epsilon": 1e-12,
            "momentum": 0.9,
            "dampening": 0.0,
            "weight_decay": 0.0,
            "max_preconditioner_dim": 5,
            "precondition_frequency": 1,
            "start_preconditioning_step": 1,
            "use_nesterov": False,
            "use_bias_correction": True,
            "use_decoupled_weight_decay": True,
            "grafting_config": AdaGradPreconditionerConfig(epsilon=0.001),
            "use_pin_memory": False,
            "distributed_config": SingleDeviceDistributedConfig(
                target_parameter_dimensionality=2
            ),
            "preconditioner_config": self._preconditioner_config,
            "params": [0],
        }
    ],
}
```

With this update, Shampoo optimizers can be used with torchtitan without any modification on the torchtitan side.

Also, we ensure it is still backward compatible with other torch optimizers like Adam.

Test Plan:
Shampoo test:
```
[irisz@devvm5551.cco0 ~/fbsource/fbcode (49fd905c0b)]$ buck2 test @//mode/opt //hpc/optimizers/distributed_shampoo/dev/distributor/gpu_tests:shampoo_checkpoint_test
Buck UI: https://www.internalfb.com/buck2/ff5e0f02-637d-4a73-b990-c0792a460216
Test UI: https://www.internalfb.com/intern/testinfra/testrun/9007199373078880
Network: Up: 0B  Down: 0B
Executing actions. Remaining     0/5
Command: test.
Time elapsed: 27.3s
Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0
```

torch.checkpoint.state_dict test.
```
[irisz@devvm5551.cco0 ~/fbsource/fbcode (49fd905c0b)]$  buck2 test @//mode/opt  //caffe2/test/distributed/checkpoint:test_state_dict
Buck UI: https://www.internalfb.com/buck2/bf367c2c-4d17-4d13-b6c6-f6058211bcf2
Test UI: https://www.internalfb.com/intern/testinfra/testrun/13792273976572052
Network: Up: 0B  Down: 11GiB  (reSessionID-9662acf0-f3de-4993-b4fe-880c33f91f78)
Executing actions. Remaining     0/5
Command: test.
Time elapsed: 5:31.9s
Tests finished: Pass 26. Fail 0. Fatal 0. Skip 0. Build failure 0
```

Differential Revision: D83619435

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165071
Approved by: https://github.com/fegin
2025-10-29 01:00:38 +00:00
afaaaa314c [BE] Move GreenContext implementation details to cpp (#166462)
- Remove all complex defines logic from the header
- Make the GreenContext constructor private, as it should only be created via the static method as a singleton
- Delete unused `getContext` and `getGreenContext` methods
- Rename `CUDA_HAS_GREEN_CONTEXT` to `HAS_CUDA_GREEN_CONTEXT()`, which results in a compilation error if one accidentally makes a typo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166462
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-29 00:40:11 +00:00
84fe848503 Fix pyrefly error syntax (2/n) (#166448)
Ensures each pyrefly suppression silences only one error code.

After this, only ~40 files are left to clean up.

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166448
Approved by: https://github.com/Skylion007
2025-10-29 00:36:40 +00:00
56afad4eb3 [precompile] Pickle and check closure variable properly. (#166351)
Summary:

Previously we didn't correctly handle the closure tuple when there was content in it. This adds code for serializing the tuple and merging it with the guard manager's local scope.

Test Plan:

pytest test/dynamo/test_aot_compile.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166351
Approved by: https://github.com/Lucaskabela
2025-10-29 00:28:21 +00:00
2a058bfecf [ROCm][tunableop] Fixed Offline Tuning file writing (#166074)
- Fixes an issue with offline tuning mode: we want to append to the existing file, not delete it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166074
Approved by: https://github.com/naromero77amd, https://github.com/jeffdaily
2025-10-29 00:25:45 +00:00
b01af191f6 Update (base update)
[ghstack-poisoned]
2025-10-28 17:15:06 -07:00
fc319ce128 Update
[ghstack-poisoned]
2025-10-28 17:15:06 -07:00
31e42eb732 Fix pyrefly ignore syntax (#166438)
Reformats pyrefly ignore suppressions so that each silences only one error code.

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166438
Approved by: https://github.com/Skylion007
2025-10-29 00:02:21 +00:00
a6f01826f9 Update (base update)
[ghstack-poisoned]
2025-10-28 17:00:52 -07:00
7b234db6a8 Update
[ghstack-poisoned]
2025-10-28 17:00:52 -07:00
a9b29caeae Add attention benchmarking numbers to pytorch operator microbenchmarks (#164155)
This pull request introduces a standardized YAML-based configuration system for transformer attention benchmarks, making it easier to run and manage comprehensive performance tests. It adds example configs, and a wrapper script to convert YAML configs into CLI arguments for the benchmark runner.

#### Next Steps:
CI Enablement: This change would further lead to running the attention ops in CI for regression tracking.

#### Developer flow: (Run locally)
`python score_mod.py --config configs/config_test.yaml`

#### Enabling CI run: https://github.com/pytorch/pytorch/pull/165915

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164155
Approved by: https://github.com/jbschlosser
2025-10-28 23:46:04 +00:00
0d4992c170 [dynamo][easy] Use CONSTANT_MATCH for __code__ guard (#166445)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166445
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166437, #166444
2025-10-28 23:19:42 +00:00
b060e5c131 [dynamo] Move more FUNCTION_MATCH to CLOSURE_MATCH (#166444)
Closure match is more relaxed than function match which is id match

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166444
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166437
2025-10-28 23:19:42 +00:00
6d5e651a50 [user-streams] update stream context to use fork/join (#162903)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162903
Approved by: https://github.com/anijain2305
2025-10-28 23:12:05 +00:00
3cc5949dc2 Remove global pytree registration for blockmask (#166434)
The global pytree registration of `BlockMask` was added in https://github.com/pytorch/pytorch/pull/166045

In general, people assume `BlockMask` is a leaf, so the global registration could lead to unexpected failures when calling `tree_map()` on a `BlockMask`, since it will now flatten all the way down.

Therefore, we remove the global registration but keep the `_flatten()` and `_unflatten()` classmethods. Users can easily do a local registration when needed, as sketched below.
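A local registration could look roughly like this (a sketch assuming the `_flatten`/`_unflatten` classmethods mentioned above; the registration API may differ by version):

```python
import torch.utils._pytree as pytree
from torch.nn.attention.flex_attention import BlockMask

# Opt in to flattening BlockMask only where it is actually wanted.
pytree.register_pytree_node(
    BlockMask,
    BlockMask._flatten,
    BlockMask._unflatten,
)
```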

in pytorch
```
python test/distributed/tensor/test_dtensor_export.py -k test_flex_attention_dtensor_export
```

in torchtitan
```
python -m tests.integration_tests.run_tests ./outputs --test_suite features --ngpu 8
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166434
Approved by: https://github.com/wwwjn
2025-10-28 23:11:52 +00:00
f167fd09fa [annotation] Override metadata on regenerated node in functional mode (#166200)
Fixes #165810

If we regenerate a node during functionalization, we override the "stack_trace", "custom", and "seq_nr" metadata of the regenerated node with the node meta of the original node.

```
python test/functorch/test_aot_joint_with_descriptors.py -k test_preserve_annotate_replay_view
python test/functorch/test_aotdispatch.py TestAOTAutogradWithDynamo.test_duplicated_arguments_on_tensor_overlap
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166200
Approved by: https://github.com/bdhirsh
2025-10-28 22:59:39 +00:00
68b3984b77 [xpu][test] Enable skipped SparseAdam UTs (#166375)
With `SparseAdam` now correctly supported on Intel GPU, the previously disabled UTs can be enabled.
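A small sketch of what these UTs exercise (requires an XPU-enabled build):

```python
import torch

emb = torch.nn.Embedding(10, 4, sparse=True).to("xpu")
opt = torch.optim.SparseAdam(emb.parameters(), lr=0.1)
loss = emb(torch.tensor([1, 2], device="xpu")).sum()
loss.backward()  # sparse=True yields sparse gradients
opt.step()
```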
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166375
Approved by: https://github.com/Skylion007, https://github.com/janeyx99
2025-10-28 22:49:25 +00:00
a1eb6b5538 [dynamo][guards] Do not guard on the queue_callback (#166437)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166437
Approved by: https://github.com/xmfan
2025-10-28 22:37:38 +00:00
f36f372acc bwd pass (#164504)
**Summary**
This implements the backward pass for the Varlen API and registers `_varlen_attn()` as a custom op.

**Benchmarking**

To benchmark, we compare runtime and TFLOPs against the current SDPA approach with padding.

Settings:

- 1 H100 machine
- `batch_size=8`, `max_seq_len=2048`, `embed_dim=1024`, `num_heads=16`
- dtype `torch.bfloat16`
- `is_causal=False`
- for variable length, we set sequences to be random multiples of 64 up to `max_seq_len`
- 100 runs

|        | Variable Length API | SDPA     |
|--------|--------------------|----------|
| Runtime | 0.8189142608642578 ms       | 3.263883056640625 ms  |
| TFLOPs | 268.652       | 158.731  |

We can see that runtime for Varlen is >3x faster

**Testing**

Run `python test/test_varlen_attention.py` for unit tests where we verify basic functionality and confirm numerical match between varlen gradients vs SDPA.

For custom op testing, `test_custom_op_registration` uses logging mode to verify that `_varlen_attn()` was called and tests with `torch.compile`. `test_custom_op_compliances` uses `torch.library.opcheck()` to verify.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164504
Approved by: https://github.com/drisspg
2025-10-28 22:35:11 +00:00
346d4e2e4b Update (base update)
[ghstack-poisoned]
2025-10-28 15:34:12 -07:00
edfd614820 Update
[ghstack-poisoned]
2025-10-28 15:34:12 -07:00
d9483d4c8d [dynamo] Clean up assert in dynamo [3/N] (#165903)
Some previous PRs have been merged. This PR targets some **assert** statements that users can trigger, where it may be better to turn them into graph breaks. Correct me if there are any problems.

* ->#165903(Clean up for graph break)
* #165745
* #165430

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165903
Approved by: https://github.com/williamwen42

Co-authored-by: William Wen <william.wen42@gmail.com>
2025-10-28 22:29:35 +00:00
d82cc9dcf9 Update (base update)
[ghstack-poisoned]
2025-10-28 15:14:24 -07:00
cc52561455 Update
[ghstack-poisoned]
2025-10-28 15:14:24 -07:00
fea819ed08 added type annotation to _NoParamDecoratorContextManager.__new__ (#166414)
Fixes #166413

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166414
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-10-28 21:59:20 +00:00
84a2715d34 [dynamo] Revert C++-fying of symbolic shape guards (#166427)
Moving symbolic shape guards to C++ causes compile-time issues. This basically boils down to a tradeoff question.

For models that have a large number of dynamic shape guards, this flag will help reduce guard latency. But most models have very few dynamic shape guards, so their guard latency is small anyway; those models would still see a high compile-time hit from calling gcc during compilation.

So a good default value seems to be False. We can write a doc to give guidance on reducing guard latency.
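For the minority of models with many dynamic-shape guards, opting back in could look like this (the flag name is my assumption from the PR context):

```python
import torch

torch._dynamo.config.enable_cpp_symbolic_shape_guards = True
```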

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166427
Approved by: https://github.com/zou3519
2025-10-28 21:57:31 +00:00
572cc12b42 Move MaskPartial to placement_types to improve discoverability (#164414)
Had trouble finding this one myself in #163030.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164414
Approved by: https://github.com/ezyang
2025-10-28 21:56:02 +00:00
1fdef664a5 Revert "[Pytorch] Update Kineto Submodule (#166317)"
This reverts commit be283297100ab86123e74b7a8372995d32b140c8.

Reverted https://github.com/pytorch/pytorch/pull/166317 on behalf of https://github.com/jeffdaily due to ROCm CI was clean, but post-merge ROCm failures showed up ([comment](https://github.com/pytorch/pytorch/pull/166317#issuecomment-3458665809))
2025-10-28 21:55:38 +00:00
08ae55021e support batch size=0 for flash attention (#166318)
Fixes #165944

**Summary**

Today, if we attempt to run flash attention with batch_size 0, we get error `Runtime Error: batch size must be positive`. This PR fixes this by returning early with empty tensors in the fwd and bwd.
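A minimal sketch of the newly supported case (assumes a CUDA device where flash attention is available):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = k = v = torch.randn(0, 8, 128, 64, device="cuda", dtype=torch.float16)
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([0, 8, 128, 64])
```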

**Test plan**
`python test/test_transformers.py -k test_scaled_dot_product_attention` - added case for batch_size=0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166318
Approved by: https://github.com/drisspg
2025-10-28 21:53:48 +00:00
6dc807e0ac Update (base update)
[ghstack-poisoned]
2025-10-28 14:01:56 -07:00
45f4cc2da3 Update
[ghstack-poisoned]
2025-10-28 14:01:56 -07:00
df8ecde100 Update (base update)
[ghstack-poisoned]
2025-10-28 13:39:26 -07:00
852184c383 Update
[ghstack-poisoned]
2025-10-28 13:39:26 -07:00
551921d484 Change t.is_cuda to t.device.type == 'cuda' in torch/utils/viz (#156418)
Fixes #156417

Unlike `.is_cuda` the property `.device` is supported by `ShardedTensor`.
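The pattern change, in short:

```python
import torch

t = torch.randn(2, 2)
# `.device` is available on ShardedTensor as well, unlike `.is_cuda`
if t.device.type == "cuda":  # was: if t.is_cuda:
    print("on cuda")
```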
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156418
Approved by: https://github.com/mikaylagawarecki

Co-authored-by: Alexander Zhipa <azzhipa@amazon.com>
2025-10-28 20:34:14 +00:00
b5189e269e NVFP4 grouped gemm support via. FBGEMM kernels (#166308)
Summary:

* Add NVFP4 (1x16 block e4m3, tensor-wise fp32) scaled grouped gemm
* Extend testing to add nvfp4 support

Test Plan:

```
pytest -svv -k grouped test/test_scaled_matmul_cuda.py
```

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166308
Approved by: https://github.com/ngimel
2025-10-28 20:32:53 +00:00
3895ce093f [inductor] add in-kernel nan-check (#166008)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166008
Approved by: https://github.com/eellison
2025-10-28 20:19:10 +00:00
8aa087a29d [ez] Fix print for failing test when entire file fails (#166420)
It was previously printing "FAILED CONSISTENTLY: ul" since the value was null.
This changes it so that it prints the test_file, by moving some of the logic that checks for this earlier.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166420
Approved by: https://github.com/Skylion007
2025-10-28 20:13:58 +00:00
7379972cc0 Revert "[Inductor] Naive foreach autotune support (#162053)"
This reverts commit cdb60e44eb528bf02c6bb2d7e384298283e755ca.

Reverted https://github.com/pytorch/pytorch/pull/162053 on behalf of https://github.com/xmfan due to Compile time regression ([comment](https://github.com/pytorch/pytorch/pull/162053#issuecomment-3458252331))
2025-10-28 20:01:54 +00:00
b903018c26 [CD] Windows builds migrate python 3.14rc1->3.14.0 (#166408)
Python 3.14 was released, hence we can use official release version now
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166408
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-10-28 19:52:38 +00:00
21b48f8dfa Fixes torch.compile(nn.ModuleList()) changes bool() behavior (#159208)
Fixes #159139
## The Cause

The bug occurs because the OptimizedModule wrapper in torch._dynamo.eval_frame doesn't call the len method. This causes Python's bool() check to fall back to the default object truthiness (always True) instead of correctly evaluating containers with len() == 0 as False.
## The Fix

A very easy fix: I added a `__len__` method to the `OptimizedModule` class in `torch._dynamo.eval_frame` that delegates the call to the original module:
```python
def __len__(self):
    """
    Proxy the len() call to the original module to fix truthiness checks.
    """
    return len(self._orig_mod)
```
This successfully fixes the issue. The script now works as expected.
## Reproduction Script
```python
import torch
import torch.nn as nn

# Create an empty nn.ModuleList
original = nn.ModuleList()

# Compile it using torch.compile
compiled = torch.compile(original)

# Compare their boolean evaluations
print(f"bool(original): {bool(original)}")
print(f"bool(compiled): {bool(compiled)}")

# Trigger failure if they differ
assert bool(original) == bool(compiled), "BUG: truthiness behavior mismatch after compilation"
```
## Output

bool(original): False
bool(compiled): False

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159208
Approved by: https://github.com/andrewboldi, https://github.com/Lucaskabela

Co-authored-by: pushkar-hue <pushkarsharma.rtm@gmail.com>
Co-authored-by: Lucas Kabela <lucasakabela@gmail.com>
2025-10-28 19:21:24 +00:00
009ea77234 Remove not needed code path. (#166278)
I accepted a PR that added this code, but re-examining it now, I'm questioning the approach. It seems like we're working around an issue with the inductor generating incorrect sizes. A comment suggests it might be related to unsqueezed u0 values. Removing this code didn't cause any failures, so I'll take it out and address the root issue if it arises.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166278
Approved by: https://github.com/Lucaskabela
2025-10-28 19:03:22 +00:00
0e46a10aa7 [ONNX] Warn when it's training (#166412)
Fixes #166163

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166412
Approved by: https://github.com/justinchuby
2025-10-28 19:01:05 +00:00
a25818cf7e Fix image display on pypi project description section (#166404)
Fixes https://github.com/pytorch/pytorch/issues/165559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166404
Approved by: https://github.com/malfet, https://github.com/Skylion007, https://github.com/Camyll
2025-10-28 18:58:24 +00:00
e3e93c7107 [MPS] Fix random in-place ops on non-contiguous tensors (#165267)
Random in-place operations (normal_, uniform_, exponential_, bernoulli_, random_) were silently failing on non-contiguous tensors on macOS < 15.0.

* Added needsGather check and scatter-back logic to handle non-contiguous output tensors, following the pattern used in PointwiseOps.

* Adds test to confirm these now work
* Remove pre-macOS15 xfail for test_Dropout

Fixes #165257 and #124029

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165267
Approved by: https://github.com/kulinseth, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-10-28 18:43:22 +00:00
1abfa5f70b [EZ][MPS] Improve distribution error checking (#166425)
Essentially not allow ops on self-overlapping outputs, by adding
`at::assert_no_internal_overlap(self);` check that already used in CPU
and CUDA builds, see
895795f07c/aten/src/ATen/native/DistributionTemplates.h (L366)

This fixes `test_error_inputs_bernoulli_mps`

Should be landed ahead of https://github.com/pytorch/pytorch/pull/165267
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166425
Approved by: https://github.com/Skylion007, https://github.com/seemethere
2025-10-28 18:42:12 +00:00
687c15c0b3 [AOTI][BE] Change test_aoti_inference to one-pass build (#164277)
Summary: To fix https://github.com/pytorch/pytorch/issues/159400. Currently, test_aoti_abi_check and test_aoti_inference need to be built in two passes: first build pytorch using the regular `python setup.py develop`, and then build with `CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop`. This is cumbersome. Fix by rewriting CMakeLists.txt for test_aoti_inference as a one-pass build which runs AOTI to compile models at test time. Also update the CI test script to get rid of the two-pass build. Since test_aoti_abi_check is not AOTI specific, we make it no longer guarded by BUILD_AOT_INDUCTOR_TEST.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164277
Approved by: https://github.com/janeyx99
2025-10-28 17:43:22 +00:00
895795f07c [ROCm][CI] forward fix kineto submodule bump (#166421)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166421
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-28 17:40:23 +00:00
2dc56456cb refactor: pull _replace_node common functionality out of Scheduler.finalize_multi_template_buffers (#163368)
Pull the `_replace_node` function out of `Scheduler.finalize_multi_template_buffers()`. This is needed by the next PR (#163369). As part of this, also pull `_replace_operation_buffer()` up to top level, since it needed no self references.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163368
Approved by: https://github.com/PaulZhang12
2025-10-28 17:21:52 +00:00
8110ce02a2 Add a skill for writing skills (#166266)
Apparently, if you just ask Claude to write a skill, it doesn't follow the correct rules. So this one is just the official docs for skills.

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166266
Approved by: https://github.com/Skylion007
ghstack dependencies: #166265
2025-10-28 16:49:27 +00:00
43c30f607e Use correct layout convention for skills (#166265)
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166265
Approved by: https://github.com/Skylion007
2025-10-28 16:49:27 +00:00
5ebf74a655 [2/2] Move scaled_mm routines to their own file (#166314)
Summary:

* Further simplify `ATen/native/cuda/Blas.cpp` by moving `_scaled_mm`,
  `_scaled_mm_v2` and supporting methods to a new file,
  `ATen/native/cuda/ScaledBlas.cpp`

Test Plan:

```
pytest -svv test/test_matmul_cuda.py
pytest -svv test/test_scaled_matmul_cuda.py
```

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166314
Approved by: https://github.com/eqy
ghstack dependencies: #166313
2025-10-28 16:35:32 +00:00
acd936cc1a [1/2] Split cublasCommonArgs into its own file (#166313)
Summary:

* Factor out `cublasCommonArgs` struct
* Necessary for factoring out scaled mm routines

Test Plan:

```
pytest -svv test/test_matmul_cuda.py
pytest -svv test/test_scaled_matmul_cuda.py
```

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166313
Approved by: https://github.com/eqy, https://github.com/Skylion007
2025-10-28 16:35:32 +00:00
a4a0378e6b Revert "[cuDNN] Smoke-test runtime cuDNN version matches compile time version in CI (#165922)"
This reverts commit 2a5f87decf34b3d0ea7670238e2fd4620ed19e9f.

Reverted https://github.com/pytorch/pytorch/pull/165922 on behalf of https://github.com/atalman due to cudnn update started to fail, see https://github.com/pytorch/pytorch/pull/165913#issuecomment-3457293475 ([comment](https://github.com/pytorch/pytorch/pull/165922#issuecomment-3457389406))
2025-10-28 16:29:29 +00:00
ac841267a1 [ROCm] skip AsyncTP test class as AsyncTP is not supported on ROCm (#166316)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166316
Approved by: https://github.com/jeffdaily
2025-10-28 16:23:46 +00:00
0eacd934bc Revert "Update cuDNN 9.10.2 in Manylinux 2.28 Docker files (#165913)"
This reverts commit 840d63c12d255dac1ae3c5e442c6ea6eb87a7256.

Reverted https://github.com/pytorch/pytorch/pull/165913 on behalf of https://github.com/clee2000 due to I think something here is causing CI tests to segfault at exit on cuda, ex [GH job link](https://github.com/pytorch/pytorch/actions/runs/18857880394/job/53811917713) [HUD commit link](9a91486e45) says no tests failed but it segfaulted afterwards.  I can't tell if it's because of this change, or an unpinned dependency in docker that got triggered by this.  Note to self, would have been bad TD except trunk didn't run either ([comment](https://github.com/pytorch/pytorch/pull/165913#issuecomment-3457293475))
2025-10-28 16:11:07 +00:00
5016e7b2eb [FlexAttention] Add mechanism to get optimal autotune decision (#165817)
Script: https://github.com/meta-pytorch/attention-gym/pull/169

Feels directionally okay, but there is some bikeshedding: this could be quite prone to key collisions, depending on mask-mod and score-mod changes and the simple cache key.

Usecase: https://github.com/meta-pytorch/attention-gym/pull/169

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165817
Approved by: https://github.com/Chillee
2025-10-28 15:50:12 +00:00
544b443ea1 [CD] Upgrade to CUDA 13.0.2 for nightly binaries (#165470)
13.0.U2 is posted; adding it to nightlies.
Why we want to upgrade: CUDA 13.0.U2 includes a new cuBLAS release that
1. Enables opt-in fixed-point emulation for FP64 matmuls (D/ZGEMM), which improves performance and power efficiency.
2. Improves performance on NVIDIA [DGX Spark](https://www.nvidia.com/en-us/products/workstations/dgx-spark/) for FP16/BF16 and FP8 GEMMs.
3. Adds BF16x9 FP32 emulation support for SYRK and HERK routines.
Reference: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cublas-release-13-0-update-2

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165470
Approved by: https://github.com/atalman
2025-10-28 15:14:43 +00:00
3041ede082 Improve eig tests in preparation for new eig backends (#166322)
### Summary
Improves validation of `torch.linalg.eig` results by verifying the eigen decomposition identity **A v − v λ = 0**.

### Motivation
Eigenvectors are not unique, and numerical differences between backends (cuSOLVER, MAGMA, CPU)
can cause false test failures. This PR replaces direct elementwise comparisons with a mathematical
identity check, improving robustness across devices.

### Details
- Introduces `fulfills_eigen_decomposition_identity()` in `test_eig_compare_backends()` to validate the eigen equation.
- Uses CPU matmul for high-precision verification.
- Handles zero-sized matrices explicitly.
- Tolerances derived from numerical comparisons between cuSOLVER and NumPy.
  See discussion: [dev-discuss.pytorch.org link](https://dev-discuss.pytorch.org/t/cusolver-dnxgeev-faster-cuda-eigenvalue-calculations/3248/6)
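A minimal sketch of the identity check from the first bullet (the real helper lives in the test suite; tolerances here are illustrative):

```python
import torch

def fulfills_eigen_decomposition_identity(A, w, V, atol=1e-4):
    # Verify A @ V - V @ diag(w) ~= 0, computed on CPU in the complex
    # result dtype for high-precision verification.
    A = A.cpu().to(V.dtype)
    residual = A @ V.cpu() - V.cpu() * w.cpu().unsqueeze(-2)
    return torch.allclose(residual, torch.zeros_like(residual), atol=atol)

A = torch.randn(4, 4)
w, V = torch.linalg.eig(A)
assert fulfills_eigen_decomposition_identity(A, w, V)
```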

### Impact
- Improves test stability and correctness across eig backends.
- No change to public API.
- All tests pass; lintrunner reports no issues.
- Enables introduction of new eig backends without false test failures.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166322
Approved by: https://github.com/lezcano
2025-10-28 14:42:47 +00:00
34d6ef7022 Update gm.print_readable to include Annotation (#165397)
Sample output
```
[rank0]:        # Annotation: {'compile_with_inductor': 'flex_attention'} File: /data/users/bahuang/pytorch/torch/nn/attention/flex_attention.py:1490 in flex_attention, code: out, lse, max_scores = flex_attention_hop(
[rank0]:        score_mod_2 = self.score_mod_2
[rank0]:        mask_fn_2 = self.mask_fn_2
[rank0]:        flex_attention_1 = torch.ops.higher_order.flex_attention(xq_5, xk_5, xv_3, score_mod_2, (2048, 2048, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_indices, 128, 128, mask_fn_2), 0.25, {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False}, (), (g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___mask_mod___closure___0_cell_contents,));  xq_5 = xk_5 = xv_3 = score_mod_2 = mask_fn_2 = None
[rank0]:        out_2: "bf16[8, 4, 2048, 16]" = flex_attention_1[0];  flex_attention_1 = None
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165397
Approved by: https://github.com/yushangdi, https://github.com/anijain2305, https://github.com/mlazos
2025-10-28 13:54:38 +00:00
110efe4df4 Revert "[inductor][choices] lookup table choices 1/3 (#164978)"
This reverts commit b44423bbb43860c1e340cbebc9d101dc18031ecb.

Reverted https://github.com/pytorch/pytorch/pull/164978 on behalf of https://github.com/atalman due to failing internal test on newly added tests: Test when there's no lookup table entry with different autotune modes ([comment](https://github.com/pytorch/pytorch/pull/164978#issuecomment-3456400126))
2025-10-28 13:12:55 +00:00
e137cd0a10 docs: fix typos (#164879)
Correct typos in the comments

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164879
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos, https://github.com/cyyever
2025-10-28 12:00:36 +00:00
be28329710 [Pytorch] Update Kineto Submodule (#166317)
Summary: Update Submodule

Test Plan: CI

Differential Revision: D85579130

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166317
Approved by: https://github.com/Skylion007
2025-10-28 10:41:17 +00:00
85a7c745aa [triton][nativert] Add num_cpu_threads for triton-cpu (#166255)
Summary:
The new triton-cpu exposes `num_cpu_threads`, which, like `num_warps`, is auto-tunable. This diff adds `num_cpu_threads` to NativeRT.

Differential Revision: D85515240

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166255
Approved by: https://github.com/XueningXu
2025-10-28 08:40:04 +00:00
eeed168251 Update (base update)
[ghstack-poisoned]
2025-10-28 01:08:10 -07:00
8cd9eb6891 Update
[ghstack-poisoned]
2025-10-28 01:08:10 -07:00
32fe4f681e [dynamo] fix keyerror in resume_execution (again) (#166040)
Fixes https://github.com/pytorch/pytorch/issues/166176

The error I attempted to fix in https://github.com/pytorch/pytorch/pull/162318 was still appearing internally.

Surprised that this wasn't caught anywhere 😰

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166040
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166036
2025-10-28 07:04:29 +00:00
ebb2b2e894 [dynamo] fix store attr graph break in with block (#166036)
Fixes https://github.com/pytorch/pytorch/issues/166033

Differential Revision: [D85198055](https://our.internmc.facebook.com/intern/diff/D85198055)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166036
Approved by: https://github.com/Lucaskabela
2025-10-28 07:04:29 +00:00
13413b3b07 [AMP][Refactor] Autocast dtype handling to simplify device-specific c… (#165221)
This PR refactors the autocast context manager in autocast_mode.py to simplify and centralize the logic for checking supported dtypes for each device. The previous implementation repeated similar checks for multiple device types. Now, a single mapping device_supported_dtypes is used to associate device types with their supported dtypes, and the validation logic is unified.
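
A minimal sketch of the mapping-based validation idea (the device list and dtypes shown are illustrative only):
```python
import warnings
import torch

device_supported_dtypes = {
    "cpu": (torch.bfloat16, torch.float16),
    "cuda": (torch.bfloat16, torch.float16, torch.float32),
    # ... one entry per device type instead of per-device if/elif branches
}

def _check_autocast_dtype(device_type: str, dtype: torch.dtype) -> bool:
    supported = device_supported_dtypes.get(device_type, ())
    if dtype not in supported:
        warnings.warn(
            f"In {device_type} autocast, but the target dtype is not supported. "
            "Disabling autocast."
        )
        return False
    return True
```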

**The former PR #163446 was merged but reverted due to failed CI test on `openreg` related tests.**

This PR additionally makes small modifications to some test assertions so the CI tests pass. CI failed because an assertion expected an exact match on the error message. For example:
```
File "/var/lib/jenkins/workspace/test/cpp_extensions/open_registration_extension/torch_openreg/tests/test_autocast.py", line 9, in test_autocast_with_unsupported_type
    with self.assertWarnsRegex(
        AssertionError: "In openreg autocast, but the target dtype torch.float32 is not supported." does not match "In openreg autocast, but the target dtype is not supported. Disabling autocast."
```

Sorry for the inconvenience again.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165221
Approved by: https://github.com/albanD
2025-10-28 06:21:29 +00:00
5d0b3e28dc [inductor] generate fused rms/layer norm bwd (#165370)
RMS/Layer norm backward generates 2 kinds of reductions:
- the reduction computing dx, which reduces across the hidden dimension (in the context of a transformer)
- the reduction computing dw/db, which reduces across the BxT (batch size, sequence length) dimension.

These 2 sets of reductions have common input buffers, but Inductor cannot fuse them because of their different loop orders.
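
For intuition, a plain PyTorch reference of RMSNorm backward shows the two reduction patterns (shapes and naming are illustrative; x and dy are (B*T, H), w is (H,)):
```python
import torch

def rms_norm_bwd_reference(x, dy, w, eps=1e-6):
    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    x_hat = x * inv_rms
    # Inner reduction: per row, across the hidden dimension H -> needed for dx.
    row_term = (dy * w * x_hat).sum(dim=-1, keepdim=True)
    dx = inv_rms * (dy * w - x_hat * row_term / x.shape[-1])
    # Outer reduction: across the B*T dimension -> needed for dw.
    dw = (dy * x_hat).sum(dim=0)
    return dx, dw
```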

There are multiple sources of custom kernels that implement fused versions of such kernels (Liger-Kernel, quack, Paul Zhang's internal post). This PR enables Inductor to generate such kernels automatically.

The generated kernel is very similar to 33924d20b6/src/liger_kernel/ops/rms_norm.py (L114) .

To keep the implementation simple and performant, we enable such fusion only if the inner reduction (computing dx) is a persistent reduction. This should be true for representative inputs. Persistent reduction is critical for the perf here, to make sure a loaded tensor does not need to be reloaded.

To make sure the inner reduction (computing dx) and the outer reductions (computing dw/db) are fusible, the PR does the following:
1. convert the outer reductions to pointwise by replacing the 'reduction' & 'store_reduction' nodes with a new type of node, 'partial_accumulate'. The new node collects the reduction type, buffer name, input of the reduction etc., which is essential for proper codegen.
2. do loop reordering (relying on the earlier loop-ordering-after-fusion work) to reorder the loops of the converted pointwise kernel so it can be fused with the inner reduction
3. add any epilogues needed at the end. E.g. the outer reduction may be followed by a division for a mean, or by a downcast if dw/db is in low precision (fp16/bf16).

Some early benchmarking on H100 shows about a 2X speedup for both RMSNorm and LayerNorm backward for the shape (1152 * 500, 384) used in some internal model. Note that I manually disabled split reduction in this benchmarking, since otherwise the fusion would currently be skipped. The next PR will make the mix-order reduction compose better with split reduction.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165370
Approved by: https://github.com/jansel
ghstack dependencies: #166204
2025-10-28 05:53:52 +00:00
9139368b64 [PyTorch] Use events from pool in copy_device_to_device (#165647)
Summary: In this diff, we add an event pool so that we don't have to create/destroy events all the time; instead we re-use events from the pool.
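
A toy sketch of the idea (the actual change is in C++ inside copy_device_to_device; the class and method names here are made up):
```python
from collections import deque
import torch

class CudaEventPool:
    """Re-use CUDA events instead of creating/destroying one per copy."""

    def __init__(self):
        self._free = deque()

    def acquire(self) -> torch.cuda.Event:
        # Hand out a pooled event when available, otherwise create a new one.
        return self._free.popleft() if self._free else torch.cuda.Event()

    def release(self, event: torch.cuda.Event) -> None:
        self._free.append(event)
```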

Test Plan: contbuild

Differential Revision: D84685495

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165647
Approved by: https://github.com/bbus
2025-10-28 05:19:05 +00:00
02095cc09d [dynamo] Dont guard on getset descriptors for torch_function (#166346)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166346
Approved by: https://github.com/mlazos
ghstack dependencies: #166329
2025-10-28 04:33:56 +00:00
65868156c6 [dynamo] Guard selectively on the torch APIs (#166329)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166329
Approved by: https://github.com/Lucaskabela
2025-10-28 04:33:56 +00:00
f93ea7dab1 [export] Update dynamo_graph_capture_for_export to return GraphModule. (#166091)
Make dynamo_graph_capture_for_export return a more compatible GraphModule object which is closer to the original behavior of dynamo.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166091
Approved by: https://github.com/tugsbayasgalan
2025-10-28 04:23:28 +00:00
a77f5d9a00 [ROCm] Use a ROCm version string without hash. (#166336)
Fixes #166068

Use the ROCm version string that does not contain a hash. The string is set in LoadHIP.cmake.

Tested on repro provided by reporter.

For a ROCm 7.0 docker container, we get `7.0.0`.

For a ROCm 7.0.2 docker container, we get `7.0.2`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166336
Approved by: https://github.com/jeffdaily
2025-10-28 03:53:55 +00:00
ff46d5a79b [Inductor][Triton][FP8] Support deepseek-style scaling in Inductor (#164404)
Summary:
Support deepseek-style scaling in Inductor Triton for FP8 GEMMs. DeepSeek-style scaling is a colloquial term for a fine-grained mixed precision framework using FP8 to train [Deepseek-V3](https://arxiv.org/pdf/2412.19437), DeepSeek AI's recent MoE (Mixture of Experts) model. DeepSeek-style scaling effectively extends the dynamic range of FP8 by mitigating dequantization overhead under increased-precision accumulation, which is key to achieving more accurate FP8 GEMM results.

DeepSeek-style scaling on matmul `A @ B` leverages two different types of scaling strategies to preserve a balance between numerical stability and training efficiency:
- Activations (input tensor `A`): tile-wise (1x128 across shape `(M, K)`)
- Weights (input tensor `B`): block-wise (128x128 across shape `(N, K)`)

This diff enables Inductor users to replicate past successes with deepseek-style scaling and achieve higher numerical stability while increasing training efficiency.
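
A rough sketch of how the per-tile and per-block scales are laid out (the constant and helper names are illustrative; the actual kernels compute this inside Triton):
```python
import torch

FP8_E4M3_MAX = 448.0  # max finite magnitude of float8_e4m3fn

def activation_scales(a: torch.Tensor, tile_k: int = 128) -> torch.Tensor:
    # Tile-wise: one scale per 1 x 128 tile of the (M, K) activation.
    m, k = a.shape
    return a.view(m, k // tile_k, tile_k).abs().amax(dim=-1) / FP8_E4M3_MAX  # (M, K // 128)

def weight_scales(b: torch.Tensor, block: int = 128) -> torch.Tensor:
    # Block-wise: one scale per 128 x 128 block of the (N, K) weight.
    n, k = b.shape
    blocks = b.view(n // block, block, k // block, block)
    return blocks.abs().amax(dim=(1, 3)) / FP8_E4M3_MAX  # (N // 128, K // 128)
```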

NOTE: Block-wise 128x128 scaling is only supported in CUDA 12.9+; therefore, deepseek-style scaling is currently unsupported in `fbcode` (CUDA 12.4). Use OSS PyTorch to run deepseek-style scaling.

NOTE: Accuracy for FP8 is unstable, even with high tolerances, which is why TritonBench benchmarks are unlikely to be accurate against a `torch` implementation.

Test Plan:
In OSS PyTorch, run
```
TORCHINDUCTOR_CACHE_DIR=~/personal/cache_dir_inductor CUDA_LAUNCH_BLOCKING=1 TORCH_USE_CUDA_DSA=1 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 python run.py --op fp8_gemm --only torch_fp8_gemm,pt2_fp8_gemm --metrics tflops,accuracy --m 4096 --n 768 --k 512 --output="{output_dir}/deepseek_bench.csv" --scaling_deepseek --atol=1e-2 --rtol=0.5 2>&1 | tee ~/personal/deepseek_style/deepseek_bench.log
```

Differential Revision: D83609850

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164404
Approved by: https://github.com/slayton58
2025-10-28 03:38:54 +00:00
f452edd782 [dynamo, 3.14] fix misc. bugs to get most dynamo unittests passing locally in 3.14 (#164631)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164631
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
2025-10-28 03:24:22 +00:00
ea698e8bfc [dynamo, nested graph breaks] disallow nested graph breaks in HOPs (#166016)
As discussed offline with @ydwu4, we should not allow nested graph breaks in HOPs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166016
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166013, #166015, #165808, #165809
2025-10-28 03:03:38 +00:00
7f7a28046b [dynamo, nested graph breaks] disable nested graph breaks in generators; enable nested_graph_breaks in test_ctx_manager.py and test_generator.py (#165809)
Generators should not support nested graph breaks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165809
Approved by: https://github.com/Lucaskabela, https://github.com/guilhermeleobas
ghstack dependencies: #166013, #166015, #165808
2025-10-28 03:03:37 +00:00
d8283a317a [dynamo, nested graph breaks] fix RETURN_VALUE tx skipping in nested graph breaks (#165808)
Previously, we would completely skip building and calling any resume function if the leaf frame's resume instruction was RETURN_VALUE/RETURN_CONST. Now, we only skip building/calling resume functions for frames that are resuming on RETURN_VALUE.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165808
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166013, #166015
2025-10-28 03:03:37 +00:00
e0ca3049c0 [dynamo, nested graph breaks] remove _dynamo.utils.counter patch on inlined tx'es (#166015)
This `patch.dict(counters, ...` appears to be ancient code that doesn't really seem to be doing anything? It causes issues in nested graph breaks because the patch cleanup clears out the record of the nested graph break. Removing the patch to see if it's even needed in the first place.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166015
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166013
2025-10-28 03:03:37 +00:00
8417981c96 [dynamo, nested graph breaks] add TestCaseWithNestedGraphBreaks subclass (#166013)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166013
Approved by: https://github.com/Lucaskabela
2025-10-28 03:03:37 +00:00
06e71c8558 [hop] local_map MoE: fix unbacked symints during tracing and symint activations order in the wrapper (#165551)
This PR fixes 2 issues with local_mapping token-choice moe. Splits from the fw token dispatch result in tensors with unbacked shapes and these unbacked shapes are fully contained in the a2as, and should not leak outside of the joint graph. The HOP body fw and bw are expected to coerce back to static shapes (due to adding it with shared experts output) before returning.
```python
routed_output: "bf16[u0 + u1 + u10 + u11 + u12 + u13 + u14 + u15 + u16 + u17 + u18 + u19 + u2 + u20 + u21 + u22 + u23 + u24 + u25 + u26 + u27 + u28 + u29 + u3 + u30 + u31 + u32 + u33 + u34 + u35 + u36 + u37 + u38 + u39 + u4 + u40 + u41 + u42 + u43 + u44 + u45 + u46 + u47 + u48 + u49 + u5 + u50 + u51 + u52 + u53 + u54 + u55 + u56 + u57 + u58 + u59 + u6 + u60 + u61 + u62 + u63 + u7 + u8 + u9, 2048]" = torch.ops.higher_order.autograd_function_apply(fwd_body_1, bwd_body_1, out_1, item, item_1, item_2, item_3, item_4, item_5, item_6, item_7, item_8, item_9, item_10, item_11, item_12, item_13, item_14, item_15, item_16, item_17, item_18, item_19, item_20, item_21, item_22, item_23, item_24, item_25, item_26, item_27, item_28, item_29, item_30, item_31, item_32, item_33, item_34, item_35, item_36, item_37, item_38, item_39, item_40, item_41, item_42, item_43, item_44, item_45, item_46, item_47, item_48, item_49, item_50, item_51, item_52, item_53, item_54, item_55, item_56, item_57, item_58, item_59, item_60, item_61, item_62, item_63, item_64, item_65, item_66, item_67, item_68, item_69, item_70, item_71, item_72, item_73, item_74, item_75, item_76, item_77, item_78, item_79, item_80, item_81, item_82, item_83, item_84, item_85, item_86, item_87, item_88, item_89, item_90, item_91, item_92, item_93, item_94, item_95, item_96, item_97, item_98, item_99, item_100, item_101, item_102, item_103, item_104, item_105, item_106, item_107, item_108, item_109, item_110, item_111, item_112, item_113, item_114, item_115, item_116, item_117, item_118, item_119, item_120, item_121, item_122, item_123, item_124, item_125, item_126, item_127, args_tensor_mask = [True, False, False, False], non_differentiable_idx = []);  fwd_body_1 = bwd_body_1 = out_1 = item = item_1 = item_2 = item_3 = item_4 = item_5 = item_6 = item_7 = item_8 = item_9 = item_10 = item_11 = item_12 = item_13 = item_14 = item_15 = item_16 = item_17 = item_18 = item_19 = item_20 = item_21 = item_22 = item_23 = item_24 = item_25 = item_26 = item_27 = item_28 = item_29 = item_30 = item_31 = item_32 = item_33 = item_34 = item_35 = item_36 = item_37 = item_38 = item_39 = item_40 = item_41 = item_42 = item_43 = item_44 = item_45 = item_46 = item_47 = item_48 = item_49 = item_50 = item_51 = item_52 = item_53 = item_54 = item_55 = item_56 = item_57 = item_58 = item_59 = item_60 = item_61 = item_62 = item_63 = item_64 = item_65 = item_66 = item_67 = item_68 = item_69 = item_70 = item_71 = item_72 = item_73 = item_74 = item_75 = item_76 = item_77 = item_78 = item_79 = item_80 = item_81 = item_82 = item_83 = item_84 = item_85 = item_86 = item_87 = item_88 = item_89 = item_90 = item_91 = item_92 = item_93 = item_94 = item_95 = item_96 = item_97 = item_98 = item_99 = item_100 = item_101 = item_102 = item_103 = item_104 = item_105 = item_106 = item_107 = item_108 = item_109 = item_110 = item_111 = item_112 = item_113 = item_114 = item_115 = item_116 = item_117 = item_118 = item_119 = item_120 = item_121 = item_122 = item_123 = item_124 = item_125 = item_126 = item_127 = None

# File: /home/xmfan/core/a/autoparallel/examples/example_ds3_local_map.py:777 in local_mapped_region, code: torch._check(routed_output.shape[0] == shape[0] * shape[1])
size_3 = routed_output.size()
getitem_139 = size_3[1];  size_3 = getitem_139 = None

# File: /home/xmfan/core/a/autoparallel/examples/example_ds3_local_map.py:779 in local_mapped_region, code: routed_output = routed_output.view(shape)
routed_output_1: "bf16[4, 6144, 2048]" = routed_output.view((4, 6144, 2048));  routed_output = None

# File: /home/xmfan/core/a/autoparallel/examples/example_ds3_local_map.py:781 in local_mapped_region, code: out = out.scatter_add(dim=1, index=token_indices_experts_sorted, src=routed_output)
out_3: "bf16[4, 1024, 2048]" = out_2.scatter_add(dim = 1, index = token_indices_experts_sorted_2, src = routed_output_1);  out_2 = token_indices_experts_sorted_2 = routed_output_1 = None
```

## 1. Unbacked symints contained within the HOP body

Based on 9b2974e812 and 36030e0315.

We disable proxy mode so that unbacked symints that are contained within the HOP subgraph aren't proxied:
```python
[rank0]: RuntimeError: u576 + u577 + u578 + u579 + u580 + u581 + u582 + u583 + u584 + u585 + u586 + u587 + u588 + u589 + u590 + u591 + u592 + u593 + u594 + u595 + u596 + u597 + u598 + u599 + u600 + u601 + u602 + u603 + u604 + u605 + u606 + u607 + u608 + u609 + u610 + u611 + u612 + u613 + u614 + u615 + u616 + u617 + u618 + u619 + u620 + u621 + u622 + u623 + u624 + u625 + u626 + u627 + u628 + u629 + u630 + u631 + u632 + u633 + u634 + u635 + u636 + u637 + u638 + u639 + 1 (140667108386064)is not tracked with proxy for <torch.fx.experimental.proxy_tensor.PythonKeyTracer object at 0x7fef9d44f950>
```
And we ensure that no unbacked symints leak outside of the region.

## 2. Saved symint activations

local_map is using the partitioned backward, and needs to follow the partitioner's desired ordering, this is the same order as AOTAutograd runtime wrapper uses in `_backward_prologue_functional` where we pass symints first: d2c82bafb7/torch/_functorch/_aot_autograd/runtime_wrappers.py (L1702-L1704)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165551
Approved by: https://github.com/bobrenjc93, https://github.com/bdhirsh
ghstack dependencies: #164780
2025-10-28 02:52:41 +00:00
a76b59cc45 [dynamo] local_map error message for reordered inputs (#164780)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164780
Approved by: https://github.com/mlazos
2025-10-28 02:52:41 +00:00
74336f8c77 Revert "[CD] Upgrade to CUDA 13.0.2 for nightly binaries (#165470)"
This reverts commit 5e769ff86780a7ffd561615dbf4b0defe80cfbb9.

Reverted https://github.com/pytorch/pytorch/pull/165470 on behalf of https://github.com/atalman due to Sorry reverting for now, to restore trunk health ([comment](https://github.com/pytorch/pytorch/pull/165470#issuecomment-3454166879))
2025-10-28 02:21:48 +00:00
236ce736a1 [reland] Add provenance to inductor IR nodes created after graph.run (#164255) (#164746)
Summary:

as title

- Some IR nodes are created during `finalize_multi_template_buffers()` in Scheduler. This PR adds provenance (`origin_node` and `origins`) for those nodes.

- Extract `assign_origin_node` function

Test Plan:
```
buck run mode/opt fbcode//caffe2/test/inductor:provenance_tracing -- -r  test_deferred_triton_kernels
```

Differential Revision: D83979975

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164746
Approved by: https://github.com/mlazos
2025-10-28 02:20:20 +00:00
17bdb232e1 [GR v0] AOTI Enablement - Fix GR model AOTI inplace update by skipping empty named (#165970) (#166037)
Summary:

Add a gflag to allow us to skip empty constant named parameters during dense loading. In [vm_parameters.py](https://fburl.com/code/7xr9ihwy), there is a constant _empty_tensor parameter used by the model. This constant parameter is skipped in the XL weights during model publish because it is empty. That later breaks model in-place update, because the parameter is reported by the AOTI container but cannot be found among the model merge weights. This diff allows us to solve the problem.

Test Plan: Verified inplace update in job https://www.internalfb.com/vanguard/serving_test_cases/1165842932095688

Reviewed By: muchulee8, joannec3634

Differential Revision: D85082330

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166037
Approved by: https://github.com/muchulee8, https://github.com/jcwchen
2025-10-28 01:50:36 +00:00
add37bacda [MPS] Better error checking for FFT ops (#166272)
Namely:
- error out rather than crash when the out dtype is of an unexpected type
- resize the output tensor to the expected size in the `_out` operation, to prevent a crash when a tensor of an unexpected size is passed
- preserve symbolic shapes whenever possible

Test plan: Run `python test_ops.py -v -k test_out_warning_fft_hfft_mps` for the MPS device; without this change it crashes with `Error: Invalid KernelDAG, equalShape for destination failed'`. Run `python ../test/test_ops.py -v -k test_dtypes_stft_mps`; without this change it crashes with `A complex mlir::Type does not have a corresponding complex MPSDataType"` when the input dtype is bfloat16.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166272
Approved by: https://github.com/kulinseth
2025-10-28 01:31:47 +00:00
1425b40f29 [inductor] Fix argmin/argmax returning incorrect indices for non-contiguous tensor (#165983)
Fixes #163929

Fixes argmin/argmax operations to return correct logical indices instead of physical memory offsets when applied to transposed/permuted tensors.  When `argmin()` or `argmax()` is called on a transposed tensor, Inductor was returning physical memory indices instead of logical row-major indices. This caused incorrect results that don't match eager mode behavior.
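
A small repro of the logical-vs-physical distinction (eager semantics shown; the exact value previously returned by the compiled kernel depends on the layout):
```python
import torch

x = torch.tensor([[1.0, 9.0],
                  [5.0, 3.0]])
xt = x.t()          # non-contiguous transposed view: [[1., 5.], [9., 3.]]
print(xt.argmax())  # tensor(2): logical row-major index of 9.0 in xt
# The bug caused the compiled version to report the physical memory offset of 9.0
# in the underlying storage (1 here) instead of the logical index.
```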

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165983
Approved by: https://github.com/shunting314
2025-10-28 01:23:24 +00:00
8af9ed0824 [torchfuzz] split, chunk, stack, cat, expand, gather, cumsum, clamp, index_select, split (#166221)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166221
Approved by: https://github.com/pianpwk
ghstack dependencies: #166187, #166188, #166220, #166189, #166190
2025-10-28 01:21:07 +00:00
7045aab143 [torchfuzz] add mhaf operator (#166190)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166190
Approved by: https://github.com/pianpwk
ghstack dependencies: #166187, #166188, #166220, #166189
2025-10-28 01:21:07 +00:00
7ae8aaf4c0 [torchfuzz] add sdpa operator (#166189)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166189
Approved by: https://github.com/pianpwk
ghstack dependencies: #166187, #166188, #166220
2025-10-28 01:20:58 +00:00
f2450798cd [torchfuzz] make pointwise subclasses defined torch_op_name (#166220)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166220
Approved by: https://github.com/pianpwk
ghstack dependencies: #166187, #166188
2025-10-28 01:08:34 +00:00
46d17e8871 [Symm mem] Add a unit test for mempool tensor with dist collective (#166206)
We haven't previously checked whether tensors on NVSHMEM work when calling c10d collectives. This PR adds a showcase for it inside a unit test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166206
Approved by: https://github.com/ngimel
2025-10-28 00:41:47 +00:00
dc011d3203 [inductor][ez] add overridable env var for disabling fx graph cache (#166138)
I set TORCHINDUCTOR_FX_GRAPH_CACHE=0 a lot to make sure compilation actually
happens, by disabling fx graph caching. I even put this in my .bashrc.
But this causes a simple vllm script to fail:
https://gist.github.com/shunting314/4253b2b5ab5e7d1b0fc9516c84054904

Error log:
https://gist.github.com/shunting314/1d04bbeb58bc486f975684f56d65615d

The root cause is:
1. vllm patches inductor_config.fx_graph_cache to True here:
   e255d92990/vllm/compilation/compiler_interface.py (L308)

   The code in vllm relies on the fx graph cache being on (unless
   VLLM_DISABLE_COMPILE_CACHE is overridden to false)
2. setting TORCHINDUCTOR_FX_GRAPH_CACHE=0 makes
   inductor_config.fx_graph_cache not overridable.

I added TORCHINDUCTOR_FX_GRAPH_CACHE_DEFAULT so that we can still use it to skip the
fx graph cache while still allowing projects like vllm to override it.
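
A sketch of the intended precedence (only the env var names come from this PR; the config plumbing shown is an assumption):
```python
import os

# TORCHINDUCTOR_FX_GRAPH_CACHE pins the value outright, so a later config
# assignment (like vllm's inductor_config.fx_graph_cache = True) cannot change it.
# TORCHINDUCTOR_FX_GRAPH_CACHE_DEFAULT only seeds the default, so such an
# assignment still wins.
fx_graph_cache = os.environ.get("TORCHINDUCTOR_FX_GRAPH_CACHE_DEFAULT", "1") == "1"
```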

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166138
Approved by: https://github.com/eellison
2025-10-28 00:27:19 +00:00
e95920e3e6 [Optimus] Rename the post_grad_graph tlparse log (#166109)
Summary:
ezyang observed a cache miss issue, see details in https://github.com/pytorch/pytorch/issues/166012

We thus rename the post_grad_graph tlparse log name to resolve the cache issue.

Differential Revision: D85309891

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166109
Approved by: https://github.com/jamesjwu
2025-10-28 00:23:01 +00:00
5e769ff867 [CD] Upgrade to CUDA 13.0.2 for nightly binaries (#165470)
13.0.U2 is posted, adding to nightlies
Why we want to upgrade: CUDA 13.0.U2 includes a new cuBLAS release that
1. enables opt-in fixed-point emulation for FP64 matmuls (D/ZGEMM), which improves performance and power-efficiency.
2. improves performance on NVIDIA [DGX Spark](https://www.nvidia.com/en-us/products/workstations/dgx-spark/) for FP16/BF16 and FP8 GEMMs.
3. adds BF16x9 FP32 emulation support for SYRK and HERK routines.
Reference: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cublas-release-13-0-update-2

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165470
Approved by: https://github.com/atalman
2025-10-28 00:21:47 +00:00
0ae3e30621 [torchfuzz] fix group norm operator (#166188)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166188
Approved by: https://github.com/pianpwk
ghstack dependencies: #166187
2025-10-28 00:11:04 +00:00
47f50cfd45 [torchfuzz] check in more ignore regexes (#166187)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166187
Approved by: https://github.com/pianpwk
2025-10-27 23:58:54 +00:00
a51f877287 Enable local tensor mode for another set of DTensor tests (#166105)
Enable local tensor mode DTensor tests for the optimizers, op strategy,  matrix ops,
math ops, init ops, experimental ops, embedding ops, dynamic, convolution ops, main api.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166105
Approved by: https://github.com/ezyang
2025-10-27 23:58:24 +00:00
b44423bbb4 [inductor][choices] lookup table choices 1/3 (#164978)
\# why

- enable users to control which choices get used on which inputs
- reduce lowering time, and pin kernel selection, by selecting
  them for the inputs

\# what

- a new InductorChoices subclass that implements a lookup table
- a README explaining the usage
- corresponding testing

- currently only supports templates that go through
  `V.choices.get_template_configs`

\# testing

```
python3 -bb -m pytest test/inductor/test_lookup_table.py -v
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164978
Approved by: https://github.com/PaulZhang12, https://github.com/eellison
2025-10-27 23:45:16 +00:00
8e1e4ee8e0 [reland][dynamo][easy] Support torch.accelerator.current_accelerator (#166327)
Reland https://github.com/pytorch/pytorch/pull/165734

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166327
Approved by: https://github.com/Lucaskabela
2025-10-27 23:41:43 +00:00
1e836bc769 [MPS] fix large matmul test device (#166271)
PR is self explanatory
Test was introduced by https://github.com/pytorch/pytorch/pull/143095 and was always running on CPU

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166271
Approved by: https://github.com/kulinseth, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-10-27 22:56:59 +00:00
9a91486e45 [Inductor-FX] Don't flatten constant args (#166144)
Summary: Fallback kernels are created with flattened constant args and an `unflatten` utility to unflatten them when needed. Apply it in FXConverter to preserve the original structure
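
An illustrative pytree round-trip of the flatten/unflatten idea (not the exact utility used by the fallback kernels):
```python
import torch.utils._pytree as pytree

constant_args = {"alpha": 2.0, "dims": (0, 1)}
flat, spec = pytree.tree_flatten(constant_args)   # leaves: [2.0, 0, 1]
restored = pytree.tree_unflatten(flat, spec)      # original nesting restored
assert restored == constant_args
```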

Test Plan: added new CI tests

Differential Revision: D85347589

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166144
Approved by: https://github.com/blaine-rister
2025-10-27 22:33:37 +00:00
92381a5aa7 [ROCm] Custom OpenBLAS library name (#166333)
- TheRock build system for ROCm builds OpenBLAS from source and uses a custom name for the library.
- Following existing conventions in `FindOpenBLAS.cmake` to support finding a custom named version of OpenBLAS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166333
Approved by: https://github.com/jeffdaily
2025-10-27 22:13:05 +00:00
2a5f87decf [cuDNN] Smoke-test runtime cuDNN version matches compile time version in CI (#165922)
Fix and regression test for https://github.com/pytorch/pytorch/issues/165801

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165922
Approved by: https://github.com/malfet, https://github.com/atalman, https://github.com/Skylion007, https://github.com/drisspg

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-10-27 22:10:45 +00:00
840d63c12d Update cuDNN 9.10.2 in Manylinux 2.28 Docker files (#165913)
Fixes https://github.com/pytorch/pytorch/issues/165801
Smoke test: https://github.com/pytorch/pytorch/pull/165922/files

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165913
Approved by: https://github.com/Camyll, https://github.com/Skylion007
2025-10-27 22:08:06 +00:00
2ce894bb1d [dynamo] Dont guard on numpy Cython functions (#166328)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166328
Approved by: https://github.com/Lucaskabela
2025-10-27 22:01:10 +00:00
47ec1e9990 Support regional inductor with custom config (#166269)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166269
Approved by: https://github.com/anijain2305
2025-10-27 21:46:02 +00:00
904abfc2ca Export flex attention with kwargs and DTensor (#166045)
Fixes #165948

Adding registration of the MaskBlock makes flex attention with kwargs exportable.

Also modified unittests to accept kwargs

```
python test/distributed/tensor/test_dtensor_export.py -k test_flex_attention_dtensor_export

python test/inductor/test_flex_attention.py -k test_pytree_
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166045
Approved by: https://github.com/drisspg, https://github.com/SherlockNoMad

Co-authored-by: fduwjj <fduwjj@gmail.com>
2025-10-27 21:40:40 +00:00
7d16fcf2df Re-re-re-re-apply "C++-accessible Placements via pybind11 (#163030)" (#166132)
Was reverted (again!) due to a merge conflict that crept in sometime during the "export to github -> land internally -> merge on github" process.

D85096233

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166132
Approved by: https://github.com/Skylion007, https://github.com/ezyang, https://github.com/malfet
2025-10-27 21:19:32 +00:00
483845a9c4 [DTensor][Op] fix for DTensor ops with Partial placements (#165962)
**Summary:** When operations are done on Partial placements, the sharding logic incorrectly determines whether we should redistribute the tensor to Replicate. By delaying the redistribution, we do the operation first and the partial reduction afterwards, which leads to incorrect results for max, min, gradient norm clipping, and more. We solve this by setting reduction_linear to False when there is a Partial placement, forcing the redistribution before completing the op.

**Test Cases**
1. pytest test/distributed/tensor/test_math_ops.py -k test_partial_reduction_ops
2. pytest test/distributed/tensor/test_math_ops.py -k test_matching_partial_reduction_ops

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165962
Approved by: https://github.com/wconstab
2025-10-27 21:17:13 +00:00
60bcb4ee88 [pipeline][be] refactored pipeline composability tests (#165701)
**Summary:** The first change increases the world size to 8, because test_3d_with_tp_dp_pp wouldn't actually exercise fully_shard with tp = 2 and pp = 2, leaving dp = 1. The second refactors the tests that use single- and multi-stage schedules so that their logic is largely shared. This is accomplished by using the multi-stage-schedule logic from test_replicate_pp_grad to determine the start and end indices of a partial model, while setting virtual_stage to 1 when a single-stage schedule is used. Even if this approach isn't approved, the multi-stage schedule logic in test_3d_with_tp_dp_pp and test_replicate_pp should be changed, as the logic currently used is incorrect.

**Test Case**
1. pytest test/distributed/_composable/test_composability/test_pp_composability.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165701
Approved by: https://github.com/H-Huang
2025-10-27 21:08:57 +00:00
ee7434be82 [dynamo][guards] 1/N Guard selectively for DTensor (#165824)
A few internal jobs are observing very high guard overhead for DTensor.
Since we own DTensor, we can make those guards way faster.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165824
Approved by: https://github.com/Lucaskabela, https://github.com/bdhirsh
2025-10-27 20:35:40 +00:00
d049ed2cb1 [BE] Fix metal compilation warnings (#166315)
- Fixes `s/#pragma onces/#pragma once` typo

All methods in the headers must be inline, otherwise one gets a barrage of the following warnings
```
/Users/malfet/git/pytorch/pytorch/c10/metal/utils.h:337:7: warning: unused function 'conj<half __attribute__((ext_vector_type(2)))>' [-Wunused-function]
half2 conj(half2 a) {
      ^
/Users/malfet/git/pytorch/pytorch/c10/metal/utils.h:342:8: warning: unused function 'conj<float __attribute__((ext_vector_type(2)))>' [-Wunused-function]
float2 conj(float2 a) {
       ^
2 warnings generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166315
Approved by: https://github.com/seemethere, https://github.com/atalman
2025-10-27 20:17:10 +00:00
09b7c29589 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-13 14:23:01 -07:00
c3ee2a7cc0 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-13 14:23:01 -07:00
12aecbe8a9 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-13 12:47:27 -07:00
898c7e2469 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-13 12:47:27 -07:00
e8ed389b1c Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-13 10:30:20 -07:00
0ec0409263 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-13 10:30:20 -07:00
269ddf2806 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-12 22:29:10 -07:00
6a604f5fd0 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-12 22:29:10 -07:00
e6b97f56f7 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-12 21:30:45 -07:00
92810f9415 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-12 21:30:45 -07:00
8aca43eec0 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-10 20:15:09 -07:00
6cff982faf Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-10 20:15:09 -07:00
0d0ea39e0b Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-10 20:04:08 -07:00
0b8d222d87 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-10 20:04:08 -07:00
508a1a85fd Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-09 11:32:43 -07:00
e16b1f6aaa Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-09 11:32:43 -07:00
ec9c05790a Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-09 10:50:24 -07:00
03c7ab48f1 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-09 10:50:24 -07:00
e8de928a48 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 20:23:15 -07:00
9413170be2 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 20:23:15 -07:00
436279a047 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 20:12:15 -07:00
b111e4992c Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 20:12:15 -07:00
9a08c60898 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 17:06:31 -07:00
57f98ce90f Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 17:06:31 -07:00
a7028b599c Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 16:39:44 -07:00
57b5cbeba9 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-08 16:39:44 -07:00
bae10f7b04 Update base for Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-06 22:09:00 -07:00
4961153214 Update on "[user-streams] Add basic stream tests"
[ghstack-poisoned]
2025-10-06 22:09:00 -07:00
7ab6845341 [user-streams] Add basic stream tests
[ghstack-poisoned]
2025-10-02 16:36:18 -07:00
34d6051176 [User-streams] Make torch.Event weakref compatible
[ghstack-poisoned]
2025-10-02 16:36:14 -07:00
5b6fc77250 Update on "[user-streams] Make device-agnostic streams weakref compatible"
[ghstack-poisoned]
2025-10-02 16:36:14 -07:00
3e34678357 Update base for Update on "[user-streams] Make device-agnostic streams weakref compatible"
[ghstack-poisoned]
2025-10-02 16:36:14 -07:00
fdccfc2dc8 Update on "[user-streams] Make cuda streams weakref compatible"
[ghstack-poisoned]
2025-10-02 14:23:27 -07:00
10bb9b3ca0 Update base for Update on "[user-streams] Make cuda streams weakref compatible"
[ghstack-poisoned]
2025-10-02 14:23:27 -07:00
bd04e30f8a Update on "[user-streams] Make cuda streams weakref compatible"
[ghstack-poisoned]
2025-10-01 01:22:07 -07:00
345f4f396a Update base for Update on "[user-streams] Make cuda streams weakref compatible"
[ghstack-poisoned]
2025-10-01 01:22:06 -07:00
4e9d08856e [user-streams] Make cuda streams weakref compatible
[ghstack-poisoned]
2025-09-30 15:31:30 -07:00
1b0ed4de38 Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-30 15:31:30 -07:00
f12798434a Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-30 15:31:30 -07:00
ae23da73f1 Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-18 23:57:35 -07:00
397155450f Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-18 23:57:35 -07:00
53aa957add Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-18 16:10:33 -07:00
7835bc709a Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-18 16:10:33 -07:00
842680934a Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-16 10:21:36 -07:00
b441d7c6d1 Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-16 10:21:36 -07:00
4c9d834916 Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-16 01:06:18 -07:00
d7bf92baa9 Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-16 01:06:18 -07:00
77ddab454e Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-16 00:53:05 -07:00
5e00f4a152 Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-16 00:53:05 -07:00
33375f19ae Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-15 17:48:45 -07:00
b7eb8dace7 Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-15 17:48:45 -07:00
e388d09cf0 Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-13 23:58:10 -07:00
e330c24154 Update base for Update on "[dynamo] Remove retrieving objects by ID"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-09-13 23:58:10 -07:00
9cf38e3efd [dynamo] Remove retrieving objects by ID
[ghstack-poisoned]
2025-09-13 23:55:52 -07:00
2ce5614456 [user-streams] Track external/internal nodes for stream context
[ghstack-poisoned]
2025-09-13 23:43:21 -07:00
b95c6cb7f3 [user-streams] update stream context to use fork/join
[ghstack-poisoned]
2025-09-13 23:43:18 -07:00
eda656b169 [user-streams] Add stream state manager
[ghstack-poisoned]
2025-09-13 23:43:14 -07:00
da38f66f8e [user-cuda-streams] Add cuda streams test suite
[ghstack-poisoned]
2025-09-13 23:43:11 -07:00
797f61ab4e [user-cuda-streams] Add fork/join custom ops
Make custom ops inplace

[ghstack-poisoned]
2025-09-13 23:43:07 -07:00
2a1a2804ca [user-cuda-streams] Pass streams/events to the graph via lookup table
[ghstack-poisoned]
2025-09-13 23:43:03 -07:00
859 changed files with 30323 additions and 11672 deletions

View File

@ -195,13 +195,16 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3)
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
TRITON=yes
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10

View File

@ -3,7 +3,7 @@
set -eux
ACL_VERSION=${ACL_VERSION:-"v25.02"}
ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
ACL_INSTALL_DIR="/acl"
# Clone ACL

View File

@ -49,12 +49,20 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
export SYSROOT_DEP="sysroot_linux-64=2.17"
fi
# Install correct Python version
# Also ensure sysroot is using a modern GLIBC to match system compilers
if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
python="3.14.0" \
${SYSROOT_DEP} \
-c conda-forge
else
# Install correct Python version
# Also ensure sysroot is using a modern GLIBC to match system compilers
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
python="$ANACONDA_PYTHON_VERSION" \
${SYSROOT_DEP}
fi
# libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
# which is provided in libstdcxx 12 and up.
conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge

View File

@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.24
NVSHMEM_VERSION=3.4.5
function install_cuda {
version=$1
@ -150,7 +150,7 @@ function install_130 {
CUDNN_VERSION=9.13.0.50
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 13 $CUDNN_VERSION

View File

@ -40,11 +40,7 @@ EOF
# Default url values
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -

View File

@ -12,8 +12,8 @@ function do_install() {
rocm_version_nodot=${rocm_version//./}
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
# post merge of https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm"

View File

@ -97,7 +97,7 @@ case ${image} in
manylinux2_28-builder:xpu)
TARGET=xpu_final
GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
MANY_LINUX_VERSION="2_28"
;;
*)

View File

@ -138,10 +138,12 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#test_binary_ufuncs.py
numpy==1.22.4; python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
numpy==2.3.4; python_version >= "3.14"
pandas==2.0.3; python_version < "3.13"
pandas==2.2.3; python_version >= "3.13"
pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
pandas==2.3.3; python_version >= "3.14"
#onnxruntime
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
@ -153,7 +155,8 @@ opt-einsum==3.3
#Pinned versions: 3.3
#test that import: test_linalg.py
optree==0.13.0
optree==0.13.0 ; python_version < "3.14"
optree==0.17.0 ; python_version >= "3.14"
#Description: A library for tree manipulation
#Pinned versions: 0.13.0
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
@ -252,7 +255,8 @@ scikit-image==0.22.0
#test that import:
scipy==1.10.1 ; python_version <= "3.11"
scipy==1.14.1 ; python_version >= "3.12"
scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
scipy==1.16.2 ; python_version >= "3.14"
# Pin SciPy because of failing distribution tests (see #60347)
#Description: scientific python
#Pinned versions: 1.10.1
@ -324,7 +328,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
#Pinned versions: 1.4.1
#test that import:
lxml==5.3.0
lxml==5.3.0 ; python_version < "3.14"
lxml==6.0.2 ; python_version >= "3.14"
#Description: This is a requirement of unittest-xml-reporting
PyGithub==2.3.0
@ -334,7 +339,9 @@ sympy==1.13.3
#Pinned versions:
#test that import:
onnx==1.19.1
onnx==1.19.1 ; python_version < "3.14"
# Unpin once Python 3.14 is supported. See onnxruntime issue 26309.
onnx==1.18.0 ; python_version == "3.14"
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:
@ -359,7 +366,7 @@ pwlf==2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml==6.0.2
pyyaml==6.0.3
pyzstd
setuptools==78.1.1
packaging==23.1

View File

@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@ -100,6 +100,8 @@ COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

View File

@ -6,7 +6,7 @@ dependencies = [
"GitPython==3.1.45",
"docker==7.1.0",
"pytest==7.3.2",
"uv==0.9.5"
"uv==0.9.6"
]
[tool.setuptools]

View File

@ -1,7 +1,7 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
DESIRED_ROCM ?= 7.0
DESIRED_ROCM ?= 7.1
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma-rocm/build_magma.sh
.PHONY: all
all: magma-rocm71
all: magma-rocm70
all: magma-rocm64
@ -24,6 +25,11 @@ clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-rocm71
magma-rocm71: DESIRED_ROCM := 7.1
magma-rocm71:
$(DOCKER_RUN)
.PHONY: magma-rocm70
magma-rocm70: DESIRED_ROCM := 7.0
magma-rocm70:

View File

@ -6,8 +6,8 @@ set -eou pipefail
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
# post merge of https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
# Fetch magma sources and verify checksum
pushd ${PACKAGE_DIR}
git clone https://github.com/jeffdaily/magma
git clone https://github.com/icl-utk-edu/magma
pushd magma
git checkout ${MAGMA_VERSION}
popd

View File

@ -426,7 +426,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
# export test times so that potential sharded tests that'll branch off this build will use consistent data
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
python tools/stats/export_test_times.py
PYTHONPATH=. python tools/stats/export_test_times.py
fi
# don't do this for bazel or s390x or riscv64 as they don't use sccache
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

View File

@ -460,28 +460,18 @@ test_inductor_shard() {
--verbose
}
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
test_inductor_aoti_cpp() {
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# We need to hipify before building again
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
# aoti cmake custom command requires `torch` to be installed
# initialize the cmake build cache and install torch
/usr/bin/env "${BUILD_COMMAND[@]}"
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
@ -582,6 +572,8 @@ fi
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
@ -675,6 +667,8 @@ test_perf_for_dashboard() {
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
device=xpu
fi
for mode in "${modes[@]}"; do
@ -1767,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
else
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
@ -1776,7 +1770,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
test_inductor_aoti_cpp
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision

View File

@ -7,12 +7,9 @@ if "%DESIRED_PYTHON%" == "3.13t" (
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.13t"
) else if "%DESIRED_PYTHON%"=="3.14" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
) else if "%DESIRED_PYTHON%"=="3.14t" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.14t"
) else (

View File

@ -1,3 +1,8 @@
---
name: docstring
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
---
# PyTorch Docstring Writing Guide
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.

View File

@ -0,0 +1,385 @@
---
name: skill-writer
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
---
# Skill Writer
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
## When to use this Skill
Use this Skill when:
- Creating a new Agent Skill
- Writing or updating SKILL.md files
- Designing skill structure and frontmatter
- Troubleshooting skill discovery issues
- Converting existing prompts or workflows into Skills
## Instructions
### Step 1: Determine Skill scope
First, understand what the Skill should do:
1. **Ask clarifying questions**:
- What specific capability should this Skill provide?
- When should Claude use this Skill?
- What tools or resources does it need?
- Is this for personal use or team sharing?
2. **Keep it focused**: One Skill = one capability
- Good: "PDF form filling", "Excel data analysis"
- Too broad: "Document processing", "Data tools"
### Step 2: Choose Skill location
Determine where to create the Skill:
**Personal Skills** (`~/.claude/skills/`):
- Individual workflows and preferences
- Experimental Skills
- Personal productivity tools
**Project Skills** (`.claude/skills/`):
- Team workflows and conventions
- Project-specific expertise
- Shared utilities (committed to git)
### Step 3: Create Skill structure
Create the directory and files:
```bash
# Personal
mkdir -p ~/.claude/skills/skill-name
# Project
mkdir -p .claude/skills/skill-name
```
For multi-file Skills:
```
skill-name/
├── SKILL.md (required)
├── reference.md (optional)
├── examples.md (optional)
├── scripts/
│ └── helper.py (optional)
└── templates/
└── template.txt (optional)
```
### Step 4: Write SKILL.md frontmatter
Create YAML frontmatter with required fields:
```yaml
---
name: skill-name
description: Brief description of what this does and when to use it
---
```
**Field requirements**:
- **name**:
- Lowercase letters, numbers, hyphens only
- Max 64 characters
- Must match directory name
- Good: `pdf-processor`, `git-commit-helper`
- Bad: `PDF_Processor`, `Git Commits!`
- **description**:
- Max 1024 characters
- Include BOTH what it does AND when to use it
- Use specific trigger words users would say
- Mention file types, operations, and context
**Optional frontmatter fields**:
- **allowed-tools**: Restrict tool access (comma-separated list)
```yaml
allowed-tools: Read, Grep, Glob
```
Use for:
- Read-only Skills
- Security-sensitive workflows
- Limited-scope operations
### Step 5: Write effective descriptions
The description is critical for Claude to discover your Skill.
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
**Examples**:
✅ **Good**:
```yaml
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
```
✅ **Good**:
```yaml
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
```
❌ **Too vague**:
```yaml
description: Helps with documents
description: For data analysis
```
**Tips**:
- Include specific file extensions (.pdf, .xlsx, .json)
- Mention common user phrases ("analyze", "extract", "generate")
- List concrete operations (not generic verbs)
- Add context clues ("Use when...", "For...")
### Step 6: Structure the Skill content
Use clear Markdown sections:
```markdown
# Skill Name
Brief overview of what this Skill does.
## Quick start
Provide a simple example to get started immediately.
## Instructions
Step-by-step guidance for Claude:
1. First step with clear action
2. Second step with expected outcome
3. Handle edge cases
## Examples
Show concrete usage examples with code or commands.
## Best practices
- Key conventions to follow
- Common pitfalls to avoid
- When to use vs. not use
## Requirements
List any dependencies or prerequisites:
```bash
pip install package-name
```
## Advanced usage
For complex scenarios, see [reference.md](reference.md).
```
### Step 7: Add supporting files (optional)
Create additional files for progressive disclosure:
**reference.md**: Detailed API docs, advanced options
**examples.md**: Extended examples and use cases
**scripts/**: Helper scripts and utilities
**templates/**: File templates or boilerplate
Reference them from SKILL.md:
```markdown
For advanced usage, see [reference.md](reference.md).
Run the helper script:
\`\`\`bash
python scripts/helper.py input.txt
\`\`\`
```
### Step 8: Validate the Skill
Check these requirements (a small validation sketch follows this checklist):
✅ **File structure**:
- [ ] SKILL.md exists in correct location
- [ ] Directory name matches frontmatter `name`
✅ **YAML frontmatter**:
- [ ] Opening `---` on line 1
- [ ] Closing `---` before content
- [ ] Valid YAML (no tabs, correct indentation)
- [ ] `name` follows naming rules
- [ ] `description` is specific and < 1024 chars
✅ **Content quality**:
- [ ] Clear instructions for Claude
- [ ] Concrete examples provided
- [ ] Edge cases handled
- [ ] Dependencies listed (if any)
✅ **Testing**:
- [ ] Description matches user questions
- [ ] Skill activates on relevant queries
- [ ] Instructions are clear and actionable
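A minimal validation sketch for the checks above, assuming a flat `key: value` frontmatter with no nested YAML; the script name and helper are illustrative, not an official validator:
```python
import re
import sys
from pathlib import Path

NAME_RE = re.compile(r"[a-z0-9]+(-[a-z0-9]+)*")


def check_skill(skill_md: str) -> list[str]:
    """Return a list of problems found in a SKILL.md file."""
    path = Path(skill_md)
    lines = path.read_text().splitlines()
    if not lines or lines[0].strip() != "---":
        return ["frontmatter must open with '---' on line 1"]
    try:
        end = lines[1:].index("---") + 1  # index of the closing '---'
    except ValueError:
        return ["frontmatter has no closing '---'"]
    # Flat "key: value" parsing only (no nested YAML).
    fields = {}
    for line in lines[1:end]:
        key, sep, value = line.partition(":")
        if sep:
            fields[key.strip()] = value.strip()
    problems = []
    name = fields.get("name", "")
    description = fields.get("description", "")
    if not NAME_RE.fullmatch(name) or len(name) > 64:
        problems.append(f"name {name!r} must be lowercase/hyphens, max 64 chars")
    if name != path.parent.name:
        problems.append("name does not match the directory name")
    if not description or len(description) > 1024:
        problems.append("description is missing or over 1024 characters")
    return problems


if __name__ == "__main__":
    issues = check_skill(sys.argv[1])
    print("\n".join(issues) or "SKILL.md looks OK")
```
Run it as, for example, `python check_skill.py ~/.claude/skills/skill-name/SKILL.md`.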
### Step 9: Test the Skill
1. **Restart Claude Code** (if running) to load the Skill
2. **Ask relevant questions** that match the description:
```
Can you help me extract text from this PDF?
```
3. **Verify activation**: Claude should use the Skill automatically
4. **Check behavior**: Confirm Claude follows the instructions correctly
### Step 10: Debug if needed
If Claude doesn't use the Skill:
1. **Make description more specific**:
- Add trigger words
- Include file types
- Mention common user phrases
2. **Check file location**:
```bash
ls ~/.claude/skills/skill-name/SKILL.md
ls .claude/skills/skill-name/SKILL.md
```
3. **Validate YAML**:
```bash
cat SKILL.md | head -n 10
```
4. **Run debug mode**:
```bash
claude --debug
```
## Common patterns
### Read-only Skill
```yaml
---
name: code-reader
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
allowed-tools: Read, Grep, Glob
---
```
### Script-based Skill
```yaml
---
name: data-processor
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
---
# Data Processor
## Instructions
1. Use the processing script:
\`\`\`bash
python scripts/process.py input.csv --output results.json
\`\`\`
2. Validate output with:
\`\`\`bash
python scripts/validate.py results.json
\`\`\`
```
### Multi-file Skill with progressive disclosure
```yaml
---
name: api-designer
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
---
# API Designer
Quick start: See [examples.md](examples.md)
Detailed reference: See [reference.md](reference.md)
## Instructions
1. Gather requirements
2. Design endpoints (see examples.md)
3. Document with OpenAPI spec
4. Review against best practices (see reference.md)
```
## Best practices for Skill authors
1. **One Skill, one purpose**: Don't create mega-Skills
2. **Specific descriptions**: Include trigger words users will say
3. **Clear instructions**: Write for Claude, not humans
4. **Concrete examples**: Show real code, not pseudocode
5. **List dependencies**: Mention required packages in description
6. **Test with teammates**: Verify activation and clarity
7. **Version your Skills**: Document changes in content
8. **Use progressive disclosure**: Put advanced details in separate files
## Validation checklist
Before finalizing a Skill, verify:
- [ ] Name is lowercase, hyphens only, max 64 chars
- [ ] Description is specific and < 1024 chars
- [ ] Description includes "what" and "when"
- [ ] YAML frontmatter is valid
- [ ] Instructions are step-by-step
- [ ] Examples are concrete and realistic
- [ ] Dependencies are documented
- [ ] File paths use forward slashes
- [ ] Skill activates on relevant queries
- [ ] Claude follows instructions correctly
## Troubleshooting
**Skill doesn't activate**:
- Make description more specific with trigger words
- Include file types and operations in description
- Add "Use when..." clause with user phrases
**Multiple Skills conflict**:
- Make descriptions more distinct
- Use different trigger words
- Narrow the scope of each Skill
**Skill has errors**:
- Check YAML syntax (no tabs, proper indentation)
- Verify file paths (use forward slashes)
- Ensure scripts have execute permissions
- List all dependencies
## Examples
See the documentation for complete examples (a minimal bootstrap sketch follows this list):
- Simple single-file Skill (commit-helper)
- Skill with tool permissions (code-reviewer)
- Multi-file Skill (pdf-processing)
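Those complete examples live in the external documentation; as a hedged bootstrap sketch (hypothetical description text, with frontmatter following the rules above), a minimal single-file Skill such as commit-helper can be scaffolded like this:
```python
from pathlib import Path
from textwrap import dedent

# Personal Skill location; use Path(".claude/skills") instead for a project Skill.
skill_dir = Path.home() / ".claude" / "skills" / "commit-helper"
skill_dir.mkdir(parents=True, exist_ok=True)

(skill_dir / "SKILL.md").write_text(dedent("""\
    ---
    name: commit-helper
    description: Draft clear git commit messages from staged changes. Use when the user asks to write, improve, or reformat a commit message.
    ---
    # Commit Helper

    ## Instructions
    1. Run `git diff --cached` to inspect the staged changes.
    2. Write an imperative subject line under 72 characters.
    3. Add a short body explaining why the change was made.
"""))
```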
## Output format
When creating a Skill, I will:
1. Ask clarifying questions about scope and requirements
2. Suggest a Skill name and location
3. Create the SKILL.md file with proper frontmatter
4. Include clear instructions and examples
5. Add supporting files if needed
6. Provide testing instructions
7. Validate against all requirements
The result will be a complete, working Skill that follows all best practices and validation rules.


@ -27,7 +27,9 @@ runs:
docker system prune -af
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
diskspace_cutoff_int=$((diskspace_cutoff + 0))
difference=$((100 - diskspace_cutoff_int))
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
echo "$msg"
exit 1
else


@ -1 +1 @@
69bbe7363897764f9e758d851cd0340147d27f94
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2


@ -1 +1 @@
1752fe6809b74921644866275ab80244b96e80bc
218d2ab791d437309f91e0486eb9fa7f00badc17


@ -540,6 +540,26 @@
- Lint
- pull
- name: PrivateUse1
patterns:
- torch/accelerator/**
- torch/utils/backend_registration.py
- torch/csrc/acc/**
- torch/csrc/DeviceAccelerator.*
- torch/csrc/profiler/standalone/privateuse1_observer.*
- aten/src/ATen/DeviceAccelerator.*
- aten/src/ATen/core/GeneratorForPrivateuseone.*
- aten/src/ATen/detail/PrivateUse1HooksInterface.*
- docs/source/accelerator/**
- test/cpp_extensions/open_registration_extension/torch_openreg/**
approved_by:
- albanD
- fffrog
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: superuser
patterns:
- '*'


@ -19,6 +19,7 @@ ciflow_push_tags:
- ciflow/inductor-perf-test-nightly-rocm-mi300
- ciflow/inductor-perf-test-nightly-rocm-mi355
- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-perf-test-nightly-xpu
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/linux-aarch64
@ -26,6 +27,7 @@ ciflow_push_tags:
- ciflow/nightly
- ciflow/op-benchmark
- ciflow/periodic
- ciflow/periodic-rocm-mi200
- ciflow/periodic-rocm-mi300
- ciflow/pull
- ciflow/quantization-periodic


@ -11,18 +11,24 @@ architectures:
* Latest XPU
"""
import json
import os
import re
from pathlib import Path
from typing import Optional
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
SCRIPT_DIR = Path(__file__).absolute().parent
REPO_ROOT = SCRIPT_DIR.parent.parent
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
"12.9": "12.9.1",
"13.0": "13.0.0",
"13.0": "13.0.2",
}
CUDA_ARCHES_CUDNN_VERSION = {
"12.6": "9",
@ -31,8 +37,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
"13.0": "9",
}
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
ROCM_ARCHES = ["6.4", "7.0"]
ROCM_ARCHES = ["7.0", "7.1"]
XPU_ARCHES = ["xpu"]
@ -56,7 +61,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
@ -73,7 +78,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
@ -90,27 +95,27 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
"nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "
@ -137,9 +142,48 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
}
def get_nccl_wheel_version(arch_version: str) -> str:
import re
# Used by tools/nightly.py
PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
NIGHTLY_SOURCE_MATRIX = {
"cpu": dict(
name="cpu",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
supported_platforms=["Linux", "macOS", "Windows"],
accelerator="cpu",
)
}
CUDA_NIGHTLY_SOURCE_MATRIX = {
f"cuda-{major}.{minor}": dict(
name=f"cuda-{major}.{minor}",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
supported_platforms=["Linux", "Windows"],
accelerator="cuda",
)
for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
}
ROCM_NIGHTLY_SOURCE_MATRIX = {
f"rocm-{major}.{minor}": dict(
name=f"rocm-{major}.{minor}",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
supported_platforms=["Linux"],
accelerator="rocm",
)
for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
}
XPU_NIGHTLY_SOURCE_MATRIX = {
"xpu": dict(
name="xpu",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
supported_platforms=["Linux"],
accelerator="xpu",
)
}
NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
def get_nccl_wheel_version(arch_version: str) -> str:
requirements = map(
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
)
@ -147,17 +191,14 @@ def get_nccl_wheel_version(arch_version: str) -> str:
def read_nccl_pin(arch_version: str) -> str:
from pathlib import Path
nccl_pin_path = os.path.join(
Path(__file__).absolute().parents[2],
".ci",
"docker",
"ci_commit_pins",
f"nccl-cu{arch_version[:2]}.txt",
nccl_pin_path = (
REPO_ROOT
/ ".ci"
/ "docker"
/ "ci_commit_pins"
/ f"nccl-cu{arch_version[:2]}.txt"
)
with open(nccl_pin_path) as f:
return f.read().strip()
return nccl_pin_path.read_text().strip()
def validate_nccl_dep_consistency(arch_version: str) -> None:
@ -165,7 +206,8 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
wheel_ver = get_nccl_wheel_version(arch_version)
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
raise RuntimeError(
f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
f"{arch_version} NCCL release tag version {nccl_release_tag} "
f"does not correspond to wheel version {wheel_ver}"
)
@ -412,7 +454,14 @@ def generate_wheels_matrix(
return ret
validate_nccl_dep_consistency("13.0")
validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")
arch_version = ""
for arch_version in CUDA_ARCHES:
validate_nccl_dep_consistency(arch_version)
del arch_version
if __name__ == "__main__":
# Used by tools/nightly.py
(SCRIPT_DIR / "nightly_source_matrix.json").write_text(
json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
)


@ -38,6 +38,10 @@ on:
default: ""
description: |
List of tests to include (empty string implies default list)
dashboard-tag:
required: false
type: string
default: ""
disable-monitor:
description: |
[Experimental] Disable utilization monitoring for tests.
@ -58,6 +62,11 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
permissions:
id-token: write
contents: read
@ -196,6 +205,8 @@ jobs:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
# Fetch aws credential from IMDs
@ -246,6 +257,8 @@ jobs:
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e ZE_AFFINITY_MASK \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \


@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main


@ -52,8 +52,8 @@ jobs:
{ tag: "cuda12.9" },
{ tag: "cuda12.8" },
{ tag: "cuda12.6" },
{ tag: "rocm6.4" },
{ tag: "rocm7.0" },
{ tag: "rocm7.1" },
{ tag: "cpu" },
]
steps:


@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
rocm_version: ["70", "64"]
rocm_version: ["71", "70"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2


@ -54,8 +54,8 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },


@ -55,7 +55,7 @@ jobs:
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
rocm_version: "7.0"
rocm_version: "7.1"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda"
rocm_version: ""
@ -159,12 +159,7 @@ jobs:
WITH_CLANG_LDD="--with-clang-ldd"
fi
if [[ "${BUILD_DEVICE}" == xpu ]]; then
docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
else
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
fi
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
docker exec -t "${container_name}" bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"


@ -57,6 +57,7 @@ jobs:
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-py3.14-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
@ -66,6 +67,7 @@ jobs:
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,


@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -270,7 +270,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -473,7 +473,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -519,7 +519,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -676,7 +676,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -722,7 +722,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -768,7 +768,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -879,7 +879,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -925,7 +925,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -971,7 +971,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1017,7 +1017,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1128,7 +1128,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1174,7 +1174,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1220,7 +1220,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1266,7 +1266,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1377,7 +1377,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1423,7 +1423,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1469,7 +1469,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1626,7 +1626,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1672,7 +1672,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1718,7 +1718,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}


@ -384,124 +384,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_4-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_4-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.4
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_4-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -619,3 +501,121 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_1-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_1-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_1-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_1-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_1-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.1
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_1-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_1-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_1-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large


@ -0,0 +1,148 @@
name: inductor-perf-nightly-xpu
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-xpu/*
schedule:
- cron: 30 17 * * *
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: false
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used by the benchmark
required: false
type: string
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
xpu-n-py3_10-inductor-benchmark-build:
name: xpu-n-py3.10-inductor-benchmark
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
]}
secrets: inherit
xpu-n-py3_10-inductor-benchmark-test-nightly:
permissions:
id-token: write
contents: read
if: github.event_name != 'workflow_dispatch'
name: xpu-n-py3.10-inductor-benchmark
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests pending further investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit
xpu-n-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
if: github.event_name == 'workflow_dispatch'
name: xpu-n-py3.10-inductor-test
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit


@ -0,0 +1,84 @@
name: periodic-rocm-mi200
on:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi200/*
branches:
- release/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read
target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit


@ -204,37 +204,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
uses: ./.github/workflows/_linux-build.yml


@ -6,6 +6,7 @@ on:
- pull
- trunk
- periodic
- periodic-rocm-mi200
- periodic-rocm-mi300
- inductor
- unstable


@ -59,14 +59,18 @@ jobs:
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
]}
secrets: inherit
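The shard/num_shards pairs above split the one logical "default" config across twelve XPU runners; every runner receives the same matrix entry and selects its slice by shard index. As a rough illustration only (the real partitioning lives in PyTorch's CI scripts and also balances shards by recorded test times), a minimal round-robin split might look like the sketch below, where tests_for_shard is a hypothetical helper:

// Hypothetical sketch of shard-based test partitioning; illustration only,
// not the actual PyTorch CI sharding logic.
#include <cstddef>
#include <string>
#include <vector>

std::vector<std::string> tests_for_shard(
    const std::vector<std::string>& all_tests,
    int shard,        // 1-based, as in the workflow matrix
    int num_shards) {
  std::vector<std::string> mine;
  for (std::size_t i = 0; i < all_tests.size(); ++i) {
    // Round-robin: test i belongs to shard (i mod num_shards) + 1.
    if (static_cast<int>(i % static_cast<std::size_t>(num_shards)) == shard - 1) {
      mine.push_back(all_tests[i]);
    }
  }
  return mine;
}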

.gitignore

@ -143,6 +143,7 @@ scripts/release_notes/*.json
sccache-stats*.json
lint.json
merge_record.json
.github/scripts/nightly_source_matrix.json
# These files get copied over on invoking setup.py
torchgen/packaged/*


@ -374,7 +374,7 @@ cmake_dependent_option(
"Build the lazy Torchscript backend, not compatible with mobile builds" ON
"NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)


@ -1,4 +1,4 @@
![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)
![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)
--------------------------------------------------------------------------------
@ -72,7 +72,7 @@ Elaborating Further:
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)
![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
computation by a huge amount.
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
You get the best of speed and flexibility for your crazy research.
![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)
![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)
### Python First


@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
if(USE_CUDA)
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*")
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
@ -291,6 +291,7 @@ IF(USE_FBGEMM_GENAI)
set(fbgemm_genai_cuh
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
"${FBGEMM_GENAI_SRCS}/"
)
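The widened allow-list regex now admits both the mx8mx8bf16_grouped and the newly added f4f4bf16_grouped kernel families while still skipping everything else. A standalone sketch of what the pattern matches (the file paths are hypothetical examples, not the exact FBGEMM source names):

// Sketch: which kernel paths the allow-list regex admits.
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::regex allow(".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*");
  for (const std::string& path :
       {"cutlass_extensions/mx8mx8bf16_grouped/kernel.cu",
        "cutlass_extensions/f4f4bf16_grouped/kernel.cu",
        "cutlass_extensions/bf16bf16bf16_grouped/kernel.cu"}) {
    std::cout << path << " -> "
              << (std::regex_match(path, allow) ? "built" : "skipped") << '\n';
  }
}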


@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
// size. This allows us to quickly find a free block of the right size.
// We use deque to store per size free list and guard the list with its own
// mutex.
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
free_list_{MAX_SIZE_INDEX};
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
std::deque<std::pair<E, B*>> events_; // event queue paired with block
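Both hunks keep independently locked hot members on their own cache lines so that threads hitting the free lists do not invalidate the line holding the events mutex. A minimal sketch of the false-sharing idea behind alignas(hardware_destructive_interference_size), assuming a toolchain that ships the constant in <new> (C++17); the struct is hypothetical:

// Sketch: aligning two independently-locked members to separate cache
// lines prevents false sharing between the threads that use them.
#include <mutex>
#include <new>

struct AllocatorStateSketch {
  // Threads taking free blocks touch only this cache line...
  alignas(std::hardware_destructive_interference_size) std::mutex free_mutex;
  // ...while threads draining events touch only this one.
  alignas(std::hardware_destructive_interference_size) std::mutex events_mutex;
};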


@ -19,6 +19,13 @@ inline namespace CPU_CAPABILITY {
#error "Big endian is not supported."
#endif
// GCC does not properly optimize bf16 operators
#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)
#define BF16_ARITHMETIC_SUPPORTED() 1
#else
#define BF16_ARITHMETIC_SUPPORTED() 0
#endif
// Unlike the float16_t family of types, bfloat16_t is not available
// when we're not targeting bfloat16 hardware support on some
// platforms (but not Mac, so we have to be careful not to shadow the
@ -352,18 +359,35 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
other, &Vectorized<float>::name); \
}
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
Vectorized frac() const;
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
#ifdef __ARM_FEATURE_BF16
// Flip sign bit
Vectorized<c10::BFloat16> neg() const {
return -values;
return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
}
// Fast reciprocal is fine because we are truncating results
Vectorized<c10::BFloat16> reciprocal() const {
return 1.0f / values;
auto x = vcvtq_low_f32_bf16(values);
auto y = vcvtq_high_f32_bf16(values);
x = vrecpeq_f32(x);
y = vrecpeq_f32(y);
return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
}
// Clearing the sign bit
Vectorized<c10::BFloat16> abs() const {
return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
}
#else
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
#endif
// These functions are optimized on clang-21+
#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
Vectorized<c10::BFloat16> operator==(
const Vectorized<c10::BFloat16>& other) const {
return values == other.values;
@ -394,8 +418,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
return values >= other.values;
}
#else
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
@ -451,7 +473,7 @@ template <>
Vectorized<c10::BFloat16> inline operator+(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x + y;
@ -464,7 +486,7 @@ template <>
Vectorized<c10::BFloat16> inline operator-(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x - y;
@ -477,7 +499,7 @@ template <>
Vectorized<c10::BFloat16> inline operator*(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x * y;
@ -490,7 +512,7 @@ template <>
Vectorized<c10::BFloat16> inline operator/(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x / y;
@ -607,7 +629,7 @@ Vectorized<c10::BFloat16> inline fmadd(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
@ -627,7 +649,7 @@ Vectorized<c10::BFloat16> inline fnmadd(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
@ -643,7 +665,7 @@ Vectorized<c10::BFloat16> inline fmsub(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
@ -659,7 +681,7 @@ Vectorized<c10::BFloat16> inline fnmsub(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#ifdef __ARM_FEATURE_BF16
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
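Each of these NEON paths works on the raw bf16 bit pattern: neg flips the sign bit (XOR with 0x8000, spelled above as the int16 constant -32768), abs clears it with a 0x7FFF mask, and reciprocal widens both halves to f32 for a fast vrecpeq estimate before narrowing back. A scalar sketch of the same two bit tricks on a single bfloat16 value:

// Scalar sketch of the bf16 bit manipulations used by the NEON paths above.
#include <cstdint>

uint16_t bf16_neg(uint16_t bits) {
  // Flip the sign bit (0x8000 == int16 -32768).
  return static_cast<uint16_t>(bits ^ 0x8000);
}

uint16_t bf16_abs(uint16_t bits) {
  // Clear the sign bit.
  return static_cast<uint16_t>(bits & 0x7FFF);
}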


@ -6,9 +6,9 @@ namespace at::vec {
inline namespace CPU_CAPABILITY {
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
// Enable auto-vectorization for GCC-13+ and clang-17+
// Enable auto-vectorization for clang-17+
// GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))
#if defined(__clang__) && (__clang_major__ >= 17)
template <typename from_type, typename to_type>
inline void convertImpl(
@ -21,12 +21,46 @@ inline void convertImpl(
}
}
template <typename to_type>
inline void convertFromBool(
const bool* __restrict src,
to_type* __restrict dst,
int64_t n) {
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
uint64_t len = static_cast<uint64_t>(n);
for (uint64_t i = 0; i < len; i++) {
dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
}
}
template <typename from_type>
inline void convertToBool(
const from_type* __restrict src,
bool* __restrict dst,
int64_t n) {
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
uint64_t len = static_cast<uint64_t>(n);
for (uint64_t i = 0; i < len; i++) {
dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
}
}
#define CONVERT_TEMPLATE(from_type, to_type) \
template <> \
inline void convert(const from_type* src, to_type* dst, int64_t n) { \
return convertImpl<from_type, to_type>(src, dst, n); \
}
#define CONVERT_FROM_BOOL_TEMPLATE(to_type) \
inline void convert(const bool* src, to_type* dst, int64_t n) { \
return convertFromBool<to_type>(src, dst, n); \
}
#define CONVERT_TO_BOOL_TEMPLATE(from_type) \
inline void convert(const from_type* src, bool* dst, int64_t n) { \
return convertToBool<from_type>(src, dst, n); \
}
CONVERT_TEMPLATE(uint8_t, uint8_t)
CONVERT_TEMPLATE(uint8_t, int8_t)
CONVERT_TEMPLATE(uint8_t, int16_t)
@ -34,6 +68,7 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
CONVERT_TEMPLATE(uint8_t, int64_t)
CONVERT_TEMPLATE(uint8_t, float)
CONVERT_TEMPLATE(uint8_t, double)
CONVERT_TO_BOOL_TEMPLATE(uint8_t)
CONVERT_TEMPLATE(int8_t, uint8_t)
CONVERT_TEMPLATE(int8_t, int8_t)
CONVERT_TEMPLATE(int8_t, int16_t)
@ -41,6 +76,7 @@ CONVERT_TEMPLATE(int8_t, int32_t)
CONVERT_TEMPLATE(int8_t, int64_t)
CONVERT_TEMPLATE(int8_t, float)
CONVERT_TEMPLATE(int8_t, double)
CONVERT_TO_BOOL_TEMPLATE(int8_t)
CONVERT_TEMPLATE(int16_t, uint8_t)
CONVERT_TEMPLATE(int16_t, int8_t)
CONVERT_TEMPLATE(int16_t, int16_t)
@ -48,6 +84,7 @@ CONVERT_TEMPLATE(int16_t, int32_t)
CONVERT_TEMPLATE(int16_t, int64_t)
CONVERT_TEMPLATE(int16_t, float)
CONVERT_TEMPLATE(int16_t, double)
CONVERT_TO_BOOL_TEMPLATE(int16_t)
CONVERT_TEMPLATE(int32_t, uint8_t)
CONVERT_TEMPLATE(int32_t, int8_t)
CONVERT_TEMPLATE(int32_t, int16_t)
@ -55,6 +92,7 @@ CONVERT_TEMPLATE(int32_t, int32_t)
CONVERT_TEMPLATE(int32_t, int64_t)
CONVERT_TEMPLATE(int32_t, float)
CONVERT_TEMPLATE(int32_t, double)
CONVERT_TO_BOOL_TEMPLATE(int32_t)
CONVERT_TEMPLATE(int64_t, uint8_t)
CONVERT_TEMPLATE(int64_t, int8_t)
CONVERT_TEMPLATE(int64_t, int16_t)
@ -62,6 +100,7 @@ CONVERT_TEMPLATE(int64_t, int32_t)
CONVERT_TEMPLATE(int64_t, int64_t)
CONVERT_TEMPLATE(int64_t, float)
CONVERT_TEMPLATE(int64_t, double)
CONVERT_TO_BOOL_TEMPLATE(int64_t)
CONVERT_TEMPLATE(float, uint8_t)
CONVERT_TEMPLATE(float, int8_t)
CONVERT_TEMPLATE(float, int16_t)
@ -69,6 +108,7 @@ CONVERT_TEMPLATE(float, int32_t)
CONVERT_TEMPLATE(float, int64_t)
CONVERT_TEMPLATE(float, float)
CONVERT_TEMPLATE(float, double)
CONVERT_TO_BOOL_TEMPLATE(float)
CONVERT_TEMPLATE(double, uint8_t)
CONVERT_TEMPLATE(double, int8_t)
CONVERT_TEMPLATE(double, int16_t)
@ -76,6 +116,14 @@ CONVERT_TEMPLATE(double, int32_t)
CONVERT_TEMPLATE(double, int64_t)
CONVERT_TEMPLATE(double, float)
CONVERT_TEMPLATE(double, double)
CONVERT_TO_BOOL_TEMPLATE(double)
CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
CONVERT_FROM_BOOL_TEMPLATE(int8_t)
CONVERT_FROM_BOOL_TEMPLATE(int16_t)
CONVERT_FROM_BOOL_TEMPLATE(int32_t)
CONVERT_FROM_BOOL_TEMPLATE(int64_t)
CONVERT_FROM_BOOL_TEMPLATE(float)
CONVERT_FROM_BOOL_TEMPLATE(double)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define CONVERT_FROM_FP16_TEMPLATE(to_type) \
@ -107,6 +155,41 @@ CONVERT_TO_FP16_TEMPLATE(int32_t)
CONVERT_TO_FP16_TEMPLATE(int64_t)
CONVERT_TO_FP16_TEMPLATE(float)
CONVERT_TO_FP16_TEMPLATE(double)
inline void convertBoolToFp16Impl(
const bool* __restrict src,
at::Half* __restrict dst,
int64_t n) {
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
uint64_t len = static_cast<uint64_t>(n);
for (uint64_t i = 0; i < len; i++) {
dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
}
}
template <>
inline void convert(const bool* src, at::Half* dst, int64_t n) {
return convertBoolToFp16Impl(src, dst, n);
}
inline void convertFp16ToBoolImpl(
const at::Half* __restrict src,
bool* __restrict dst,
int64_t n) {
const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
uint64_t len = static_cast<uint64_t>(n);
for (uint64_t i = 0; i < len; i++) {
dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
}
}
template <>
inline void convert(const at::Half* src, bool* dst, int64_t n) {
return convertFp16ToBoolImpl(src, dst, n);
}
#endif
#ifdef __ARM_FEATURE_BF16
CONVERT_TEMPLATE(bfloat16_t, uint8_t)
@ -124,6 +207,44 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
CONVERT_TEMPLATE(int64_t, bfloat16_t)
CONVERT_TEMPLATE(float, bfloat16_t)
CONVERT_TEMPLATE(double, bfloat16_t)
inline void convertBoolToBfloat16Impl(
const bool* __restrict src,
c10::BFloat16* __restrict dst,
int64_t n) {
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
uint64_t len = static_cast<uint64_t>(n);
constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
for (uint64_t i = 0; i < len; i++) {
dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
}
}
template <>
inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
return convertBoolToBfloat16Impl(src, dst, n);
}
inline void convertBfloat16ToBoolImpl(
const c10::BFloat16* __restrict src,
bool* __restrict dst,
int64_t n) {
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
uint64_t len = static_cast<uint64_t>(n);
for (uint64_t i = 0; i < len; i++) {
// Check if all non-sign bits are 0
bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
dstPtr[i] = isBf16Zero ? 0 : 1;
}
}
template <>
inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
return convertBfloat16ToBoolImpl(src, dst, n);
}
#endif
#endif
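The constants here fall straight out of the bfloat16 layout (1 sign bit, 8 exponent bits, 7 mantissa bits): 1.0 encodes as 0x3f80, and a value is zero exactly when every bit below the sign is clear, which is why (srcPtr[i] & 0x7fff) == 0 treats both +0.0 and -0.0 as false. A small self-check sketch:

// Sketch: the bfloat16 bit patterns the bool conversions above rely on.
#include <cassert>
#include <cstdint>
#include <cstring>

uint16_t bf16_from_float(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return static_cast<uint16_t>(u >> 16);  // truncating float -> bf16
}

int main() {
  assert(bf16_from_float(1.0f) == 0x3f80);         // kBf16One
  assert((bf16_from_float(0.0f) & 0x7fff) == 0);   // +0.0 is false
  assert((bf16_from_float(-0.0f) & 0x7fff) == 0);  // -0.0 is false too
}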


@ -309,7 +309,7 @@ class Vectorized<float> {
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
// Implementation copied from Arm Optimized Routine
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
Vectorized<float> exp_u20() const {
inline Vectorized<float> vexpq_f32_u20() const {
// bail out to sleef if it's a special case:
// i.e. there's an input s.t. |input| > 87.3....
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
@ -348,6 +348,9 @@ class Vectorized<float> {
return vfmaq_f32(scale, poly, scale);
}
Vectorized<float> exp_u20() const {
return vexpq_f32_u20();
}
Vectorized<float> fexp_u20() const {
return exp_u20();
}
@ -634,7 +637,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
// - exp(- x * x)
auto pow_2 = (*this) * (*this);
auto neg_pow_2 = pow_2 ^ neg_zero_vec;
auto tmp4 = neg_pow_2.exp();
auto tmp4 = neg_pow_2.vexpq_f32_u20();
auto tmp5 = tmp4 ^ neg_zero_vec;
// erf(x) = sign(x) * (1 - r * t * exp(- x * x))
auto tmp6 = t * tmp5;

View File

@ -2,10 +2,10 @@
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Exception.h>
#include <cuda_runtime_api.h>
@ -246,4 +246,79 @@ private:
}
};
// EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate
// calls. cudaEventCreate when concurrently invoked from multiple threads can be
// very expensive (especially on certain device/driver combinations).
using CUDAEventPtr =
std::unique_ptr<CUDAEvent, std::function<void(CUDAEvent*)>>;
class EventPool {
public:
EventPool() : pools_(at::cuda::device_count()) {}
CUDAEventPtr get(const DeviceIndex device) {
// If the device is invalid, return a default event and no pooling
if (device < 0 || device >= (DeviceIndex)pools_.size()) {
auto deleter = [](CUDAEvent* event) {
delete event;
};
return CUDAEventPtr(
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), deleter);
}
auto& pool = pools_[device];
// Create a destructor that returns the event to the appropriate device pool
auto destructor = [&pool](CUDAEvent* event) noexcept {
if (event != nullptr) {
std::lock_guard<std::mutex> lock(pool.mutex_);
pool.event_pool_.emplace_back(event);
}
};
{
std::lock_guard<std::mutex> lock(pool.mutex_);
if (!pool.event_pool_.empty()) {
auto event = std::move(pool.event_pool_.back());
pool.event_pool_.pop_back();
return CUDAEventPtr(event.release(), destructor);
}
}
return CUDAEventPtr(
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(),
destructor);
}
void empty_cache() {
for (auto& pool : pools_) {
std::lock_guard<std::mutex> lock(pool.mutex_);
pool.event_pool_.clear();
}
}
void init_num_events(const size_t num_events) {
for (DeviceIndex device_idx = 0; device_idx < at::cuda::device_count(); ++device_idx) {
CUDAGuard device_guard(device_idx);
std::vector<CUDAEventPtr> temp_events;
temp_events.reserve(num_events);
for (size_t i = 0; i < num_events; ++i) {
auto event = get(device_idx);
// Record the event to ensure it's properly initialized
event->record();
temp_events.emplace_back(std::move(event));
}
// Events will be returned to pool when temp_events is destroyed
}
}
private:
struct alignas(64) PerDevicePool {
alignas(64) std::mutex mutex_;
std::vector<std::unique_ptr<CUDAEvent>> event_pool_;
};
std::vector<PerDevicePool> pools_;
};
} // namespace at::cuda
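A minimal usage sketch for the pool above; the names below are illustrative, and the real consumer added in this stack is the getEventFromPool() helper in the CUDA copy path further down:

// Sketch only: borrow a pooled event instead of calling cudaEventCreate on the
// hot path; the custom deleter returns it to the per-device pool on destruction.
#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>

void synchronize_streams_example(at::cuda::EventPool& pool,
                                 c10::DeviceIndex device,
                                 c10::cuda::CUDAStream producer,
                                 c10::cuda::CUDAStream consumer) {
  at::cuda::CUDAEventPtr ev = pool.get(device);
  ev->record(producer);  // mark completion of the work queued on `producer`
  ev->block(consumer);   // make `consumer` wait for that work
}  // `ev` goes out of scope here and is recycled into the pool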

View File

@ -1,78 +1,90 @@
#include <ATen/cuda/CUDAGreenContext.h>
namespace at::cuda {
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
#if CUDA_HAS_GREEN_CONTEXT
int driver_version;
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
TORCH_CHECK(
driver_version >= 12080, "cuda driver too old to use green context!");
CUcontext pctx = nullptr;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
if (C10_UNLIKELY(!pctx)) {
TORCH_WARN(
"Attempted to create a green context but"
" there was no primary context! Creating a primary context...");
cudaFree(0);
}
CUdevice device;
device_id_ = device_id;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
// Get device resources
CUdevResource device_resource;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
// Split resources
std::vector<CUdevResource> result(1);
auto result_data = result.data();
unsigned int nb_groups = 1;
CUdevResource remaining;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
result_data,
&nb_groups,
&device_resource,
&remaining,
0, // default flags
num_sms));
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
// Generate resource descriptor
CUdevResourceDesc desc;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
&desc, result_data, 1));
// Create green context
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
// Convert to regular context
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#include <stdexcept>
#include <vector>
#define HAS_CUDA_GREEN_CONTEXT() 1
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#define HAS_CUDA_GREEN_CONTEXT() 0
// Suppress unused private field warnings as this class is not supposed to be used
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
#endif
namespace at::cuda {
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
#if HAS_CUDA_GREEN_CONTEXT()
int driver_version;
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
TORCH_CHECK(
driver_version >= 12080, "cuda driver too old to use green context!");
CUcontext pctx = nullptr;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
if (C10_UNLIKELY(!pctx)) {
TORCH_WARN(
"Attempted to create a green context but"
" there was no primary context! Creating a primary context...");
cudaFree(0);
}
CUdevice device;
device_id_ = device_id;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
// Get device resources
CUdevResource device_resource;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
// Split resources
std::vector<CUdevResource> result(1);
auto result_data = result.data();
unsigned int nb_groups = 1;
CUdevResource remaining;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
result_data,
&nb_groups,
&device_resource,
&remaining,
0, // default flags
num_sms));
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
// Generate resource descriptor
CUdevResourceDesc desc;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
&desc, result_data, 1));
// Create green context
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
// Convert to regular context
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
std::unique_ptr<GreenContext> GreenContext::create(
uint32_t num_sms,
std::optional<uint32_t> device_id) {
#if CUDA_HAS_GREEN_CONTEXT
#if HAS_CUDA_GREEN_CONTEXT()
if (!device_id.has_value()) {
device_id = at::cuda::current_device();
}
return std::make_unique<GreenContext>(device_id.value(), num_sms);
return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
@ -80,7 +92,7 @@ namespace at::cuda {
// Implement move operations
GreenContext::GreenContext(GreenContext&& other) noexcept{
#if CUDA_HAS_GREEN_CONTEXT
#if HAS_CUDA_GREEN_CONTEXT()
device_id_ = std::exchange(other.device_id_, -1);
green_ctx_ = std::exchange(other.green_ctx_, nullptr);
context_ = std::exchange(other.context_, nullptr);
@ -91,7 +103,7 @@ namespace at::cuda {
}
GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
#if CUDA_HAS_GREEN_CONTEXT
#if HAS_CUDA_GREEN_CONTEXT()
if (this != &other) {
// Clean up current resources
if (green_ctx_) {
@ -120,7 +132,7 @@ namespace at::cuda {
}
GreenContext::~GreenContext() noexcept{
#if CUDA_HAS_GREEN_CONTEXT
#if HAS_CUDA_GREEN_CONTEXT()
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
#else
@ -128,25 +140,9 @@ namespace at::cuda {
#endif
}
// Get the underlying CUDA context
CUcontext GreenContext::getContext() const {
#if CUDA_HAS_GREEN_CONTEXT
return context_;
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
// Get the underlying green context
#if CUDA_HAS_GREEN_CONTEXT
CUgreenCtx GreenContext::getGreenContext() const {
return green_ctx_;
}
#endif
// Make this context current
void GreenContext::setContext() {
#if CUDA_HAS_GREEN_CONTEXT
#if HAS_CUDA_GREEN_CONTEXT()
auto current_stream = c10::cuda::getCurrentCUDAStream();
parent_stream_ = current_stream.stream();
@ -175,7 +171,7 @@ namespace at::cuda {
}
void GreenContext::popContext() {
#if CUDA_HAS_GREEN_CONTEXT
#if HAS_CUDA_GREEN_CONTEXT()
// see above note about stream being hardcoded to the default stream
at::cuda::CUDAEvent ev;
ev.record(c10::cuda::getCurrentCUDAStream());

View File

@ -1,53 +1,38 @@
#pragma once
#include <ATen/cuda/CUDAEvent.h>
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#include <cuda.h>
#include <memory>
#include <stdexcept>
#include <vector>
#define CUDA_HAS_GREEN_CONTEXT 1
#else
#define CUDA_HAS_GREEN_CONTEXT 0
#endif
// Forward declare green context as opaque ptr
typedef struct CUgreenCtx_st* CUgreenCtx;
namespace at::cuda {
class TORCH_CUDA_CPP_API GreenContext {
public:
GreenContext(uint32_t device_id, uint32_t num_sms);
static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
// Green context creation
static std::unique_ptr<GreenContext> create(
uint32_t num_sms,
std::optional<uint32_t> device_id);
~GreenContext() noexcept;
// Delete copy constructor and assignment
GreenContext(const GreenContext&) = delete;
GreenContext& operator=(const GreenContext&) = delete;
// Implement move operations
GreenContext(GreenContext&& other) noexcept;
GreenContext& operator=(GreenContext&& other) noexcept;
~GreenContext() noexcept;
// Get the underlying CUDA context
CUcontext getContext() const;
// Get the underlying green context
#if CUDA_HAS_GREEN_CONTEXT
CUgreenCtx getGreenContext() const;
#endif
// Make this context current
void setContext();
void popContext();
private:
#if CUDA_HAS_GREEN_CONTEXT
GreenContext(uint32_t device_id, uint32_t num_sms);
// Implement move operations
GreenContext(GreenContext&& other) noexcept;
GreenContext& operator=(GreenContext&& other) noexcept;
int32_t device_id_ = -1;
CUgreenCtx green_ctx_ = nullptr;
CUcontext context_ = nullptr;
cudaStream_t parent_stream_ = nullptr;
#endif
};
} // namespace at::cuda
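A rough usage sketch for the reworked interface above; the SM count is illustrative, and create() is the only public construction path now that the constructor and move operations are private:

// Sketch only: assumes a CUDA 12.8+ driver where HAS_CUDA_GREEN_CONTEXT() is 1.
#include <ATen/cuda/CUDAGreenContext.h>
#include <optional>

void run_on_sm_partition() {
  // Carve a green context limited to 16 SMs out of the current device.
  auto gctx = at::cuda::GreenContext::create(/*num_sms=*/16,
                                             /*device_id=*/std::nullopt);
  gctx->setContext();  // subsequent launches are confined to the partition
  // ... launch the kernels that should run on the restricted SM set ...
  gctx->popContext();  // restore the previously current context
}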

View File

@ -7,17 +7,6 @@
#endif
#if defined(USE_ROCM)
// hipSparse const API added in v2.4.0
#if HIPSPARSE_VERSION >= 200400
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#else
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#endif
#else // USE_ROCM
#define AT_USE_HIPSPARSE_GENERIC_API() 0
#endif // USE_ROCM
// cuSparse Generic API spsv function was added in CUDA 11.3.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1

View File

@ -1,6 +1,7 @@
#include <ATen/cuda/CUDAContextLight.h>
#include <ATen/cuda/Sleep.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAStream.h>
@ -24,8 +25,22 @@ __global__ void spin_kernel(int64_t cycles) {
#endif
}
}
thread_local int *flag = nullptr;
__global__ void busy_wait_for_flag_kernel(int *flag) {
atomicExch(flag, 1);
while (atomicAdd(flag, 0) == 1) {
// do nothing
}
}
__global__ void clear_flag_kernel(int *flag) {
atomicExch(flag, 0);
}
} // anonymous namespace
void sleep(int64_t cycles) {
dim3 grid(1);
dim3 block(1);
@ -33,6 +48,26 @@ void sleep(int64_t cycles) {
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
void busy_wait_for_flag() {
if (!flag) {
flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
}
dim3 grid(1);
dim3 block(1);
busy_wait_for_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
void clear_flag() {
if (!flag) {
flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
}
dim3 grid(1);
dim3 block(1);
clear_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
#ifdef USE_ROCM
__global__ void flush_icache_kernel()
{

View File

@ -7,6 +7,11 @@ namespace at::cuda {
// enqueues a kernel that spins for the specified number of cycles
TORCH_CUDA_CU_API void sleep(int64_t cycles);
// enqueues a kernel that spins until a flag is cleared by a
// corresponding call to clear_flag()
TORCH_CUDA_CU_API void busy_wait_for_flag();
TORCH_CUDA_CU_API void clear_flag();
// flushes instruction cache for ROCm; no-op for CUDA
TORCH_CUDA_CU_API void flush_icache();
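A sketch of how the new flag-based spin pair might be driven from a test; the stream handling below is illustrative, and the caller is responsible for making sure the spinning kernel actually starts before the clearing kernel runs:

// Sketch only: keep one stream busy until another stream releases it.
#include <ATen/cuda/Sleep.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void hold_stream_example() {
  auto side = c10::cuda::getStreamFromPool();
  {
    c10::cuda::CUDAStreamGuard guard(side);
    at::cuda::busy_wait_for_flag();  // spins on `side` until the flag is cleared
  }
  // ... enqueue the work that must stay queued behind the spin on `side` ...
  at::cuda::clear_flag();            // launched on the current stream, releases the spin
}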

View File

@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
filename.append(device);
}
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
}
return untuned_file_;
}

View File

@ -1,5 +1,6 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/Device.h>
#include <c10/util/Exception.h>
@ -151,6 +152,36 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
}
virtual bool isAvailable() const override;
/* MTIAGraph related APIs */
virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
FAIL_MTIAHOOKS_FUNC(__func__);
return -1;
}
virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphCaptureEnd(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphInstantiate(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphReplay(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphReset(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual MempoolId_t mtiagraphPool(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
};
struct TORCH_API MTIAHooksArgs {};

View File

@ -534,20 +534,20 @@ Tensor trace_decomp(const Tensor& tensor) {
std::tuple<Tensor, std::optional<int64_t>> tril_batch_rule(
const Tensor& self,
std::optional<int64_t> self_bdim,
int64_t diagonal = 0) {
c10::SymInt diagonal = 0) {
TORCH_CHECK(self.dim() >= 2, "tril: The input tensor must have at least 2 dimensions.");
auto self_ = moveBatchDimToFront(self, self_bdim);
auto result = at::tril(self_, diagonal);
auto result = at::tril_symint(self_, std::move(diagonal));
return std::make_tuple(std::move(result), 0);
}
std::tuple<Tensor, std::optional<int64_t>> triu_batch_rule(
const Tensor& self,
std::optional<int64_t> self_bdim,
int64_t diagonal = 0) {
c10::SymInt diagonal = 0) {
TORCH_CHECK(self.dim() >= 2, "triu: The input tensor must have at least 2 dimensions.");
auto self_ = moveBatchDimToFront(self, self_bdim);
auto result = at::triu(self_, diagonal);
auto result = at::triu_symint(self_, std::move(diagonal));
return std::make_tuple(std::move(result), 0);
}

View File

@ -410,8 +410,8 @@ struct ConvParams {
return false;
}
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
// broken on cuDNN 9.8
if (cudnn_version >= 90800) {
// broken on cuDNN 9.8 - 9.14
if (cudnn_version >= 90800 && cudnn_version < 91500) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
weight.dim() == 5) {
@ -689,6 +689,10 @@ static void check_shape_forward(const at::Tensor& input,
", but got bias of size ", at::symint::sizes<T>(bias), " instead");
for (const auto i : c10::irange(2, k)) {
// T could be int64_t or SymInt, Specialized numeric_limts<SymInt> in c10/core/SymInt.h
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
(std::numeric_limits<T>::max() / 2));
input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
// log new kernel size considering dilation
kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
@ -715,6 +719,11 @@ static void check_shape_forward(const at::Tensor& input,
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
}
} else { // transposed
for (const auto i : c10::irange(2, k)) {
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
(std::numeric_limits<T>::max() / 2));
}
TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
"Given transposed=", transposed, ", weight of size ", weight_sizes,
", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],

View File

@ -52,8 +52,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
for (const auto k : c10::irange(kw)) {
int iShift = std::max(0, static_cast<int>(k - real_pad));
int oShift = std::max(0, static_cast<int>(real_pad - k));
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
int t = std::min(ilen + real_pad - k, olen) - oShift;
long t = std::min(ilen + real_pad - k, olen) - oShift;
// Note: gemm assumes column-major matrices
// input is l*m (row-major)
// weight is m*r (row-major)

View File

@ -16,8 +16,7 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
auto linearId = elements - 1;
// NOTE: Assumes all strides are positive, which is true for now
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
for (int i = t.dim() - 1; i >= 0; --i) {
for (auto i = t.dim() - 1; i >= 0; --i) {
auto curDimIndex = linearId % t.sym_size(i);
auto curDimOffset = curDimIndex * t.sym_stride(i);
offset += curDimOffset;

View File

@ -68,7 +68,6 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
const float* input_ptr = input_contig.const_data_ptr<float>();
TORCH_CHECK(input.dim() >= 2);
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
const int64_t K = input.size(input.dim() - 1);
TORCH_CHECK(weight.dim() == 2);

View File

@ -160,10 +160,9 @@ struct Dist {
// value of k.
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
const Vec pvec(p);
double n2 = n - .5;
double n2 = static_cast<double>(n) - .5;
// The -1 accounts for floating point truncation issues
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
const scalar_t * self_i = self_start + i * m;
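For context on the closed form above (a sketch, not part of this diff): k enumerates the pairs (i, j) with i < j row by row, so row i starts at k = i*n2 - i*i/2 with n2 = n - 0.5; inverting that quadratic gives the sqrt expression, and j is then recovered from the remainder. A small standalone check of the mapping:

// Sketch only: verify the condensed-pair index inversion for a small n.
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int64_t n = 7;
  int64_t k = 0;
  for (int64_t i = 0; i < n; ++i) {
    for (int64_t j = i + 1; j < n; ++j, ++k) {
      const double n2 = static_cast<double>(n) - 0.5;
      const int64_t ii =
          static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2.0 * k - 1.0));
      const int64_t jj = k - n * ii + ii * (ii + 1) / 2 + ii + 1;
      assert(ii == i && jj == j);
    }
  }
  return 0;
}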

View File

@ -139,7 +139,7 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou
}
);
} else {
AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
AT_DISPATCH_ALL_TYPES_AND(kHalf, dtype, "smooth_l1_backward_cpu_out", [&] {
auto norm_val = norm.to<scalar_t>();
scalar_t beta_val(beta);
auto norm_val_vec = Vectorized<scalar_t>(norm_val);

File diff suppressed because it is too large

View File

@ -1,11 +1,11 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/cuda/CachingHostAllocator.h>
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/CachingHostAllocator.h>
#include <ATen/cuda/PeerToPeerAccess.h>
#include <ATen/native/Copy.h>
#include <ATen/native/TensorIterator.h>
@ -27,6 +27,24 @@
namespace at::native {
namespace {
// Initial pool size for CUDA events per device.
constexpr size_t kInitialEventPoolSize = 8;
at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) {
static auto* event_pool = []() {
auto* pool = new at::cuda::EventPool();
// Pre-populate the pool with events to avoid stalls in creating events
pool->init_num_events(kInitialEventPoolSize);
return pool;
}();
return event_pool->get(device_idx);
}
} // namespace
void neg_kernel_cuda(TensorIteratorBase &iter);
void conj_kernel_cuda(TensorIteratorBase &iter);
@ -263,12 +281,14 @@ void copy_device_to_device(TensorIterator& iter,
// write-after-read dependencies on the destination side are handled, so
// that no one is operating on the dst memory when we perform the copy.
// src waits on dst barrier (src already waits on src)
CUDAEvent dst_ready;
// Use event pool for better performance instead of creating new events
auto dst_ready = getEventFromPool(dst_device.index());
device_guard.set_device(dst_device);
dst_ready.record(getCurrentCUDAStream(dst_device.index()));
dst_ready->record(getCurrentCUDAStream(dst_device.index()));
device_guard.set_device(src_device);
dst_ready.block(copy_stream);
dst_ready->block(copy_stream);
}
if (memcpy_eligible) {
@ -307,11 +327,11 @@ void copy_device_to_device(TensorIterator& iter,
// operate on dst's copy until the copy is complete.
// Still on src_device, record stream event
CUDAEvent src_ready;
src_ready.record(copy_stream);
auto src_ready = getEventFromPool(src_device.index());
src_ready->record(copy_stream);
device_guard.set_device(dst_device);
src_ready.block(getCurrentCUDAStream(dst_device.index()));
src_ready->block(getCurrentCUDAStream(dst_device.index()));
}
AT_CUDA_CHECK(cudaGetLastError());

View File

@ -208,6 +208,62 @@ _f8_f8_bf16_rowwise_grouped_mm(
#endif
}
Tensor&
_f4_f4_bf16_grouped_mm_fbgemm(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const std::optional<Tensor>& global_scale_a,
const Tensor& scale_b,
const std::optional<Tensor>& global_scale_b,
const std::optional<Tensor>& offs,
const std::optional<Tensor>& bias,
Tensor& out) {
#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
// Typing checks
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
"mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());
TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
"mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());
std::optional<Tensor> combined_global_scale = std::nullopt;
if (global_scale_a.has_value() || global_scale_b.has_value()) {
// NVFP4
TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
"For NVFP4 grouped gemm both of global_scale_{a,b} must have values")
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
"scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
"scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
"global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
"global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
} else {
// MXFP4
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
"scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
"scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
}
auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
mat_a,
mat_b,
scale_a,
scale_b,
offs.value(),
out,
combined_global_scale
);
#else
TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
#endif
return out;
}
void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
// Checks scales for 2d or 3d target tensors (`mat`).
if (mat.dim() == 2) {
@ -245,7 +301,15 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
}
}
void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
// if {mx,nv}fp4, will need to modify K later
bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
int blocksize = 32;
// check for nvfp4 vs. mxfp4 to fix blocksize
if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
blocksize = 16;
}
// Checks scales for 2d or 3d target tensors (`mat`).
if (mat.dim() == 2) {
// For MXFP8, 2d tensors have variable size groups represented as subtensors,
@ -253,17 +317,19 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
// so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
TORCH_CHECK(
scale.dim() == mat.dim(),
"for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
"for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
" and scale.dim() = ", scale.dim(), " for arg ", arg_idx
);
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
// * weight is transposed prior to the call, scale stays non-transposed.
bool LHS = arg_idx == 0;
int scale_dim_to_check = 0;
int mat_dim_to_check = LHS ? 0 : 1;
TORCH_CHECK(
scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
"for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
"for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
"must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
} else {
// For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
@ -273,32 +339,40 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
};
// TODO: this is for 3d tensor in 2d-3d case specifically.
// We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
// We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
int64_t G = mat.size(0);
int64_t K = mat.size(1);
if (is_fp4) {
// FP4 packs 2 values into a single 8b word - the "real" K is 2x the
// reported K. Reverse that adjustment.
const int fp4_elems_per_byte = 2;
K *= fp4_elems_per_byte;
}
int64_t N = mat.size(2);
int64_t blocked_scale_K = round_up(K/32, 4);
int64_t blocked_scale_K = round_up(K/blocksize, 4);
int64_t blocked_scale_N = round_up(N, 128);
// fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
TORCH_CHECK(
scale.dim() == mat.dim() - 1,
"for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
"for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
"but scale is ", scale.dim(), "D for arg ", arg_idx
);
TORCH_CHECK(
scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
"for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
"for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
" for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
);
}
}
void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
bool using_fp8_rowwise = scale.scalar_type() == kFloat;
bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
if (using_fp8_rowwise) {
_check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
} else if (using_mxfp8) {
_check_scales_mxfp8(mat, scale, dim, arg_idx);
} else if (using_mx) {
_check_scales_blocked(mat, scale, dim, arg_idx);
} else {
TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
}
@ -411,9 +485,11 @@ namespace {
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
{ "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
{ "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
{ "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};
} // anonymous namespace
@ -525,8 +601,9 @@ _scaled_grouped_mm_cuda_v2(
out);
}
case ScaledGemmImplementation::MXFP8_MXFP8: {
_check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
_check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
// scale shape checks
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
return _mx8_mx8_bf16_grouped_mm_fbgemm(
mat_a,
mat_b,
@ -537,6 +614,36 @@ _scaled_grouped_mm_cuda_v2(
offs.value(),
out);
}
case ScaledGemmImplementation::MXFP4_MXFP4: {
// scale shape checks
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
return _f4_f4_bf16_grouped_mm_fbgemm(
mat_a,
mat_b,
scale_a[0], /* block-scale A */
std::nullopt, /* global-scale A */
scale_b[0], /* block-scale B */
std::nullopt, /* global-scale B */
offs.value(),
std::nullopt, /* bias */
out);
}
case ScaledGemmImplementation::NVFP4_NVFP4: {
// scale shape checks
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
return _f4_f4_bf16_grouped_mm_fbgemm(
mat_a,
mat_b,
scale_a[0], /* block-scale A */
scale_a[1], /* global-scale A */
scale_b[0], /* block-scale B */
scale_b[1], /* global-scale B */
offs.value(),
std::nullopt, /* bias */
out);
}
default:
TORCH_CHECK_NOT_IMPLEMENTED(false,
"_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");

View File

@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
if (allow_neg_indices) {
ind = (ind < 0) ? ind + ind_dim_size : ind;
}
CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
if (off >= slice_size) return;
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);

File diff suppressed because it is too large

View File

@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
auto offsets = offset_calc.get(i);
int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
&& "scatter gather kernel index out of bounds");
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
&& "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
f(
(scalar_t*)(self_ptr + offsets[0]),
@ -406,9 +406,8 @@ struct _cuda_scatter_fill_internal_kernel {
auto offsets = offset_calc.get(i);
int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
&& "index out of bounds"
);
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
&& "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
f(
(scalar_t*)(self_ptr + offsets[0]),

View File

@ -12,14 +12,15 @@
namespace at::native {
#if AT_USE_JITERATOR()
#if 0 && AT_USE_JITERATOR()
constexpr char tan_name[] = "tan_impl";
#endif
void tan_kernel_cuda(TensorIteratorBase& iter) {
auto common_dtype = iter.common_dtype();
if (at::isComplexType(common_dtype)) {
#if AT_USE_JITERATOR()
// Disabled due to accuracy issues
#if 0 && AT_USE_JITERATOR()
static const auto tan_string = jiterator_stringify(
template <typename T> T tan_impl(T a) { return std::tan(a); });
AT_DISPATCH_COMPLEX_TYPES_AND(

View File

@ -12,14 +12,15 @@
namespace at::native {
#if AT_USE_JITERATOR()
#if 0 && AT_USE_JITERATOR()
constexpr char tanh_name[] = "tanh_impl";
#endif
void tanh_kernel_cuda(TensorIteratorBase& iter) {
auto common_dtype = iter.common_dtype();
if (at::isComplexType(common_dtype)) {
#if AT_USE_JITERATOR()
// Disabled due to accuracy issues
#if 0 && AT_USE_JITERATOR()
static const auto tanh_string = jiterator_stringify(
template <typename T> T tanh_impl(T a) { return std::tanh(a); });
AT_DISPATCH_COMPLEX_TYPES_AND(

View File

@ -0,0 +1,171 @@
#pragma once
#include <ATen/core/Tensor.h>
namespace at::native {
using at::blas::ScalingType;
using at::blas::SwizzleType;
namespace {
// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
if (resolve_conj && tensor.is_conj()) {
return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
} else {
return c10::MaybeOwned<Tensor>::borrowed(tensor);
}
}
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
if (tensor.is_non_overlapping_and_dense()) { // common case
transpose_tensor = tensor.is_contiguous();
return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
}
IntArrayRef tensor_strides = tensor.strides();
IntArrayRef tensor_sizes = tensor.sizes();
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
transpose_tensor = false;
return resolve_conj_if_indicated(tensor, !transpose_result);
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
transpose_tensor = true;
return resolve_conj_if_indicated(tensor, transpose_result);
} else {
transpose_tensor = true;
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
}
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
if (tensor.is_non_overlapping_and_dense()) { // common case
transpose_tensor = tensor.is_contiguous();
return resolve_conj_if_indicated(tensor, true);
}
IntArrayRef tensor_strides = tensor.strides();
IntArrayRef tensor_sizes = tensor.sizes();
if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
transpose_tensor = false;
return resolve_conj_if_indicated(tensor, true);
} else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
transpose_tensor = true;
return resolve_conj_if_indicated(tensor, true);
} else {
transpose_tensor = true;
return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
}
} // namespace
/**
* @brief Prepares matrices for CUBLAS operation
*
* This constructor prepares tensors for CUBLAS
* The main difference is that PyTorch uses row-major as the default and
* CUBLAS expects column-major.
*
* @details
* To enable row-major output while using CUBLAS,
* we use the mathematical identity that (A × B)^T = B^T × A^T.
*
* Transpose in this context refers to cuBLAS's (Fortran) definition of transpose (row-major):
* T = row-major, N = col-major
*
* Example:
* For matrices A (M×K)(row-major) and B (K×N)(row-major):
* - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
* - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
* - However, since the output from cuBLAS is column-major, this is
* - equivalent to an output of size MxN row-major, as expected
*
* The transpose flags are derived from the layouts of the passed in tensors
*
* If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
* to their unpacked values to match what cuBLAS expects.
*
* @param mat1 First input matrix
* @param mat2 Second input matrix
* @param c Output matrix (result)
* @param scale_a Optional scaling factor for first matrix
* @param scale_b Optional scaling factor for second matrix
* @param scale_result Optional scaling factor for result
*/
struct cublasCommonArgs {
cublasCommonArgs(
const Tensor& mat1,
const Tensor& mat2,
Tensor& c,
const std::optional<Tensor>& scale_a = std::nullopt,
const std::optional<Tensor>& scale_b = std::nullopt,
const std::optional<Tensor>& scale_result = std::nullopt,
const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
}
if (scale_result) {
scale_result_ptr = scale_result->data_ptr();
scale_result_dtype = scale_result->scalar_type();
}
// Update transpose flags
if (transpose_result) {
transpose_a = !transpose_a;
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
k = k * 2;
lda = lda * 2;
ldb = ldb * 2;
}
}
// Matrix members
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace at::native
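A short worked example of the transpose trick described in the comment above, for plain contiguous row-major inputs (a sketch with hypothetical sizes; the variable names are local to the sketch rather than fields of the struct):

// Sketch only: the column-major trick for row-major A (MxK) and B (KxN).
#include <cassert>
#include <cstdint>

int main() {
  const int64_t M = 2, K = 3, N = 4;  // A: MxK, B: KxN, C: MxN, all row-major
  // cuBLAS is column-major, so request C^T = B^T * A^T. A contiguous row-major
  // matrix reinterpreted as column-major is already its transpose, so both
  // operands are passed without an explicit transpose op.
  const int64_t gemm_m = N, gemm_n = M, gemm_k = K;
  const int64_t ld_first = N;   // B's memory viewed column-major: N x K
  const int64_t ld_second = K;  // A's memory viewed column-major: K x M
  const int64_t ld_out = N;     // C^T column-major occupies the same bytes as C row-major
  assert(gemm_n == M && ld_first >= gemm_m && ld_second >= gemm_k && ld_out >= gemm_m);
  return 0;
}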

View File

@ -141,7 +141,8 @@ WelfordDataLN cuWelfordOnlineSum(
if constexpr (!rms_norm){
U delta = val - curr_sum.mean;
U new_count = curr_sum.count + 1.f;
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
#else
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
@ -163,7 +164,8 @@ WelfordDataLN cuWelfordCombine(
U count = dataA.count + dataB.count;
U mean, sigma2;
if (count > decltype(dataB.count){0}) {
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
auto coef = __builtin_amdgcn_rcpf(count);
#else
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
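For reference, a scalar sketch of the Welford recurrence these kernels vectorize (independent of the ROCm fast-reciprocal path being toggled above; sigma2 accumulates the sum of squared deviations, i.e. count times the variance):

// Sketch only: scalar Welford online mean/variance update.
#include <cstdio>

struct Welford {
  float mean = 0.f;
  float sigma2 = 0.f;  // running sum of squared deviations
  float count = 0.f;
};

Welford update(Welford w, float val) {
  const float delta = val - w.mean;
  const float new_count = w.count + 1.f;
  const float new_mean = w.mean + delta * (1.f / new_count);  // the fast-reciprocal slot above
  const float new_sigma2 = w.sigma2 + delta * (val - new_mean);
  return {new_mean, new_sigma2, new_count};
}

int main() {
  Welford w;
  const float vals[] = {1.f, 2.f, 3.f, 4.f};
  for (float v : vals) w = update(w, v);
  std::printf("mean=%f var=%f\n", w.mean, w.sigma2 / w.count);  // 2.5 and 1.25
  return 0;
}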

View File

@ -86,6 +86,28 @@ struct zeta_functor {
}
};
struct logaddexp_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return c10::metal::logaddexp(a, b);
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::logaddexp(float(a), float(b));
}
};
struct logaddexp2_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return c10::metal::logaddexp2(a, b);
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::logaddexp2(float(a), float(b));
}
};
struct xlog1py_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
@ -377,6 +399,10 @@ REGISTER_FLOAT_BINARY_OP(fmin);
REGISTER_FLOAT_BINARY_OP(nextafter);
REGISTER_FLOAT_BINARY_OP(zeta);
REGISTER_INT2FLOAT_BINARY_OP(zeta);
REGISTER_FLOAT_BINARY_OP(logaddexp);
REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
REGISTER_FLOAT_BINARY_OP(logaddexp2);
REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
REGISTER_FLOAT_BINARY_OP(xlog1py);
REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
@ -463,6 +489,8 @@ REGISTER_BINARY_OP(add, float2, float2);
REGISTER_BINARY_OP(add, half2, half2);
REGISTER_BINARY_OP(sub, float2, float2);
REGISTER_BINARY_OP(sub, half2, half2);
REGISTER_BINARY_OP(logaddexp, float2, float2);
REGISTER_BINARY_OP(logaddexp, half2, half2);
REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);
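The c10::metal::logaddexp helpers called by the functors above compute log(exp(a) + exp(b)) in a numerically stable way; a plain C++ sketch of the identity (the actual Metal implementation may differ in details):

// Sketch only: stable logaddexp avoids overflowing exp(a) + exp(b) by factoring out the max.
#include <algorithm>
#include <cmath>
#include <cstdio>

float logaddexp_ref(float a, float b) {
  const float m = std::max(a, b);
  if (std::isinf(m) && m < 0.f) {
    return m;  // both inputs are -inf
  }
  return m + std::log1p(std::exp(-std::fabs(a - b)));
}

int main() {
  // The naive form log(exp(100) + exp(100)) overflows float; the stable form
  // returns 100 + log(2).
  std::printf("%f\n", logaddexp_ref(100.f, 100.f));
  return 0;
}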

View File

@ -1,4 +1,4 @@
#pragma onces
#pragma once
#include <c10/metal/common.h>
template <unsigned N = c10::metal::max_ndim>

View File

@ -89,6 +89,14 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "zeta");
}
static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "logaddexp");
}
static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "logaddexp2");
}
static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "xlog1py");
@ -211,6 +219,8 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)

View File

@ -17,8 +17,6 @@
#include <ATen/ops/ge_native.h>
#include <ATen/ops/gt_native.h>
#include <ATen/ops/le_native.h>
#include <ATen/ops/logaddexp2_native.h>
#include <ATen/ops/logaddexp_native.h>
#include <ATen/ops/logical_and_native.h>
#include <ATen/ops/logical_or_native.h>
#include <ATen/ops/logical_xor_native.h>
@ -277,30 +275,6 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
}
}
TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();
MPSGraphTensor* sumTensor =
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
name:nil];
return [mpsGraph logarithmWithTensor:sumTensor name:nil];
};
mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
}
TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();
MPSGraphTensor* sumTensor =
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
name:nil];
return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
};
mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
}
TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();

View File

@ -57,6 +57,7 @@ Tensor& random_mps_impl(Tensor& self,
if (self.numel() == 0) {
return self;
}
at::assert_no_internal_overlap(self);
// MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
const auto need_reshape = self.ndimension() > 4;
auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
@ -153,8 +154,16 @@ Tensor& random_mps_impl(Tensor& self,
feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
}
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
// Handle non-contiguous output tensors by creating a contiguous temporary
const auto needs_gather = needsGather(self);
Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
// Copy results back to original non-contiguous output
if (needs_gather) {
self.copy_(self_);
}
}
return self;

View File

@ -1,3 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/OperationUtils.h>
@ -37,25 +39,12 @@ NSArray<NSNumber*>* IntArrayToNSArray(IntArrayRef arr) {
} // anonymous namespace
Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
TORCH_CHECK(self.is_complex());
auto in_sizes = self.sizes();
DimVector out_sizes(in_sizes.begin(), in_sizes.end());
out_sizes[dim.back()] = last_dim_size;
auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type())));
auto out = at::empty({}, self.options().dtype(c10::toRealValueType(self.scalar_type())));
return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out);
}
Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) {
TORCH_CHECK(self.is_floating_point());
auto input_sizes = self.sizes();
DimVector out_sizes(input_sizes.begin(), input_sizes.end());
auto last_dim = dim.back();
auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
if (onesided) {
out_sizes[last_dim] = last_dim_halfsize;
}
auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type())));
auto out = at::empty({}, self.options().dtype(c10::toComplexType(self.scalar_type())));
return _fft_r2c_mps_out(self, dim, normalization, onesided, out);
}
@ -72,6 +61,17 @@ using namespace mps;
// TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237
Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) {
TORCH_CHECK(self.scalar_type() == kFloat || self.scalar_type() == kHalf, "Only float and half dtypes are supported");
TORCH_CHECK(out.scalar_type() == c10::toComplexType(self.scalar_type()));
const auto input_sizes = self.sym_sizes();
SymDimVector out_sizes(input_sizes.begin(), input_sizes.end());
auto last_dim = dim.back();
auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
if (onesided) {
out_sizes[last_dim] = last_dim_halfsize;
}
at::native::resize_output_symint(out, out_sizes);
auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" +
std::to_string(normalization) + ":" + std::to_string(onesided);
@autoreleasepool {
@ -112,6 +112,12 @@ Tensor& _fft_c2r_mps_out(const Tensor& self,
int64_t normalization,
int64_t last_dim_size,
Tensor& out) {
TORCH_CHECK(self.is_complex(), "Input must be complex");
TORCH_CHECK(out.scalar_type() == c10::toRealValueType(self.scalar_type()), "Unexpected output type");
const auto in_sizes = self.sym_sizes();
SymDimVector out_sizes(in_sizes.begin(), in_sizes.end());
out_sizes[dim.back()] = last_dim_size;
at::native::resize_output_symint(out, out_sizes);
auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" +
std::to_string(normalization) + ":" + std::to_string(last_dim_size);
@autoreleasepool {

View File

@ -617,6 +617,9 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
TORCH_CHECK(self.scalar_type() == output.scalar_type(),
"index_select(): self and output must have the same scalar type");
TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
at::assert_no_internal_overlap(output);
at::assert_no_overlap(output, self);
at::assert_no_overlap(output, index);
auto output_size = self.sizes().vec();
if (self.dim() > 0) {
output_size[dim] = num_indices;

View File

@ -370,7 +370,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg,
onValue:-1.0f
offValue:0.0f
name:nil];
oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, inputTensor.dataType);
oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, [inputTensor dataType]);
if (isWeightsArrayValid) {
oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
secondaryTensor:weightTensor
@ -705,6 +705,7 @@ static void smooth_l1_loss_template(const Tensor& input,
TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.");
TORCH_CHECK(input.is_mps());
TORCH_CHECK(target.is_mps());
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "MPS doesn't know how to do square_i64");
if ((input.numel() == 0) || (target.numel() == 0)) {
reduction == Reduction::Mean ? output.fill_(std::numeric_limits<float>::quiet_NaN()) : output.zero_();
return;
@ -771,7 +772,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:MPSDataTypeFloat32];
MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:[inputTensor dataType]];
// xn - yn
MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
secondaryTensor:targetTensor
@ -797,7 +798,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
name:@"lossTensor"];
MPSGraphTensor* outputTensor = lossTensor;
if (reduction == Reduction::Mean) {
MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel() dataType:MPSDataTypeFloat32];
MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel()
dataType:[lossTensor dataType]];
outputTensor = [mpsGraph divisionWithPrimaryTensor:lossTensor secondaryTensor:numelTensor name:nil];
}
MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor

View File

@ -1028,15 +1028,18 @@ TORCH_IMPL_FUNC(prod_out_mps)
}
TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
}
TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
}
TORCH_IMPL_FUNC(aminmax_out_mps)
(const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
reduction_out_mps(input_t,
dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
keepdim,

View File

@ -31,6 +31,7 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
indices.copy_(values.toType(at::ScalarType::Long));
return;
}
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
// issue #154890, raising error to prevent crash within MPSGraph until
// workaround is implemented.
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");

View File

@ -3622,8 +3622,7 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: logaddexp_out
MPS: logaddexp_out_mps
CPU, CUDA, MPS: logaddexp_out
tags: pointwise
- func: logaddexp(Tensor self, Tensor other) -> Tensor
@ -3635,8 +3634,7 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: logaddexp2_out
MPS: logaddexp2_out_mps
CPU, CUDA, MPS: logaddexp2_out
tags: pointwise
- func: logaddexp2(Tensor self, Tensor other) -> Tensor
@ -8867,11 +8865,11 @@
autogen: bitwise_right_shift.Scalar_Tensor_out
tags: pointwise
- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
- func: tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
structured_delegate: tril.out
variants: method
- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
- func: triu_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
structured_delegate: triu.out
variants: method
@ -8995,25 +8993,25 @@
- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
variants: method, function
- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
- func: triu.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU: triu_cpu
CUDA: triu_cuda
MPS: triu_mps_out
- func: triu(Tensor self, int diagonal=0) -> Tensor
- func: triu(Tensor self, SymInt diagonal=0) -> Tensor
structured_delegate: triu.out
variants: method, function
- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
- func: tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU: tril_cpu
CUDA: tril_cuda
MPS: tril_mps_out
- func: tril(Tensor self, int diagonal=0) -> Tensor
- func: tril(Tensor self, SymInt diagonal=0) -> Tensor
structured_delegate: tril.out
variants: method, function

View File

@ -73,8 +73,7 @@ void upsample_bilinear2d_out_frame(
const auto rwidth = area_pixel_compute_scale<float>(
input_width, output_width, align_corners, scales_w);
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
float output_scale = output.q_scale() / input.q_scale();
float output_scale = static_cast<float>(output.q_scale() / input.q_scale());
const int64_t input_q_zero_point = input.q_zero_point();
const int64_t output_q_zero_point = output.q_zero_point();

View File

@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
// Vectorized loop
if (c + VLEN <= curr_C) {
auto curr_scale_vec = Vectorized<float>(curr_scale);
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
for (; c + VLEN <= curr_C; c += VLEN) {
auto inp_vec = Vec::loadu(iptr + c);
@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
int64_t elem_size = curr_C - c;
if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
auto curr_scale_vec = Vectorized<float>(curr_scale);
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
int64_t vec_num = elem_size / kVLEN;
std::array<typename scalar_t::underlying, VLEN> buf_in{};
@ -611,12 +611,10 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
const Scalar& negval_) {
int64_t i_zp = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float i_scale = qx.q_scale();
float i_scale = static_cast<float>(qx.q_scale());
int64_t o_zp = out.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float o_scale = out.q_scale();
float o_scale = static_cast<float>(out.q_scale());
float o_inv_scale = 1.0f / o_scale;
float negval = negval_.to<float>();
@ -627,8 +625,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
Vec zero_vec = Vec(0.0f);
Vec one_vec = Vec(1.0f);
Vec i_scale_vec = Vec((float)i_scale);
Vec i_zp_vec = Vec((float)i_zp);
Vec i_scale_vec = Vec(i_scale);
Vec i_zp_vec = Vec(i_zp);
Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();
Vec negval_vec = Vec(negval);
@ -738,10 +736,9 @@ void qprelu_out_kernel(Tensor& out,
void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
int64_t zero_point = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float scale = qx.q_scale();
float scale = static_cast<float>(qx.q_scale());
auto scale_vec = Vectorized<float>(scale);
auto zero_point_vec = Vectorized<float>((float)zero_point);
auto zero_point_vec = Vectorized<float>(zero_point);
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
int64_t output_zero_point = zero_point;
float output_scale = scale;
@ -828,10 +825,9 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
void qsigmoid_kernel(
const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
int64_t zero_point = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float scale = qx.q_scale();
float scale = static_cast<float>(qx.q_scale());
auto scale_vec = Vectorized<float>(scale);
auto zero_point_vec = Vectorized<float>((float)zero_point);
auto zero_point_vec = Vectorized<float>(zero_point);
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
float inv_output_scale = 1.0 / output_scale;
@ -870,10 +866,9 @@ void qsigmoid_kernel(
void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
int64_t zero_point = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float scale = qx.q_scale();
float scale = static_cast<float>(qx.q_scale());
auto scale_vec = Vectorized<float>(scale);
auto zero_point_vec = Vectorized<float>((float)zero_point);
auto zero_point_vec = Vectorized<float>(zero_point);
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
@ -1029,13 +1024,10 @@ void qthreshold_kernel(
// defines input and output scales and zero_points
int64_t input_zero_point = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float input_scale = qx.q_scale();
float input_scale = static_cast<float>(qx.q_scale());
int64_t output_zero_point = qy.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float output_scale = qy.q_scale();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float inv_output_scale = 1.0 / output_scale;
float output_scale = static_cast<float>(qy.q_scale());
float inv_output_scale = static_cast<float>(1.0 / output_scale);
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
qy = at::_empty_affine_quantized(
@ -1096,8 +1088,7 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
const auto o_scale = qy.q_scale();
const auto o_zero_point = qy.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const float o_inv_scale = 1.0 / o_scale;
const float o_inv_scale = static_cast<float>(1.0 / o_scale);
using fVec = Vectorized<float>;
fVec i_scale_vec(i_scale);
@ -1135,10 +1126,9 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
void qtanh_kernel(const Tensor& qx, Tensor& qy) {
int64_t zero_point = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float scale = qx.q_scale();
float scale = static_cast<float>(qx.q_scale());
auto scale_vec = Vectorized<float>(scale);
auto zero_point_vec = Vectorized<float>((float)zero_point);
auto zero_point_vec = Vectorized<float>(zero_point);
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
@ -1198,16 +1188,13 @@ void qelu_kernel(
// they are NOT related to the quantization scale term
int64_t i_zp = qx.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float i_scale = qx.q_scale();
float i_scale = static_cast<float>(qx.q_scale());
// In a future PR, we can improve on output scale and zero_point
// selection.
int64_t o_zp = qy.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float o_scale = qy.q_scale();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float inv_o_scale = 1.0 / o_scale;
float o_scale = static_cast<float>(qy.q_scale());
float inv_o_scale = static_cast<float>(1.0 / o_scale);
float alpha_float = alpha.to<float>();
float scale_coef = scale.to<float>();
@ -1227,7 +1214,7 @@ void qelu_kernel(
Vec scale_coef_vec = Vec(scale_coef);
Vec input_scale_coef_vec = Vec(input_scale_coef);
Vec i_scale_vec = Vec(i_scale);
Vec i_zero_point_vec = Vec((float)i_zp);
Vec i_zero_point_vec = Vec(i_zp);
Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();
cpu_kernel_vec(
@ -1326,23 +1313,20 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
template <bool ReLUFused = false>
void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
int64_t zero_point = out.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float scale = out.q_scale();
float scale = static_cast<float>(out.q_scale());
float inv_scale = 1.0f / scale;
int64_t self_zero_point = self.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float self_scale = self.q_scale();
float self_scale = static_cast<float>(self.q_scale());
int64_t other_zero_point = other.q_zero_point();
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float other_scale = other.q_scale();
float other_scale = static_cast<float>(other.q_scale());
// Broadcast out the parameters here to amortize out that cost across
// loop iterations.
// TODO: we can optimize dequantization by doing a premultiplication
// of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
auto self_zero_point_vec = Vectorized<float>(self_zero_point);
auto self_scale_vec = Vectorized<float>(self_scale);
auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
auto other_zero_point_vec = Vectorized<float>(other_zero_point);
auto other_scale_vec = Vectorized<float>(other_scale);
auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
@ -2965,7 +2949,7 @@ void quantized_normalize_kernel(
const bool beta_null = beta_data == nullptr;
int64_t x_zp = X.q_zero_point();
float x_scale = X.q_scale();
fVec x_zp_vec((float)x_zp);
fVec x_zp_vec(x_zp);
fVec one_vec(1.0f);
fVec zero_vec(0.0f);
float x_fake_scale = 1.0f;
@ -3253,7 +3237,7 @@ void quantized_groupnorm_nhwc_kernel(
const bool beta_null = beta_data == nullptr;
int64_t x_zp = X.q_zero_point();
float x_scale = X.q_scale();
fVec x_zp_vec((float)x_zp);
fVec x_zp_vec(x_zp);
fVec one_vec(1.0f);
fVec zero_vec(0.0f);
float x_fake_scale = 1.0f;

View File

@ -414,7 +414,6 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
TORCH_CHECK(input.dim() >= 2);
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
const int64_t N = packed_weight_fp16.numCols();
std::vector<int64_t> output_sizes = input.sizes().vec();

View File

@ -467,6 +467,28 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
!options.has_layout() || options.layout() == kSparse,
"expected sparse layout, but got layout ",
options.layout());
if (indices.numel() > 0) {
Tensor min_indices =
std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
Tensor cpu_min_indices;
if (!indices.is_cpu()) {
cpu_min_indices = min_indices.to(at::DeviceType::CPU);
} else {
cpu_min_indices = min_indices;
}
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
for (const auto d : c10::irange(indices.size(0))) {
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
TORCH_CHECK(
min_index_in_dim >= 0,
"found negative index ",
min_index_in_dim,
" for dim ",
d);
}
}
return at::native::_sparse_coo_tensor_unsafe(
indices,
values,

View File

@ -22,6 +22,7 @@
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/zeros_like.h>
#include <ATen/ops/reshape.h>
#include <ATen/ops/scalar_tensor.h>
#include <ATen/ops/sum.h>
@ -42,7 +43,6 @@ C10_DIAGNOSTIC_POP()
#include <static_switch.h>
#include <ATen/native/transformers/cuda/flash_attn/flash_api.h>
#include <c10/util/Exception.h>
namespace FLASH_NAMESPACE {
@ -417,6 +417,26 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
const int head_size_og = sizes[3];
const int seqlen_k = k.size(1);
const int num_heads_k = k.size(2);
if (batch_size == 0) {
auto opts = q.options();
at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
at::Tensor p = at::empty({0}, opts);
if (return_softmax) {
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
}
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
}
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
@ -547,7 +567,7 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
}
return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
}
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@ -852,7 +872,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
const auto sizes = q.sizes();
@ -863,6 +882,20 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
const int head_size = sizes[3];
const int seqlen_k = k.size(1);
const int num_heads_k = k.size(2);
if (batch_size == 0) {
auto opts = q.options();
at::Tensor dq = at::empty_like(q);
at::Tensor dk = at::empty_like(k);
at::Tensor dv = at::empty_like(v);
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
return {dq, dk, dv, softmax_d};
}
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");

View File

@ -1837,6 +1837,10 @@ class BenchmarkRunner:
def skip_models_for_cuda(self):
return set()
@property
def skip_models_for_xpu(self):
return set()
@property
def skip_models_for_cpu(self):
return set()
@ -3927,6 +3931,8 @@ def run(runner, args, original_dir=None):
runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
elif args.devices == ["cuda"]:
runner.skip_models.update(runner.skip_models_for_cuda)
elif args.devices == ["xpu"]:
runner.skip_models.update(runner.skip_models_for_xpu)
if not args.multiprocess:
runner.skip_models.update(runner.skip_multiprocess_models)

View File

@ -124,6 +124,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
def skip_models_for_cuda(self):
return self._skip["device"]["cuda"]
@property
def skip_models_for_xpu(self):
return self._skip["device"]["xpu"]
@property
def skip_models_for_freezing_cuda(self):
return self._skip["freezing"]["cuda"]

View File

@ -217,6 +217,9 @@ skip:
cuda: []
xpu:
- *DETECTRON2_MODELS
test:
training:
- *DETECTRON2_MODELS

View File

@ -0,0 +1,157 @@
"""Configuration utilities for parsing JSON and YAML config files."""
import json
import re
def heads_input_type(s: str) -> tuple[int, int]:
"""Convert string format 'Hq,Hkv' to tuple (Hq, Hkv)."""
try:
hq, hkv = map(int, s.split(","))
return hq, hkv
except Exception as e:
raise ValueError("Heads must be Hq,Hkv") from e
default_config = {
"dynamic": False,
"calculate_bwd": False,
"dtype": "bfloat16",
"b": [2, 8, 16],
"nh": ["16,16", "16,2"],
"s": [512, 1024, 4096],
"d": [64, 128],
"mods": ["noop", "causal", "alibi", "sliding_window"],
"backend": ["efficient"],
"max_autotune": False,
"decoding": False,
"kv_size": None,
"throughput": True,
"save_path": None,
"output_json_for_dashboard": None,
"benchmark_name": "PyTorch operator microbenchmark",
}
def load_config_file(config_path: str) -> dict:
"""Load configuration from JSON or YAML file.
Automatically converts 'nh' field from strings to tuples.
Args:
config_path: Path to the configuration file
Returns:
Dictionary containing the configuration
Raises:
FileNotFoundError: If config file doesn't exist
ValueError: If config file format is invalid
"""
with open(config_path) as f:
config_str = f.read()
# Try to load as JSON first
try:
config = json.loads(config_str)
except json.JSONDecodeError:
# Fall back to YAML parsing
config = _parse_simple_yaml(config_str)
# Apply automatic conversions for 'nh' field
if "nh" in config and isinstance(config["nh"], list):
config["nh"] = [
heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
]
return config
def _parse_simple_yaml(yaml_str: str) -> dict:
"""Simple YAML parser for basic configs (without external dependencies).
Supports:
- key: value pairs
- booleans (true/false)
- null values
- integers and floats
- strings (quoted and unquoted)
- lists in JSON format [item1, item2, ...]
- comments (lines starting with # or after #)
Args:
yaml_str: YAML content as string
Returns:
Dictionary containing parsed YAML content
"""
config = {}
for line in yaml_str.split("\n"):
# Remove comments
line = line.split("#")[0].strip()
if not line or ":" not in line:
continue
key, value = line.split(":", 1)
key = key.strip()
value = value.strip()
# Parse value based on type
if value.lower() == "true":
config[key] = True
elif value.lower() == "false":
config[key] = False
elif value.lower() in ("null", "none", ""):
config[key] = None
elif value.startswith("[") and value.endswith("]"):
# Parse list - handle quoted strings properly
pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
matches = re.findall(pattern, value[1:-1]) # Remove [ ]
parsed_items = []
for match in matches:
# match is a tuple of (double_quoted, single_quoted, unquoted)
item = match[0] or match[1] or match[2]
item = item.strip()
if item:
try:
parsed_items.append(int(item))
except ValueError:
parsed_items.append(item)
config[key] = parsed_items
elif value.startswith(('"', "'")):
config[key] = value.strip("\"'")
else:
# Try to parse as number
try:
config[key] = int(value)
except ValueError:
try:
config[key] = float(value)
except ValueError:
config[key] = value
return config
def print_default_config(output_format: str) -> None:
"""Print a default configuration template in JSON or YAML format.
Args:
output_format: Either "json" or "yaml"
"""
if output_format == "json":
print(json.dumps(default_config, indent=2))
else: # yaml
for key, value in default_config.items():
if value is None:
print(f"{key}: null")
elif isinstance(value, bool):
print(f"{key}: {str(value).lower()}")
elif isinstance(value, str):
print(f'{key}: "{value}"')
elif isinstance(value, list):
print(f"{key}: {json.dumps(value)}")
else:
print(f"{key}: {value}")

View File

@ -0,0 +1,29 @@
# Basic benchmark configuration for PyTorch transformer benchmarks
# Usage: python score_mod.py --config config_basic.yaml
# Core parameters
dynamic: false
calculate_bwd: true
dtype: "bfloat16"
# Shape parameters - larger sweep
b: [1, 2, 4, 8, 16] # batch sizes
nh: ["16,16", "16,2", "32,32", "32,4"] # [query_heads,key_value_heads]
s: [512, 1024, 2048, 4096, 8192] # sequence lengths
d: [64, 128] # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
# All attention types
mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
backend: ["efficient", "math", "cudnn", "fav2"]
max_autotune: true # Enable torch.compile with max-autotune for optimal performance
# Decoding and cache settings
decoding: false
kv_size: null
# Metrics and output
throughput: true # Calculate memory bandwidth & TFLOPS
save_path: "comprehensive_results.csv" # Save to CSV
output_json_for_dashboard: "attn_bench_basic.json"

View File

@ -1,15 +1,19 @@
import argparse
import csv
import gc
import itertools
import json
import random
import sys
from collections import defaultdict
from collections.abc import Callable
from contextlib import nullcontext
from dataclasses import asdict, dataclass
from functools import partial
from typing import Optional, Union
from functools import partial, wraps
from typing import Literal, Optional, Union
import numpy as np
from config_utils import heads_input_type, load_config_file, print_default_config
from tabulate import tabulate
from tqdm import tqdm
@ -33,6 +37,96 @@ torch._dynamo.config.recompile_limit = 1000
from torch._inductor.runtime.benchmarking import benchmarker
def cleanup_memory():
"""Aggressively free GPU memory"""
torch.cuda.empty_cache()
gc.collect()
if torch.cuda.is_available():
torch.cuda.synchronize()
def safe_backend(backend_name=None, return_dict=False):
"""Decorator that wraps backend functions with error handling
Args:
backend_name: Name of the backend for error messages
return_dict: If True, returns dict of results for all backends (for run_single_experiment)
If False, returns single ExperimentResults (for individual backend functions)
"""
def decorator(func):
@wraps(func)
def wrapper(config, *args, **kwargs):
try:
return func(config, *args, **kwargs)
except torch.OutOfMemoryError:
print(
f"[SKIP] OOM for {backend_name or func.__name__} with shape {config.shape}"
)
cleanup_memory()
except RuntimeError as e:
error_msg = str(e)
if "out of resource" in error_msg or "OutOfMemoryError" in error_msg:
print(
f"[SKIP] Triton OOM for {backend_name or func.__name__} with shape {config.shape}"
)
cleanup_memory()
elif "No valid triton configs" in error_msg:
print(
f"[SKIP] No valid Triton config for {backend_name or func.__name__} with shape {config.shape}"
)
else:
print(
f"[SKIP] Runtime error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
)
except Exception as e:
print(
f"[SKIP] Error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
)
# Return appropriate NaN result based on function type
if return_dict:
# For run_single_experiment: return dict with NaN for all backends
nan_result = ExperimentResults(
fwd_time=float("nan"),
bwd_time=float("nan") if config.calculate_bwd_time else None,
)
results = dict.fromkeys(config.backends, nan_result)
results["flex"] = ExperimentResults(
fwd_time=float("nan"),
bwd_time=float("nan") if config.calculate_bwd_time else None,
sparsity=None,
)
return results
else:
# For individual backend functions: return single ExperimentResults
return ExperimentResults(
fwd_time=float("nan"),
bwd_time=float("nan") if config.calculate_bwd_time else None,
)
return wrapper
return decorator
# Type definitions
Backend = Literal["math", "efficient", "cudnn", "fav2", "fav3", "fakv", "og-eager"]
AttentionType = Literal[
"noop",
"causal",
"rel",
"head_bias",
"alibi",
"sliding_window",
"document_mask",
"prefix_lm",
"softcap",
]
DtypeString = Literal["bfloat16", "float16", "float32"]
SpeedupType = Literal["fwd", "bwd"]
def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
# warmup
for _ in range(5):
@ -48,6 +142,7 @@ class ExperimentConfig:
calculate_bwd_time: bool
cal_bandwidth: bool
backends: list[str]
max_autotune: bool
def __post_init__(self):
assert len(self.shape) == 6, (
@ -62,6 +157,7 @@ class ExperimentConfig:
d.pop("cal_bandwidth", None)
d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
d.pop("backends", None)
d.pop("max_autotune", False)
return d
@ -209,6 +305,7 @@ def query_key_value_clones(
return query_ref, key_ref, value_ref
@safe_backend("SDPA")
def run_single_backend_sdpa(
config: ExperimentConfig,
query: torch.Tensor,
@ -223,6 +320,7 @@ def run_single_backend_sdpa(
backend_context = get_backend_context(backend)
with backend_context:
_device = torch.device("cuda")
eager_sdpa = generate_eager_sdpa(
config.attn_type, config.shape, config.dtype, block_mask, score_mod
)
@ -290,6 +388,7 @@ def run_single_backend_sdpa(
)
@safe_backend("FlashAttention")
def run_single_backend_FA(
config: ExperimentConfig,
query: torch.Tensor,
@ -301,9 +400,9 @@ def run_single_backend_FA(
mask_kwargs,
backend: str,
) -> ExperimentResults:
assert backend in ["fav2", "fav3", "fakv"]
assert backend in ["fav3", "fakv"]
# Generate callable for specific backend.
if backend in ["fav2", "fav3"]:
if backend in ["fav3"]:
FA = generate_FA_callable(
config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
)
@ -354,10 +453,10 @@ def run_single_backend_FA(
)
@safe_backend("flex_attention", return_dict=True)
def run_single_experiment(
config: ExperimentConfig,
dynamic=False,
max_autotune=False,
) -> dict[str, ExperimentResults]:
device = torch.device("cuda")
batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
@ -377,7 +476,7 @@ def run_single_experiment(
block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
kernel_options = get_kernel_options(config.attn_type, config.shape)
if max_autotune:
if config.max_autotune:
compiled_sdpa = torch.compile(
flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
)
@ -407,7 +506,7 @@ def run_single_experiment(
results = {}
for backend in config.backends:
if backend in ["fav2", "fav3", "fakv"]:
if backend in ["fav3", "fakv"]:
results[backend] = run_single_backend_FA(
config,
query,
@ -419,7 +518,7 @@ def run_single_experiment(
mask_kwargs,
backend,
)
else: # sdpa
else: # sdpa (also supports fav2)
results[backend] = run_single_backend_sdpa(
config,
query,
@ -440,7 +539,7 @@ def run_single_experiment(
sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
sparsity = sparsity if config.attn_type != "document_mask" else 0.5
results["compiled"] = ExperimentResults(
results["flex"] = ExperimentResults(
fwd_time=forward_compiled_time,
bwd_time=backward_compile_time if config.calculate_bwd_time else None,
sparsity=sparsity,
@ -501,15 +600,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> fl
softmax_flops = M * N * 2 # Not counting online softmax overhead
o_flops = M * D * N * 2
# Not counting split k overhead
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
sparsity = results.sparsity if results.sparsity is not None else 0.0
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
return total_flops / results.fwd_time / 1e6 # in TFLOPs/
def get_average_speedups(results: list[Experiment], type: str, backend: str):
# Calculate speedups
speedups = [
calculate_speedup(r.results["compiled"], r.results[backend], type)
for r in results
calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
]
# Find indices of max and min speedups
@ -537,7 +636,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
def print_results(results: list[Experiment], save_path: Optional[str] = None):
table_data = defaultdict(list)
for experiment in results:
backends = experiment.config.backends + ["compiled"]
backends = experiment.config.backends + ["flex"]
for key, value in experiment.asdict().items():
if key in backends:
if value.fwd_time:
@ -550,45 +649,43 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
# Calculate speedups
for backend in results[0].config.backends:
fwd_speedups = [
calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
for r in results
]
table_data[f"fwd_{backend}_speedup"] = fwd_speedups
table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups
if results[0].config.calculate_bwd_time:
for backend in results[0].config.backends:
bwd_speedups = [
calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
for r in results
]
table_data[f"bwd_{backend}_speedup"] = bwd_speedups
table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups
# Calculate mem + computational throughput
if results[0].config.cal_bandwidth:
fwd_bandwidth = [
calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
calculate_bandwidth(r.config, r.results["flex"], type="fwd")
for r in results
]
table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
fwd_tflops = [
calculate_tflops(r.config, r.results["compiled"]) for r in results
]
fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
table_data["TFlops/s"] = fwd_tflops
print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))
for backend in results[0].config.backends:
if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
continue
print("\n")
print(f"FWD Speedups vs. {backend}".center(125, "="))
print(f"FWD Speedup of Flex over {backend}".center(125, "="))
print("\n")
average_data = get_average_speedups(results, type="fwd", backend=backend)
print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))
if results[0].config.calculate_bwd_time:
print("\n")
print(f"BWD Speedups vs. {backend}".center(125, "="))
print(f"BWD Speedup of Flex over {backend}".center(125, "="))
print("\n")
average_data = get_average_speedups(results, type="bwd", backend=backend)
print(
@ -791,14 +888,14 @@ def get_backend_context(backend: str):
Returns a context manager for the specified backend.
Args:
backend (str): The name of the backend to use.
Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
Returns:
A context manager for the specified backend.
Raises:
ValueError: If an invalid backend is specified.
"""
backends = {
"fav2": nullcontext(),
"fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
"cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
"math": sdpa_kernel(SDPBackend.MATH),
"efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
@ -820,15 +917,7 @@ def generate_FA_callable(
) -> Callable | None:
if dtype not in [torch.float16, torch.bfloat16]:
return None
if backend == "fav2":
try:
from flash_attn import flash_attn_func, flash_attn_varlen_func
except ImportError:
print(
"Flash attention 2 is not installed. Please install it to run fav2 backend. "
)
raise
elif backend == "fav3":
if backend == "fav3":
try:
from flash_attn.flash_attn_interface import (
flash_attn_func,
@ -1034,6 +1123,7 @@ def generate_experiment_configs(
kv_cache_size: list[int],
cal_bandwidth: bool,
backends: list[str],
max_autotune: bool,
) -> list[ExperimentConfig]:
assert not (calculate_bwd and decoding), "Decoding does not support backward"
@ -1077,52 +1167,333 @@ def generate_experiment_configs(
calculate_bwd_time=calculate_bwd,
cal_bandwidth=cal_bandwidth,
backends=backends,
max_autotune=max_autotune,
)
)
return all_configs
def main(args):
def _output_json_for_dashboard(
experiments,
output_file,
benchmark_name="PyTorch operator microbenchmark",
):
"""
Write the result into JSON format for PyTorch OSS dashboard.
The JSON format is defined at
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
Args:
experiments: List of experiment results
output_file: Path to output JSON file
benchmark_name: Name of the benchmark
"""
if not experiments:
return
import math
import platform
from dataclasses import asdict, dataclass
from typing import Any, Optional
# Prepare headers and records for JSON output
records = []
for experiment in experiments:
config = experiment.config
results_dict = (
experiment.results
) # This is a dict: backend -> ExperimentResults
# Process each backend result
for backend, results in results_dict.items():
# Skip backends that were not run (NaN results)
if math.isnan(results.fwd_time):
continue
# Extract data from experiment
test_name = f"{backend}_{config.attn_type}_"
input_config = f"shape: {config.shape}, dtype: {config.dtype}"
# Determine mode based on backward pass
mode = "training" if config.calculate_bwd_time else "inference"
# Extract dtype
dtype = (
str(config.dtype).split(".")[1]
if "." in str(config.dtype)
else str(config.dtype)
)
# Determine device
device = "cuda"
# Get device architecture
device_arch = (
torch.cuda.get_device_name(0)
if device == "cuda"
else platform.processor()
if device == "cpu"
else "unknown"
)
# Create dataclasses for JSON structure
@dataclass
class BenchmarkInfo:
name: str
mode: Optional[str]
dtype: str
extra_info: dict[str, Any]
@dataclass
class ModelInfo:
name: str
type: str
origins: list[str]
extra_info: dict[str, Any]
@dataclass
class MetricInfo:
name: str
unit: str
benchmark_values: list[float]
target_value: Optional[float]
@dataclass
class BenchmarkRecord:
benchmark: BenchmarkInfo
model: ModelInfo
metric: MetricInfo
# Benchmark extra info
benchmark_extra_info = {
"input_config": input_config,
"device": device,
"arch": device_arch,
"operator_name": backend,
"attn_type": config.attn_type,
"shape": str(config.shape),
"max_autotune": config.max_autotune,
}
# Add record for forward latency
record_fwd_latency = BenchmarkRecord(
benchmark=BenchmarkInfo(
name=benchmark_name,
mode=mode,
dtype=dtype,
extra_info=benchmark_extra_info,
),
model=ModelInfo(
name=test_name + str(config.shape),
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"attn_type": config.attn_type,
},
),
metric=MetricInfo(
name="forward latency",
unit="us",
benchmark_values=[results.fwd_time],
target_value=None,
),
)
records.append(asdict(record_fwd_latency))
# Add record for forward memory bandwidth (if available)
if config.cal_bandwidth:
record_fwd_bandwidth = BenchmarkRecord(
benchmark=BenchmarkInfo(
name=benchmark_name,
mode=mode,
dtype=dtype,
extra_info=benchmark_extra_info,
),
model=ModelInfo(
name=test_name + str(config.shape),
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
},
),
metric=MetricInfo(
name="memory bandwidth",
unit="TB/s",
benchmark_values=[calculate_bandwidth(config, results, "fwd")],
target_value=None,
),
)
records.append(asdict(record_fwd_bandwidth))
# Add record for forward TFLOPS (if available)
if config.cal_bandwidth:
record_fwd_tflops = BenchmarkRecord(
benchmark=BenchmarkInfo(
name=benchmark_name,
mode=mode,
dtype=dtype,
extra_info=benchmark_extra_info,
),
model=ModelInfo(
name=test_name + str(config.shape),
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
},
),
metric=MetricInfo(
name="tflops",
unit="TFLOPS/s",
benchmark_values=[calculate_tflops(config, results)],
target_value=None,
),
)
records.append(asdict(record_fwd_tflops))
# Add record for backward latency (if available and not NaN)
if (
config.calculate_bwd_time
and results.bwd_time is not None
and not math.isnan(results.bwd_time)
):
record_bwd_latency = BenchmarkRecord(
benchmark=BenchmarkInfo(
name=benchmark_name,
mode=mode,
dtype=dtype,
extra_info=benchmark_extra_info,
),
model=ModelInfo(
name=test_name + str(config.shape),
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
},
),
metric=MetricInfo(
name="backward latency",
unit="us",
benchmark_values=[results.bwd_time],
target_value=None,
),
)
records.append(asdict(record_bwd_latency))
# Write all records to the output file
with open(output_file, "w", encoding="utf-8") as f:
json.dump(records, f, indent=2)
def main(
dynamic: bool = False,
calculate_bwd: bool = False,
dtype: DtypeString = "bfloat16",
b: list[int] | None = None,
nh: list[str] | None = None,
s: list[int] | None = None,
d: list[int] | None = None,
mods: list[AttentionType] | None = None,
backend: list[Backend] | None = None,
max_autotune: bool = False,
decoding: bool = False,
kv_size: Optional[list[int]] = None,
throughput: bool = True,
save_path: Optional[str] = None,
output_json_for_dashboard: Optional[str] = None,
benchmark_name: str = "PyTorch operator microbenchmark",
) -> None:
"""Run sweep over sizes and score mods for flex attention.
Usage Examples:
# Use a yml config file
python score_mod.py --config basic_config.yaml
# Use a json config file
python score_mod.py --config my_config.json
# Generate a config template
python score_mod.py --print-config json > my_config.json # For a json config
python score_mod.py --print-config yaml > my_config.yaml # For a yaml config
# Override config with CLI args
python score_mod.py --config my_config.json -dtype float16 --max-autotune
# Pure CLI usage
python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
Args:
dynamic: Runs a dynamic shapes version of compiled flex attention
calculate_bwd: Calculate backward pass times
dtype: Data type for tensors (bfloat16, float16, float32)
b: Batch sizes to benchmark
nh: Number of query and key/value heads in format "Hq,Hkv"
s: Sequence lengths to benchmark
d: Head dimensions to benchmark
mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
max_autotune: Turn on max-autotune optimization
decoding: Benchmark decoding mode (query sequence length = 1)
kv_size: Key/value cache size in MiB (ignores batch size if specified)
throughput: Calculate kernel memory bandwidth & computational throughput (always True)
save_path: Path to save the results CSV file
output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
benchmark_name: Name of the benchmark for dashboard output
"""
# Convert dtype string to torch dtype (if not already converted)
import torch
if isinstance(dtype, str):
dtype = getattr(torch, dtype)
# Always calculate throughput
throughput = True
print("Backend: ", backend)
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
results = []
for config in tqdm(
generate_experiment_configs(
args.calculate_bwd,
args.dtype,
args.b,
args.nh,
args.s,
args.d,
args.mods,
args.decoding,
args.kv_size,
args.throughput,
args.backend,
)
for experiment_count, config in enumerate(
tqdm(
generate_experiment_configs(
calculate_bwd,
dtype,
b,
nh,
s,
d,
mods,
decoding,
kv_size,
throughput,
backend,
max_autotune,
)
),
start=1,
):
results.append(
Experiment(
config,
run_single_experiment(
config,
dynamic=args.dynamic,
max_autotune=args.max_autotune,
dynamic=dynamic,
),
)
)
print_results(results, args.save_path)
# Periodic memory cleanup every 50 experiments
if experiment_count % 50 == 0:
cleanup_memory()
print_results(results, save_path)
def heads_input_type(s):
try:
hq, hkv = map(int, s.split(","))
return hq, hkv
except Exception as e:
raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
# Output JSON for dashboard if requested
if output_json_for_dashboard:
_output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)
if __name__ == "__main__":
@ -1130,6 +1501,12 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Run sweep over sizes and score mods for flex attention"
)
parser.add_argument(
"--config",
type=str,
help="Path to JSON config file. CLI args override config file values.",
default=None,
)
parser.add_argument(
"--dynamic",
action="store_true",
@ -1199,8 +1576,49 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
default=["efficient"],
help="Backend to use for attention computation",
)
parser.add_argument(
"--output-json-for-dashboard",
type=str,
help="Path to save results in JSON format for PyTorch OSS dashboard",
default=None,
)
parser.add_argument(
"--benchmark-name",
type=str,
help="Name of the benchmark for dashboard output",
default="PyTorch operator microbenchmark",
)
parser.add_argument(
"--print-config",
type=str,
choices=["json", "yaml"],
help="Print a default config template in JSON or YAML format and exit",
default=None,
)
# Parse arguments
args = parser.parse_args()
args.dtype = getattr(torch, args.dtype)
main(args)
# Handle --print-config
if args.print_config:
print_default_config(args.print_config)
sys.exit(0)
# Load and merge config if provided
if args.config:
config = load_config_file(args.config)
# Merge config with CLI args (CLI args take precedence)
json_args = argparse.Namespace()
json_args.__dict__ = config
args = parser.parse_args(namespace=json_args)
# Convert dtype string to torch dtype (only if it's still a string)
if isinstance(args.dtype, str):
args.dtype = getattr(torch, args.dtype)
# Remove config and print_config from args before passing to main
args_dict = vars(args)
args_dict.pop("config", None)
args_dict.pop("print_config", None)
main(**args_dict)
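For reference, each entry written by _output_json_for_dashboard is the asdict() of a BenchmarkRecord, so a single forward-latency record has roughly the shape below. The keys mirror the BenchmarkInfo/ModelInfo/MetricInfo dataclasses defined in the diff; every value shown is a made-up placeholder, not data from a real run:

# Illustrative shape of one dashboard record (placeholder values only).
record_fwd_latency = {
    "benchmark": {
        "name": "PyTorch operator microbenchmark",
        "mode": "inference",        # "training" when calculate_bwd_time is set
        "dtype": "bfloat16",
        "extra_info": {
            "input_config": "shape: (2, 16, 1024, 16, 1024, 64), dtype: torch.bfloat16",
            "device": "cuda",
            "arch": "NVIDIA H100",  # torch.cuda.get_device_name(0) in practice
            "operator_name": "efficient",
            "attn_type": "causal",
            "shape": "(2, 16, 1024, 16, 1024, 64)",
            "max_autotune": False,
        },
    },
    "model": {
        "name": "efficient_causal_(2, 16, 1024, 16, 1024, 64)",
        "type": "attention-benchmark",
        "origins": ["pytorch"],
        "extra_info": {"operator_name": "efficient", "attn_type": "causal"},
    },
    "metric": {
        "name": "forward latency",
        "unit": "us",
        "benchmark_values": [123.4],
        "target_value": None,
    },
}

In the code above, such dicts are appended to records and written with json.dump(records, f, indent=2).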

View File

@ -482,6 +482,7 @@ inductor_core_resources = [
"torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp",
"torch/csrc/inductor/inductor_ops.cpp",
"torch/csrc/jit/serialization/pickle.cpp",
"torch/csrc/shim_common.cpp",
]
libtorch_core_sources = sorted(
@ -916,6 +917,7 @@ libtorch_python_core_sources = [
"torch/csrc/autograd/python_torch_functions_manual.cpp",
"torch/csrc/autograd/python_variable.cpp",
"torch/csrc/autograd/python_variable_indexing.cpp",
"torch/csrc/distributed/python_placement.cpp",
"torch/csrc/dynamo/python_compiled_autograd.cpp",
"torch/csrc/dynamo/cache_entry.cpp",
"torch/csrc/dynamo/cpp_shim.cpp",

View File

@ -556,3 +556,26 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
}
} // namespace c10
#include <limits>
namespace std {
template <>
class numeric_limits<c10::SymInt> {
public:
static constexpr bool is_specialized = true;
static constexpr int64_t max() noexcept {
return std::numeric_limits<int64_t>::max();
}
static constexpr int64_t min() noexcept {
return std::numeric_limits<int64_t>::min();
}
static constexpr bool is_signed = true;
static constexpr bool is_integer = true;
};
} // namespace std

View File

@ -1,4 +1,4 @@
// Implementation of specal math functions for Metal
// Implementation of special math functions for Metal
#pragma once
#include <c10/metal/expm1f.h>
#include <c10/metal/igamma.h>
@ -624,6 +624,64 @@ inline T spherical_bessel_j0(T x) {
return static_cast<T>(::metal::sin(x) / x);
}
template <typename T>
inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
T a,
T b) {
float a0 = static_cast<float>(a);
float b0 = static_cast<float>(b);
if (::metal::isinf(a0) && a0 == b0) {
return static_cast<T>(a0);
} else {
float m0 = ::metal::max(a0, b0);
return static_cast<T>(
m0 + ::c10::metal::log1p(::metal::exp(-::metal::abs(a0 - b0))));
}
}
// The function is ported from mlx
template <typename T>
inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
::metal::isnan(b.y)) {
return T(NAN, NAN);
}
T maxval = a.x > b.x ? a : b;
T minval = a.x < b.x ? a : b;
constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;
if (minval.x == -inf || maxval.x == inf) {
return maxval;
}
float2 maxval_ = static_cast<float2>(maxval);
float2 minval_ = static_cast<float2>(minval);
float m = ::metal::exp(minval_.x - maxval_.x);
float2 dexp{
m * ::metal::cos(minval_.y - maxval_.y),
m * ::metal::sin(minval_.y - maxval_.y),
};
return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
}
template <typename T>
inline T logaddexp2(T a, T b) {
constexpr auto log_2 = float(0.693147180559945309417232121458176);
constexpr auto inv_log_2 = float(1) / log_2;
float a0 = static_cast<float>(a);
float b0 = static_cast<float>(b);
if (::metal::isinf(a0) && a0 == b0) {
return static_cast<T>(a0);
} else {
float m0 = ::metal::max(a0, b0);
return static_cast<T>(
m0 +
::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) *
inv_log_2);
}
}
template <typename T>
inline float xlog1py(T x, T y) {
if (::metal::isnan(y)) {
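For context on the scalar kernels added above, they follow the standard numerically stable identities (a reference note, not part of the diff):

\operatorname{logaddexp}(a, b) = \log\bigl(e^{a} + e^{b}\bigr) = m + \log\bigl(1 + e^{-|a-b|}\bigr), \qquad m = \max(a, b)

\operatorname{logaddexp2}(a, b) = \log_{2}\bigl(2^{a} + 2^{b}\bigr) = m + \frac{\ln\bigl(1 + 2^{-|a-b|}\bigr)}{\ln 2}, \qquad m = \max(a, b)

These correspond to the m0 + log1p(exp(-|a0 - b0|)) and m0 + log1p(pow(2, -|a0 - b0|)) * inv_log_2 expressions in the Metal code.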

View File

@ -322,6 +322,24 @@ inline float log1p(float x) {
return rc;
}
// The function is ported from mlx
inline float2 log1p(float2 in) {
float x = in.x;
float y = in.y;
float zabs = ::metal::precise::sqrt(x * x + y * y);
float theta = ::metal::atan2(y, x + 1);
if (zabs < 0.5f) {
float r = x * (2 + x) + y * y;
if (r == 0) { // handle underflow
return {x, theta};
}
return {0.5f * log1p(r), theta};
} else {
auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);
return {::metal::log(z0), theta};
}
}
template <typename T1, typename T2 = T1>
struct pair {
T1 first;
@ -329,17 +347,17 @@ struct pair {
};
template <typename T>
static T conj(T a) {
inline T conj(T a) {
return a;
}
template <>
half2 conj(half2 a) {
inline half2 conj(half2 a) {
return half2(a.x, -a.y);
}
template <>
float2 conj(float2 a) {
inline float2 conj(float2 a) {
return float2(a.x, -a.y);
}

View File

@ -34,7 +34,7 @@ struct MemEvent {
bool overlaps(const MemBlock& a, const MemBlock& b) {
// two blocks dont overlap if
// |---a--------|--------------b--------|
// strat_a end_a <= start_b end_b
// start_a end_a <= start_b end_b
return !(
(a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}

View File

@ -33,7 +33,7 @@ struct bitset final {
constexpr bitset() noexcept = default;
constexpr bitset(const bitset&) noexcept = default;
constexpr bitset(bitset&&) noexcept = default;
// there is an issure for gcc 5.3.0 when define default function as constexpr
// there is an issue for gcc 5.3.0 when define default function as constexpr
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
bitset& operator=(const bitset&) noexcept = default;
bitset& operator=(bitset&&) noexcept = default;

Some files were not shown because too many files have changed in this diff.