Compare commits


390 Commits

Author SHA1 Message Date
31a4f465b4 Change layout of Dummy
[ghstack-poisoned]
2025-11-16 18:00:14 -08:00
800a7e9b71 Add Dummy
[ghstack-poisoned]
2025-11-16 18:00:02 -08:00
cd700e2097 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-11-16 18:00:02 -08:00
f34fa15150 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-11-16 18:00:02 -08:00
bc8ab7cca8 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-28 13:12:02 -07:00
070f5b6640 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-28 13:12:02 -07:00
f8eeb1bd26 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-28 10:54:52 -07:00
a7ae54752d Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-28 10:54:52 -07:00
102faffb43 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-28 10:34:28 -07:00
9f174af10c Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-28 10:34:28 -07:00
7d7025c2a4 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-27 21:58:27 -07:00
fe8d404744 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-27 21:58:27 -07:00
88d425d637 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-27 21:52:50 -07:00
ce0965f230 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system.
2. Add `aoti_torch_library_impl_v2`, which takes an additional `extension_build_version` argument, and update the callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument.
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites.
4. Add private `_from` and `_to` helpers that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land vs. extension-land.

**Note: This PR does not include a linter that tells the user to update from/to when changing the ABI of a type in headeronly; I intend to add that in https://github.com/pytorch/pytorch/pull/163998**

[ghstack-poisoned]
2025-10-27 21:52:50 -07:00
4295a9a158 [xla hash update] update the pinned xla hash (#165895)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165895
Approved by: https://github.com/pytorchbot
2025-10-27 11:47:29 +00:00
90d7be35e9 Update slow tests (#165894)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165894
Approved by: https://github.com/pytorchbot
2025-10-27 11:42:14 +00:00
8d4e48831e Remove JITFunction constexpr and some arg_names (#166280)
https://github.com/triton-lang/triton/pull/8536 breaks torch.compile integration. This PR attempts to fix it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166280
Approved by: https://github.com/jansel
2025-10-27 09:29:03 +00:00
90b30ebf7e Update torch-xpu-ops commit pin (#166129)
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@8d373b](8d373ba272), includes:

- Add CONFIGURE_DEPENDS in install_xpu_headers macro to track these headers
- Add check to ensure P2P Tensors are dense
- Switch philox_engine_inputs usage to philox_xpu_state per XPU graph request
- Add vectorization path for maxpool backward channel last
- Fix SYCL_PRINT macro usable on Windows
- Eliminate unnecessary warning if no AOT enabled

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166129
Approved by: https://github.com/EikanWang
2025-10-27 08:17:03 +00:00
173bcda436 Quick fix of torch.save memory leak (#165204)
Fix the memory leak shown in https://github.com/pytorch/pytorch/issues/149846#issuecomment-3392634572
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165204
Approved by: https://github.com/ezyang
2025-10-27 07:50:58 +00:00
6530bc70fb [DeviceMesh] Implement a device mesh concatenate api for submesh and SPMD use case (#163358)
Today FSDP needs to slice the SPMD mesh out of the root mesh here: https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/_fully_shard/_fsdp_param.py#L301. But what users essentially want is to concatenate some submeshes into a bigger mesh and use it as an SPMD mesh. This PR tentatively implements that API for users.

One thing to note is that all submeshes need to be sliced/flattened or unflattened from the same root mesh, otherwise the indices make no sense when it comes to mesh indexing and device allocation.
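A hedged usage sketch of what this enables; `init_device_mesh` and named-dim indexing are existing APIs, while the `concatenate` call itself is hypothetical here and only inferred from the PR description:

```python
# Sketch only, built from the PR description rather than the actual diff.
# Run under torchrun with 8 ranks; requires an initialized process group.
from torch.distributed.device_mesh import init_device_mesh

# A 3-D root mesh: pipeline x data-parallel x tensor-parallel.
root = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("pp", "dp", "tp"))

dp = root["dp"]  # submesh sliced from the root mesh
tp = root["tp"]  # submesh sliced from the same root mesh

# What FSDP wants is one SPMD mesh assembled from these submeshes instead of
# re-slicing it out of the root mesh by hand, e.g. (hypothetical API from this PR):
# spmd_mesh = DeviceMesh.concatenate([dp, tp])
```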

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163358
Approved by: https://github.com/fegin
2025-10-27 07:39:21 +00:00
4c38887346 [rfc] add debug mode to print meta in fx graphs (#165874)
quite useful in debugging things like unbacked bindings (and presumably other mechanisms that dependent on meta including activation checkpointing and stack trace printing)

<img width="3996" height="748" alt="CleanShot 2025-10-21 at 09 41 54@2x" src="https://github.com/user-attachments/assets/8b885a36-54a5-48b4-a23c-80b39ac7eb12" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165874
Approved by: https://github.com/ezyang
ghstack dependencies: #165893
2025-10-27 07:20:28 +00:00
81fa4a204c Enable Intel GPU on 4 unit test cases (#165405)
For https://github.com/pytorch/pytorch/issues/114850, we will port some aten unit tests to Intel GPU. We enable Intel GPU with the following methods, trying our best to keep the original code style (a minimal sketch follows the list below):

1. Replaced onlyCUDA with onlyOn(['cuda', 'xpu']) for supported tests
2. Added allow_xpu=True for supported test classes in test parameterization.
3. Used torch.accelerator to extend CUDA-specific tests to XPU where needed.
4. Enabled 'xpu' for some test paths
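A minimal sketch of items 1 and 2 above (decorator and parameter names are taken from this description; exact import paths and list support are assumptions, not verified against the diff):

```python
# Hedged sketch of the test-enabling pattern described above.
import torch
from torch.testing._internal.common_device_type import (
    instantiate_device_type_tests,
    onlyOn,  # was: onlyCUDA
)
from torch.testing._internal.common_utils import TestCase, run_tests

class TestExample(TestCase):
    # Before: @onlyCUDA limited this test to CUDA devices.
    # After: the test also runs on Intel GPU (XPU) devices.
    @onlyOn(['cuda', 'xpu'])
    def test_add(self, device):
        x = torch.ones(4, device=device)
        self.assertEqual((x + x).sum().item(), 8.0)

# allow_xpu=True (item 2) opts the XPU device class in during parameterization.
instantiate_device_type_tests(TestExample, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```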

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165405
Approved by: https://github.com/guangyey, https://github.com/ezyang
2025-10-27 06:06:07 +00:00
4e6afa8c07 [BE][Opinfo] Mark [c]double as unsupported for MPS (#166213)
Test plan: Run `python ../test/test_ops.py -v -k test_dtypes___radd___mps` when TestCommon parametrization is enabled for MPS
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166213
Approved by: https://github.com/kulinseth, https://github.com/Skylion007
2025-10-27 05:38:36 +00:00
79aa88cc5d Remove old ROCm version checks and branches (#166111)
This PR removes outdated ROCm version checks and their branches. While there is no explicit mention of a minimum supported version, ROCm 6.4 is listed on the installation page and in the CI yaml files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166111
Approved by: https://github.com/ezyang
2025-10-27 05:32:54 +00:00
fa4cb91846 add support for ir scalar literal parsing for inf/-inf/True/False (#163924)
Currently the IR parser doesn't support parsing IR like
```
graph():
  %12 : float = prim::Constant[value=-inf]()
  %13 : float = prim::Constant[value=inf]()
  %14 : bool = prim::Constant[value=True]()
  %15 : bool = prim::Constant[value=False]()
  return (%12)
```

So the Python script below will throw an error.

```
#!/bin/env python
import torch

def test():
    return [True, False]
f = torch.jit.script(test)
torch._C._jit_pass_constant_propagation(f.graph)
ts_str = f.graph.__repr__()
print(ts_str)
ts = torch.parse_ir(ts_str)
func = torch._C._create_function_from_graph("forward", ts)
ret = func()
assert ret == [True, False]

def test():
    return [float("inf"), float("-inf")]
f = torch.jit.script(test)
torch._C._jit_pass_constant_propagation(f.graph)
ts_str = f.graph.__repr__()
print(ts_str)
ts = torch.parse_ir(ts_str)
func = torch._C._create_function_from_graph("forward", ts)
ret = func()
assert ret == [float("inf"), float("-inf")]
```

I add "inf" and bool cases for IRParser::parseScalarLiteral in irparser.cpp.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163924
Approved by: https://github.com/ezyang
2025-10-27 05:10:21 +00:00
c58d0ad85d Propose Out-of-tree Backend Integration (PrivateUse1) as a module and FFFrog as the maintainer (#165958)
I'd like to propose a new module `Out-of-tree Backend Integration` via `PrivateUse1` device key. The out-of-tree backend integration via `PrivateUse1` device key has been a recommended mechanism of plug-in third-party accelerator devices into PyTorch. There are already quite a few documents/tutorials on the usage with the primary one as https://docs.pytorch.org/docs/main/accelerator/index.html.

We have also seen more and more HW vendors leverage the `PrivateUse1` mechanism to support their accelerators. For example:
1. Ascend NPU
2. Microsoft MAIA
3. MooreThreads MUSA
4. Cambricon MLU

The scope of `PrivateUse1` based out-of-tree backend integration is composed of two parts:
1. `PrivateUse1` device as an out-of-tree backend that involves:
    (a) make `PrivateUse1` a function-complete device as other in-tree devices: i.e., device runtime, autograd, autocast, profiling, distributed, quantization etc.
    (b) a pluggable design to allow out-of-tree integration to extend the functionality of `PrivateUse1` such as a backend registration mechanism that allows user-friendly device naming, runtime extension points with either C++ and Python for third-party to plug-in their runtime implementation, customizable tensor implementation for third-party to add extra info/functionality to the tensor and their serialization.
2. OpenReg: A test suite and documentation effort to guarantee the functional correctness of `PrivateUse1` mechanism and to guide HW vendors with the right implementation.

I'm also proposing @FFFrog as the module maintainer for this new module due to his continuous contributions to the design and implementation of both parts of the module. Below are the RFCs/feature proposals @FFFrog has been working on:
1. [An improvement of PrivateUse1 mechanism, facilitating third-party backend integration](https://docs.google.com/document/d/1_2EO5A2Ww3xDwqbhIvs9Nk65-jV0oNYg3XAmNUsHdAY/edit?tab=t.0#heading=h.5vt8c1vo4dc7)
2. [The interoperability Standard of Third-party Backend Integration Mechanism](9bd181e742/RFC-0037-Interoperability-Standard-of-3rd-Backend-Integration-Mechanism.md)
3. [PyTorch Backend Accelerator Integration Verification and Guidance](f6048cbd4f/RFC-0045-PyTorch-Accelerator-Integration-Enhancements.md)

@FFFrog has contributed 240+ PRs, a majority of which are related to `PrivateUse1` (https://github.com/pytorch/pytorch/pulls?q=is%3Apr+author%3Afffrog+). He has also reviewed 50+ PRs in this area and is the primary author of OpenReg.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165958
Approved by: https://github.com/albanD, https://github.com/malfet, https://github.com/ezyang
2025-10-27 05:00:15 +00:00
000f49551b [DeviceMesh] Use _flatten_rank_map to replace _flatten_mesh_list so that we don't need to compare root mesh (#166003) (#166264)
Summary:

Since we already share a flattened tensor `_rank_map` across all meshes from the same root mesh, we can just use a flattened list of it to replace the comparison of root_mesh and flattened_mesh_list (because with the same `_rank_map` and layout, the mesh tensor is guaranteed to be the same). This way we can also win back the CPU overhead added in https://github.com/pytorch/pytorch/pull/164510 and further simplify the code.

We do have a more ambitious universe-based change here: https://github.com/pytorch/pytorch/pull/165680, but it needs more discussion and would be BC-breaking. We might eventually merge that PR, but probably not now; this change is not BC-breaking and will help concatenate and the 2D integration with concatenate.

cc H-Huang awgu wanchaol fegin wz337 wconstab d4l3k pragupta msaroufim dcci

imported-using-ghimport

Test Plan: Imported from OSS

Differential Revision: D85526705

Pulled By: fduwjj

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166264
Approved by: https://github.com/XilunWu
2025-10-27 03:15:15 +00:00
9940e894ea Fix pyrefly ignore syntax in _inductor (#166247)
Ensures pyrefly ignores only ignore the intended error code.

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166247
Approved by: https://github.com/oulgen
2025-10-27 02:48:42 +00:00
27302a4932 Fix error suppression syntax in onnx, jit, _dynamo (#166249)
Ensures pyrefly will only silence one specific error code

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166249
Approved by: https://github.com/oulgen
2025-10-27 02:01:54 +00:00
507614ba43 Add GraphModule.recompile_submodules, use for regional inductor (#166002)
This makes it so that `GraphModule.recompile()` will also recompile any submodules that are also graph modules, which allows us to pass all existing regional inductor tests without skipping.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166002
Approved by: https://github.com/oulgen
ghstack dependencies: #165996
2025-10-27 01:40:51 +00:00
86f9f1d0ab Enable local tensor model for DTensor redistribute tests (#166081)
The redistribute tests extensively exercise various sharding schemes and
redistribution between them. These tests uncovered more edge cases that
were not supported by the local tensor model, primarily different flavors
of uneven sharding. To handle these cases, this change implements the
missing functional collectives and adds support for the uneven sharding
case where the sharding group (ranks) is larger than the size of the
dimension being sharded. In the latter case the "missing" shards are
represented by zero-sized tensors so that the rest of the local tensor
machinery can stay oblivious to this special case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166081
Approved by: https://github.com/ezyang
2025-10-26 22:21:43 +00:00
154e4d36e9 Fix pyrelfy ignore syntax in distributions and ao (#166248)
Ensures existing pyrefly ignores only ignore the intended error code

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166248
Approved by: https://github.com/oulgen
2025-10-26 22:13:48 +00:00
a2b6afeac5 [dynamo][guards] CLASS_MATCH guard for readability (#166217)
We were using the FUNCTION_MATCH guard for classes. This was very confusing
(although correct).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166217
Approved by: https://github.com/jansel
2025-10-26 18:35:27 +00:00
262830d86c [dynamo] Repro for 166238 (#166252)
xfail repro for https://github.com/pytorch/pytorch/issues/166238

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166252
Approved by: https://github.com/XuehaiPan, https://github.com/jansel
2025-10-26 18:34:22 +00:00
e4c01011c2 Mark FlexAttentionBackward as cacheable (#165996)
This probably should have been marked cacheable a long time ago, no reason that it isn't.

Test Plan:
New regional inductor tests for test_flex_attention now are serializable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165996
Approved by: https://github.com/oulgen, https://github.com/zou3519, https://github.com/drisspg
2025-10-26 14:39:17 +00:00
a60d9e1f6d Fix flake8 B028 warnings (#166224)
This PR fixes the flake8 B028 warnings by specifying stacklevel=2 in `warnings.warn`. The advantage is that users get more contextual information about where PyTorch warnings originate.
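For illustration only (not code from this PR), `stacklevel=2` attributes the warning to the caller of the helper instead of the `warnings.warn` call site:

```python
import warnings

def _check_positive(value):
    if value < 0:
        # stacklevel=2 points the warning at the caller of _check_positive,
        # i.e. the line the user actually wrote, giving more useful context.
        warnings.warn("negative value will be clamped to 0", UserWarning, stacklevel=2)
    return max(value, 0)

_check_positive(-1)  # the emitted warning now references this line
```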

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166224
Approved by: https://github.com/ezyang
2025-10-26 06:18:55 +00:00
f863550192 [dtensor] fix incorrect norm calculation for Partial DTensors (#159856)
The sharding strategies for `aten.linalg_vector_norm` and the optimized `aten._foreach_norm.Scalar` incorrectly assume the norm operation is always "reduction linear" with respect to its inputs. This bug causes the norm to be computed on local, incomplete data for DTensors with a `Partial(sum)` placement, leading to an inflated result (a sum of norms, rather than the correct norm of the sum).

The error can be reproduced with the following script:
```python
import os
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Partial, Replicate, Shard

def setup_distributed():
    """Initializes the distributed environment."""
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    dist.init_process_group("nccl")
    torch.cuda.set_device(local_rank)

    print(f"Initialized process {rank}/{world_size} on GPU {local_rank}")
    return rank, world_size

rank, world_size = setup_distributed()
assert world_size == 2, "Please run with exactly 2 GPUs for this minimal repro."

mesh = init_device_mesh("cuda", (world_size,))

if rank == 0:
    local_partial = torch.tensor([1.0, 3.0], dtype=torch.float32)
else:
    local_partial = torch.tensor([2.0, 1.0], dtype=torch.float32)

partial_dtensor = DTensor.from_local(local_partial, mesh, [Partial("sum")])
partial_result = torch.linalg.vector_norm(partial_dtensor)
print(
    f"[Rank {rank}] partial_result: {partial_result}, full_tensor: {partial_result.full_tensor()}"
)

shard_dtensor = partial_dtensor.redistribute(mesh, [Shard(0)])
shard_result = torch.linalg.vector_norm(shard_dtensor)
print(
    f"[Rank {rank}] shard_result: {shard_result}, full_tensor {shard_result.full_tensor()}"
)

replicate_dtensor = partial_dtensor.redistribute(mesh, [Replicate()])
replicate_result = torch.linalg.vector_norm(replicate_dtensor)
print(
    f"[Rank {rank}] replicate_result: {replicate_result}, full_tensor {replicate_result.full_tensor()}"
)

full_tensor = partial_dtensor.full_tensor()
full_result = torch.linalg.vector_norm(full_tensor)
print(f"[Rank {rank}] correct_result: {full_result}")
```

Run results show that the norm is `sqrt(1**2 + 3**2) + sqrt(2**2 + 1**2) = sqrt(10) + sqrt(5) = 5.398` instead of `sqrt(3**2 + 4**2) = 5`.
```
$ torchrun --local-ranks-filter 0 --nproc-per-node 2 script.py
Initialized process 0/2 on GPU 0
[Rank 0] partial_result: DTensor(local_tensor=3.1622776985168457, device_mesh=DeviceMesh('cuda', [0, 1]), placements=(Partial(sum),)), full_tensor: 5.398345947265625
[Rank 0] shard_result: DTensor(local_tensor=3.0, device_mesh=DeviceMesh('cuda', [0, 1]), placements=(_NormPartial(reduce_op='sum', norm_type=2),)), full_tensor 5.0
[Rank 0] replicate_result: DTensor(local_tensor=5.0, device_mesh=DeviceMesh('cuda', [0, 1]), placements=(Replicate(),)), full_tensor 5.0
[Rank 0] correct_result: 5.0
$ torchrun --local-ranks-filter 1 --nproc-per-node 2 script.py
Initialized process 1/2 on GPU 1
[Rank 1] partial_result: DTensor(local_tensor=2.2360680103302, device_mesh=DeviceMesh('cuda', [0, 1]), placements=(Partial(sum),)), full_tensor: 5.398345947265625
[Rank 1] shard_result: DTensor(local_tensor=4.0, device_mesh=DeviceMesh('cuda', [0, 1]), placements=(_NormPartial(reduce_op='sum', norm_type=2),)), full_tensor 5.0
[Rank 1] replicate_result: DTensor(local_tensor=5.0, device_mesh=DeviceMesh('cuda', [0, 1]), placements=(Replicate(),)), full_tensor 5.0
[Rank 1] correct_result: 5.0
```

This fix simply forces `reduction_linear=False` for partial placements. The output becomes:
```
$ python -m torch.distributed.run --local-ranks-filter 0 --nproc-per-node 2 script.py
Initialized process 0/2 on GPU 0
[Rank 0] partial_result: DTensor(local_tensor=5.0, device_mesh=DeviceMesh((2,), device: 'cuda', stride: (1,)), placements=(Replicate(),)), full_tensor: 5.0
[Rank 0] shard_result: DTensor(local_tensor=3.0, device_mesh=DeviceMesh((2,), device: 'cuda', stride: (1,)), placements=(_NormPartial(reduce_op='sum', norm_type=2),)), full_tensor 5.0
[Rank 0] replicate_result: DTensor(local_tensor=5.0, device_mesh=DeviceMesh((2,), device: 'cuda', stride: (1,)), placements=(Replicate(),)), full_tensor 5.0
[Rank 0] correct_result: 5.0
$ python -m torch.distributed.run --local-ranks-filter 1 --nproc-per-node 2 script.py
Initialized process 1/2 on GPU 1
[Rank 1] partial_result: DTensor(local_tensor=5.0, device_mesh=DeviceMesh((2,), device: 'cuda', stride: (1,)), placements=(Replicate(),)), full_tensor: 5.0
[Rank 1] shard_result: DTensor(local_tensor=4.0, device_mesh=DeviceMesh((2,), device: 'cuda', stride: (1,)), placements=(_NormPartial(reduce_op='sum', norm_type=2),)), full_tensor 5.0
[Rank 1] replicate_result: DTensor(local_tensor=5.0, device_mesh=DeviceMesh((2,), device: 'cuda', stride: (1,)), placements=(Replicate(),)), full_tensor 5.0
[Rank 1] correct_result: 5.0
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159856
Approved by: https://github.com/ezyang
2025-10-26 05:58:44 +00:00
84b14f3a10 Fix error suppression syntax in utils and nn (#166242)
Fixes syntax for pyrefly : ignores so they only ignore a specific category. No functional changes

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166242
Approved by: https://github.com/oulgen, https://github.com/cyyever
2025-10-26 05:21:07 +00:00
5121499f6b Fix pyrefly ignore syntax in /tools/... (#166240)
Second PR for this - only adjusts the syntax used for the ignores so the suppressions hide only one category of pyrefly errors.

test:
pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166240
Approved by: https://github.com/oulgen
2025-10-26 04:20:16 +00:00
8f80892359 Use correct pyrefly syntax in suppressions distributed/... (#166241)
Updates the pyrefy-ignores in the torch/distributed directory to use the correct syntax. No functional changes.

pyrefly check
lintrunner

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166241
Approved by: https://github.com/oulgen
2025-10-26 04:16:41 +00:00
cdb60e44eb [Inductor] Naive foreach autotune support (#162053)
Initial autotuning support for foreach kernels, giving a 4x improvement for some kernels in an internal workload. More improvements can surely be made here in the future. num_warps is removed from the kernel definition to enable autotune support in the generated wrapper code.

Before:
triton_for_fused_18.kd 🔍 | 4.986 ms | 4.986 ms | 2.493 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.098 ms | 0.098 ms | 0.049 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.036 ms | 0.036 ms | 0.018 ms | 2 |

After:
triton_for_fused_18.kd 🔍 | 1.273 ms | 1.273 ms | 0.636 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.044 ms | 0.044 ms | 0.022 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.024 ms | 0.024 ms | 0.012 ms | 2 |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162053
Approved by: https://github.com/mlazos, https://github.com/naromero77amd

Co-authored-by: Nichols A. Romero <nick.romero@amd.com>
2025-10-26 02:36:15 +00:00
25909d2629 Simplify SingletonOrSharedTypePtr (#166183)
@neildhar pointed out at PTC yesterday that the assumption SingletonOrSharedTypePtr makes about shared_ptr's pointers being either both null or both non-null is incorrect because of the aliasing constructor, and furthermore that SingletonOrSharedTypePtr needn't be as fancy as it is because said constructor exists. (See also https://github.com/pytorch/pytorch/issues/166152 .)

Differential Revision: [D85458769](https://our.internmc.facebook.com/intern/diff/D85458769/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166183
Approved by: https://github.com/Skylion007, https://github.com/cyyever
2025-10-26 01:25:24 +00:00
c7eee49525 Fix pyrefly ignores 1/n (#166239)
First diff adjusting the syntax for pyrefly: ignore suppressions so they only hide one class of type error.

Test:
lintrunner
pyrefly check

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166239
Approved by: https://github.com/oulgen
2025-10-26 00:44:10 +00:00
621ba05107 [cuDNN][SDPA] Handle c10::Error when checking device capability for prefer-cuDNN SDPA check (#166201)
The fake device test can execute this function when the number of visible CUDA devices is 0; fix to unblock #165922

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166201
Approved by: https://github.com/Skylion007
2025-10-25 23:00:06 +00:00
39a70cead1 [feat]: add optimized exp_u20 implementation from Arm Optimized Routi… (#161049)
This patch adds an optimized exp_u20() implementation, based on Arm Optimized Routines (AOR). The legacy svexp_f32_z function is removed, and internal uses (such as in tanh) now leverage the new exp_u20() logic. Unit tests have been updated to cover all scenarios.
The implementation ensures correct handling of edge cases by falling back to exp() for extreme inputs (x ≥ 0x1.5d5e2ap+6f or x ≤ -0x1.5d5e2ap+6f).

Performance:
**Performance improvements for `aten::scaled_dot_product_attention` (Neoverse-V2, `OMP_NUM_THREADS=16`)**

Configuration | Current | With Changes (F32) | Speedup
-- | -- | -- | --
Batch 1 · 16 Heads · Seq 512 · Q 128 | 654.102 µs | 551.031 µs | 1.19× faster (≈ 19%)
Batch 8 · 64 Heads · Seq 2048 · Q 128 | 30.308 ms | 17.142 ms | 1.77× faster (≈ 43%)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161049
Approved by: https://github.com/fadara01, https://github.com/jgong5

Co-authored-by: Fadi Arafeh <Fadi.Arafeh@arm.com>
2025-10-25 20:44:11 +00:00
d97f6550a2 [Intel GPU] Xpu matmul implementation for complex dtype (#160867)
Enabling complex datatype support for 4 ops on XPU: `mm`, `bmm`, `addmm`, `baddbmm`. From now on, the implementation will call the functions created in https://github.com/intel/torch-xpu-ops/pull/1992.
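A quick functional check of the newly enabled ops might look like the following (assumes a PyTorch build with XPU support and a visible XPU device; not taken from the PR's tests):

```python
import torch

# Assumption: an XPU device is available ("xpu" would otherwise raise).
a = torch.randn(4, 4, dtype=torch.complex64, device="xpu")
b = torch.randn(4, 4, dtype=torch.complex64, device="xpu")
c = torch.zeros(4, 4, dtype=torch.complex64, device="xpu")

print(torch.mm(a, b))                              # complex matmul on XPU
print(torch.addmm(c, a, b))                        # complex addmm on XPU
print(torch.bmm(a.unsqueeze(0), b.unsqueeze(0)))   # complex batched matmul on XPU
```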

Additionally added complex datatype tests for matmul operators. More detailed tests are going to be enabled in: https://github.com/intel/torch-xpu-ops/pull/1993

CI runs found that the `test_comprehensive_linalg_eig_xpu` tests were internally calling matmul with a complex datatype. With this PR the test starts to pass, so linalg.eig was removed from `inductor_expected_failures_single_sample["xpu"]`; otherwise it would fail with an `Unexpected success` message.

Part of: https://github.com/intel/torch-xpu-ops/issues/1853

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160867
Approved by: https://github.com/guangyey, https://github.com/ZhiweiYan-96, https://github.com/gujinghui, https://github.com/EikanWang, https://github.com/Silv3S, https://github.com/CuiYifeng, https://github.com/jansel
2025-10-25 17:13:13 +00:00
516e58965a Revert "Export flex attention with kwargs and DTensor (#166045)"
This reverts commit de7fdfe41ad12aec719e3662be58ce9e9bf255a8.

Reverted https://github.com/pytorch/pytorch/pull/166045 on behalf of https://github.com/malfet due to Broke distributed tests, see b55b779ad3/1 ([comment](https://github.com/pytorch/pytorch/pull/166045#issuecomment-3446850955))
2025-10-25 15:47:32 +00:00
b55b779ad3 Add file size limits to linters and refactor grep_linter (#166202)
- Add 1GB file size limits to grep_linter, newlines_linter, codespell_linter
- Refactor grep_linter
  - process files once instead of per-line
  - Extract allowlist check to separate function
  - Add 512KB limit for computing replacements, 100 match limit per file
  - Detect duplicate arguments
- Fix .lintrunner.toml: RAWCUDADEVICE used --pattern twice
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166202
Approved by: https://github.com/Skylion007
2025-10-25 14:57:19 +00:00
74e53d0761 [TorchScript] clearer debug for ConcreteModuleType::findSubmoduleConcreteType (#166192)
Summary:
right now the log is just
```
RuntimeError: it != data_.modules_.end() INTERNAL ASSERT FAILED at "fbcode/caffe2/torch/csrc/jit/frontend/concrete_module_type.cpp":207, please report a bug to PyTorch.
```
we have no clue where the error happens
https://fb.workplace.com/groups/gpuinference/posts/789257990578348/?comment_id=789284783909002&reply_comment_id=789415260562621

Test Plan: UT

Reviewed By: jcwchen

Differential Revision: D80020093

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166192
Approved by: https://github.com/gmagogsfm
2025-10-25 14:07:54 +00:00
798a6d2be1 [Inductor][Autotune] Gracefully restart the autotune process after ULF failure (#166073)
This PR partially fixes https://github.com/pytorch/torchtitan/issues/1791, as it will only work with the `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1` setting.

The core of the problem: In `max-autotune` mode Inductor runs multiple benchmarks to determine the best config. If one of these benchmarks fails with `cudaErrorLaunchFailure`, all other CUDA calls within the same process will fail including the rest of the benchmarks.

The solution: Restart the child process gracefully and continue benchmarking.

Unfortunately, if autotuning is done in the main process, the whole program falls into unrecoverable state. In this case, the only way of successful execution would be just preventing the ULF.

Here is some info from [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html):
>cudaErrorLaunchFailure = 719
An exception occurred on the device while executing a kernel. ... . This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166073
Approved by: https://github.com/syed-ahmed, https://github.com/drisspg
2025-10-25 10:40:59 +00:00
b0e9c86971 [MPS] Move hypot to Metal (#166216)
This also prevents crashes when invoked for integer types. For example, before this change the following crashed:
```
python -c "import torch; print(torch.hypot(torch.randint(0, 10, (3,), device='mps'), torch.randint(0, 10, (3,), device='mps')))"
*** Terminating app due to uncaught exception 'NSInvalidArgumentException', reason: '*** -[__NSDictionaryM setObject:forKey:]: object cannot be nil (key: squareRoot_i64)'
*** First throw call stack:
(
	0   CoreFoundation                      0x0000000194d33ae0 __exceptionPreprocess + 176
	1   libobjc.A.dylib                     0x00000001947f6b90 objc_exception_throw + 88
	2   CoreFoundation                      0x0000000194c7d884 -[__NSDictionaryM setObject:forKey:] + 1288
	3   MPSCore                             0x00000001a1187d0c _ZN12MPSKernelDAG15duodenaryCoreOpEP10BaseTensorS1_S1_S1_S1_S1_S1_S1_S1_S1_S1_S1_RKNSt3__16vectorIlNS2_9allocatorIlEEEE11MPSDataTypePKc + 37044
	4   MPSCore                             0x00000001a113fab0 _ZN12MPSKernelDAGD0Ev + 4256
	5   MPSCore                             0x00000001a1139f6c _ZN12MPSKernelDAG13getDAGAndHashEPU21objcproto10MTLLibrary11objc_objectP14MPSDAGKernelOpP19NSMutableDictionaryIP8NSStringPU22objcproto11MTLFunction11objc_objectEP14NSMutableArrayIS6_ERDv4_yPb + 8	6   MPSCore                             0x00000001a113c7a4 _ZN12MPSKernelDAG13getDAGAndHashEPU21objcproto10MTLLibrary11objc_objectP14MPSDAGKernelOpP19NSMutableDictionaryIP8NSStringPU22objcproto11MTLFunction11objc_objectEP14NSMutableArrayIS6_ERDv4_yPb + 1	7   MPSCore                             0x00000001a11c03c8 _ZN10MPSLibrary19CreateUberShaderKeyEP8NSStringRK23MPSFunctionConstantListyPFPU22objcproto11MTLFunction11objc_objectPU21objcproto10MTLLibrary11objc_objectPK13MPSKernelInfoS4_RK33MPSFunctionConstr	8   MPSNDArray                          0x00000001a27b546c MPSSetResourcesOnCommandEncoder + 154176
	9   MPSNDArray                          0x00000001a27967d8 MPSSetResourcesOnCommandEncoder + 28076
	10  MPSNDArray                          0x00000001a2798ec8 MPSSetResourcesOnCommandEncoder + 38044
	11  MetalPerformanceShadersGraph        0x00000001f97689ac _ZN3GPU17IdentityOpHandler15encodeNDArrayOpEPNS_16EncodeDescriptorEP7NSArray + 436
	12  MetalPerformanceShadersGraph        0x00000001f977f93c _ZN3GPU17StitchedOpHandler8encodeOpEPNS_16EncodeDescriptorE + 924
	13  MetalPerformanceShadersGraph        0x00000001f9544898 _ZN16GPURegionRuntime5runOpIN3GPU23AbsoluteSquareOpHandlerEEEvPN4mlir9OperationEPNS1_16EncodeDescriptorE + 120
	14  MetalPerformanceShadersGraph        0x00000001f9543894 _ZN16GPURegionRuntime8encodeOpEPN4mlir9OperationEPN3GPU16EncodeDescriptorE + 4700
	15  MetalPerformanceShadersGraph        0x00000001f954251c _ZN16GPURegionRuntime29encodeOpWithCommitAndContinueEPN4mlir9OperationEPN3GPU16EncodeDescriptorE + 92
	16  MetalPerformanceShadersGraph        0x00000001f954189c _ZN16GPURegionRuntime11evaluateOpsEPN3GPU16EncodeDescriptorEP7NSArrayIP18MPSGraphTensorDataES7_ + 3572
	17  MetalPerformanceShadersGraph        0x00000001f953f7b4 _ZN10MPSRuntime11evaluateOpsEN4mlir4func6FuncOpEP21RuntimeSpecializationP7NSArrayIP18MPSGraphTensorDataES9_P37MPSGraphExecutableExecutionDescriptorP16MPSCommandBufferbbbPb + 824
	18  MetalPerformanceShadersGraph        0x00000001f988dd38 -[MPSGraphExecutable runInternalWithDevice:commandBuffer:feeds:results:executableExecutionDescriptor:mpsGraphOwnedCommandBuffer:] + 3848
	19  MetalPerformanceShadersGraph        0x00000001f988ca04 -[MPSGraphExecutable runInternalWithDevice:commandBuffer:feedsDictionary:resultsDictionary:executableExecutionDescriptor:mpsGraphOwnedCommandBuffer:] + 608
	20  MetalPerformanceShadersGraph        0x00000001f9728aa0 -[MPSGraph runInternalWithMPSCommandBuffer:feeds:targetTensors:targetOperations:resultsDictionary:executionDescriptor:mpsGraphOwnedCommandBuffer:] + 320
	21  MetalPerformanceShadersGraph        0x00000001f9727b58 -[MPSGraph encodeToCommandBuffer:feeds:targetOperations:resultsDictionary:executionDescriptor:] + 188
	22  libtorch_cpu.dylib                  0x00000001556c9478 ___ZN2at3mps9MPSStream15executeMPSGraphEP8MPSGraphP12NSDictionaryS5_NS0_8SyncTypeE_block_invoke + 128
	23  libdispatch.dylib                   0x0000000194a3985c _dispatch_client_callout + 16
	24  libdispatch.dylib                   0x0000000194a2f7a8 _dispatch_lane_barrier_sync_invoke_and_complete + 56
	25  libtorch_cpu.dylib                  0x00000001556c93e0 _ZN2at3mps9MPSStream15executeMPSGraphEP8MPSGraphP12NSDictionaryS5_NS0_8SyncTypeE + 160
	26  libtorch_cpu.dylib                  0x00000001556fd0f4 _ZN2at6native3mpsL14binaryOpTensorERKNS_6TensorES4_S4_NSt3__112basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEEU13block_pointerFP14MPSGraphTensorPNS1_19BinaryOpCachedGraphESD_SD_E + 3040
	27  libtorch_cpu.dylib                  0x00000001556ff680 _ZN2at6native24structured_hypot_out_mps4implERKNS_6TensorES4_S4_ + 84
	28  libtorch_cpu.dylib                  0x00000001522682e4 _ZN2at12_GLOBAL__N_117wrapper_MPS_hypotERKNS_6TensorES3_ + 216
	29  libtorch_cpu.dylib                  0x0000000153a1378c _ZN3c104impl28wrap_kernel_functor_unboxed_INS0_6detail24WrapFunctionIntoFunctor_INS_26CompileTimeFunctionPointerIFN2at6TensorENS_14DispatchKeySetERKS6_S9_EXadL_ZN5torch8autograd12VariableType12_G	30  libtorch_cpu.dylib                  0x0000000151241714 _ZN2at4_ops5hypot4callERKNS_6TensorES4_ + 304
	31  libtorch_python.dylib               0x0000000105d9a848 _ZN5torch8autogradL17THPVariable_hypotEP7_objectS2_S2_ + 752
	32  Python                              0x00000001036afa7c cfunction_call + 72
	33  Python                              0x000000010365db08 _PyObject_MakeTpCall + 124
	34  Python                              0x0000000103750f40 _PyEval_EvalFrameDefault + 23304
	35  Python                              0x000000010374b1c8 PyEval_EvalCode + 184
	36  Python                              0x00000001037ab8bc run_eval_code_obj + 88
	37  Python                              0x00000001037a9994 run_mod + 132
	38  Python                              0x00000001037a8fdc PyRun_StringFlags + 124
	39  Python                              0x00000001037a8f08 PyRun_SimpleStringFlags + 64
	40  Python                              0x00000001037cd464 Py_RunMain + 716
	41  Python                              0x00000001037cd950 pymain_main + 304
	42  Python                              0x00000001037cd9f0 Py_BytesMain + 40
	43  dyld                                0x0000000194836b98 start + 6076
)
libc++abi: terminating due to uncaught exception of type NSException
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166216
Approved by: https://github.com/Skylion007
ghstack dependencies: #166210
2025-10-25 08:51:38 +00:00
661a56002f [AI Codemod][DevmateFBSourceTestFailureBot] Fix for T241916639 ("Your diff, D84932408, broke one test") (#166168)
Reviewed By: XilunWu

Differential Revision: D84983164

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166168
Approved by: https://github.com/H-Huang, https://github.com/fduwjj
2025-10-25 06:46:23 +00:00
c9bc00f016 Split grouped_mm methods into their own file (#166140)
Summary:

`Blas.cpp` was getting a little full and hard to work with, split out
the `*_grouped_mm` methods into their own file

Test Plan:

```
pytest -svv -k group test/test_matmul_cuda.py
pytest -svv -k group test/test_scaled_matmul_cuda.py
```

Reviewers:

Subscribers:

Tasks:

Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166140
Approved by: https://github.com/drisspg
ghstack dependencies: #166139
2025-10-25 05:40:31 +00:00
ec51b139e1 Factor out shared scaled mm routines (#166139)
Summary:

In preparation for splitting out scaled grouped mm functions, factor out
scaled-specific routines into their own file(s)

Test Plan:

```
pytest -svv test/test_scaled_matmul_cuda.py
```

Reviewers:

Subscribers:

Tasks:

Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166139
Approved by: https://github.com/drisspg
2025-10-25 05:40:31 +00:00
eb83c3ca23 Clean up unused Pyrefly suppressions (#166178)
Cleaning up ignores that are no longer needed in the repo and adding select suppressions so the main branch is clean.

test plan:
`lintrunner -a`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166178
Approved by: https://github.com/oulgen
2025-10-25 05:32:21 +00:00
7924e3aacf Remove likely unnecessary _EXPAND trick for non-windows in HIDDEN_NAMESPACE_BEGIN (#166203)
I've learned that the EXPAND trick is needed mostly for an MSVC quirk to properly expand arguments. I tested on Linux locally and suspect that we don't need the _EXPAND for non-Windows. This PR is BE to minimize what we need and remove what we don't, but I'm also okay not landing this if @malfet tells me that this quirk goes beyond MSVC.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166203
Approved by: https://github.com/malfet
ghstack dependencies: #166076, #166077, #166078, #166079
2025-10-25 04:44:07 +00:00
78bcfcf870 [fx] Optimize torch.fx.Node.replace_all_uses_with (#165889)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165889
Approved by: https://github.com/aorenste
2025-10-25 03:44:41 +00:00
1e2e7cb18b Add doc for Symmetric Memory (#166148)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166148
Approved by: https://github.com/fduwjj
2025-10-25 03:41:15 +00:00
003601a70d Set prefer_deferred_runtime_asserts_over_guards to True (#165820)
Set prefer_deferred_runtime_asserts_over_guards to True and allow a flag to control the behavior, just in case.

This option has enabled the gemma3 model export with transformers==4.57. I am not sure how best to test it, though.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165820
Approved by: https://github.com/titaiwangms
2025-10-25 03:38:19 +00:00
1d58d5fe25 [hops] fix unbacked runtime asserts for cond higher order op (#165893)
At a high level after this fix we get the following nice tlparse https://manifold.edge.x2p.facebook.net/v0/read/tree/logs/bobren/54a57665-7dcc-41e0-8ca7-df01393cd4aa/custom/index.html?bucketName=tlparse_reports&apiKey=tlparse_reports-key&withPayload=1&timeoutMsec=10000

As seen in this doc, previously we were simply dropping asserts post-dynamo:
https://docs.google.com/document/d/1nRQwvw_gWL0_9T3VKb5Ly3_tNI1fgqG9WtryeD6qaZI/edit?tab=t.0

The fixes are a couple things:

1) Actually run the runtime assertion fx graph pass on subgraphs
2) Reset the fake mode unbacked memo across speculate subgraph invocations,
   since the memos break runtime assertion insertion: calls like nonzero end
   up not allocating new unbacked symints and hence not populating
   pending_unbacked, which then results in incorrect unbacked_bindings on
   fx nodes in subgraphs.

This is a first step in hardening runtime asserts across all phases of
the compiler (eager, aot_eager, inductor, etc.). I will continue kicking
tires and fixing bugs until we get runtime assert generations in a good
place. One obvious next step: the test case added in this PR fails when
compiled with inductor with the following error (NB: it fails before this PR as well):

```
  File "/data/users/bobren/a/pytorch/torch/_inductor/ir.py", line 659, in get_dtype
    return self.dtype
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
LoweringException: AttributeError: 'ShapeAsConstantBuffer' object has no attribute 'dtype'
  target: cond
  args[0]: Eq(Mod(s77, 4), 0)
  args[1]: Subgraph(name='true_graph_0', graph_module=<lambda>(), graph=<torch._inductor.graph.SubgraphLowering object at 0x7fbcbb11e110>)
  args[2]: Subgraph(name='false_graph_0', graph_module=<lambda>(), graph=<torch._inductor.graph.SubgraphLowering object at 0x7fbcbb21cf70>)
  args[3]: (s77, TensorBox(StorageBox(
    ComputedBuffer(name='buf0', layout=FlexibleLayout('cuda:0', torch.float32, size=[s77, s77], stride=[s77, 1]), data=Pointwise(device=device(type='cuda', index=0), dtype=torch.float32, inner_fn=<function make_pointwise.<locals>.inner.<locals>.inner_fn at 0x7fbcbb2f37f0>, ranges=[s77, s77]))
  )))
```
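For orientation, a minimal sketch of the pattern behind the `Eq(Mod(s77, 4), 0)` predicate above: `torch.cond` with a symbolic-shape predicate under `torch.compile`. This is a simplified illustration, not the PR's test case, and it does not reproduce the unbacked-symint failure:

```python
import torch

def true_fn(x):
    return x.sin()

def false_fn(x):
    return x.cos()

@torch.compile(backend="aot_eager", dynamic=True)
def f(x):
    # Under dynamic shapes, this size check becomes a symbolic predicate
    # (Eq(Mod(s, 4), 0)) feeding torch.cond's true/false subgraphs.
    return torch.cond(x.shape[0] % 4 == 0, true_fn, false_fn, (x,))

print(f(torch.randn(8, 8)).shape)
```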

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165893
Approved by: https://github.com/zou3519
2025-10-25 03:25:36 +00:00
de7fdfe41a Export flex attention with kwargs and DTensor (#166045)
Fixes #165948

Adding registration of the MaskBlock makes flex attention with kwargs exportable.

Also modified unittests to accept kwargs

```
python test/distributed/tensor/test_dtensor_export.py -k test_flex_attention_dtensor_export

python test/inductor/test_flex_attention.py -k test_pytree_
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166045
Approved by: https://github.com/drisspg
2025-10-25 03:17:22 +00:00
b31bad1b8f [Pytorch] Enable autovec on aarch64 for type conversion (#166049)
Summary:
Implementing autovec template for type conversions on aarch64-NEON

Generated code can be seen here: https://godbolt.org/z/1K6T1d9TE

We've seen significant performance improvements for converting to and from bytes, compiling using clang with -march=armv9-a+sve2:

Conversion | Before | After | Throughput gain
-- | -- | -- | --
float->uint8->float | 683.212us | 198.204us | 245% higher
float->int8->float | 687.846us | 200.241us | 244% higher
int32->uint8->int32 | 497.121us | 197.970us | 151% higher
int32->int8->int32 | 481.889us | 198.206us | 143% higher

Test Plan:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Differential Revision: D85213420

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166049
Approved by: https://github.com/ezyang, https://github.com/mcfi, https://github.com/aditew01
2025-10-25 02:55:50 +00:00
2efcf3ca98 Reverts #163712 and forces allgather/scatter inputs/outputs to be contiguous (#166181)
Per title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166181
Approved by: https://github.com/kwen2501
2025-10-25 02:43:10 +00:00
761f946043 [ROCm] new implementation of upsample_bilinear2d_backward (#164572)
Changed the implementation from an output-based approach to an input-based one to remove `atomicAdd` operations, and it appears to deliver at least a 20× speedup.

The changes are from Yu-Yun <YuYun.Chang@amd.com>.

# Summary: Refactor of the implementation of the `upsample_bilinear2d_backward` operation on MI300X/MI325X
- The original "scatter-add" approach
  - Each thread, representing an output pixel, scattered gradient contributions to four input pixels, using costly atomic operations on MI300X/MI325X GPUs.
- The new "gather-sum" approach
  - Each thread is responsible for a single input pixel and gathers all relevant gradient contributions from a small, calculated region of the output tensor (done by the `compute_output_range` device function).
# Breakdown of the code changes
- Inversion of the parallelization strategy of the kernel function `upsample_bilinear2d_backward_out_frame`
  - Originally, the main kernel loop was parallelized over the number of elements in the output gradient tensor (`const size_t o_numel = nc * width2 * height2;`).
    - Each thread processed one output pixel.
  - The new loop is parallelized over the number of elements in the input gradient tensor (`const size_t i_numel = nc * height1 * width1;`).
    - Each thread is responsible for calculating the final gradient for a single input pixel.
  - The kernel launch changes accordingly in the function `upsample_bilinear2d_backward_out_cuda_template`.
- Added a device function for calculating the range of output pixels that could have used the input pixel (`input_pos`) during the forward-pass interpolation
  - This is essentially the mathematical inverse of the forward pass.
  - This function tries to prune a thread's search space so that it only needs to inspect a small, local window of the output tensor.
- Gradient calculation approach switching from "scatter-add" to "gather-sum"
  - Scatter-add
    - For each output pixel, the thread calculated 4 gradient contributions and used `fastAtomicAdd` 4 times to add these values to 4 different (and potentially highly contended) memory locations in the input gradient tensor.
  - Gather-sum
    - A thread responsible for one input pixel calls `compute_output_range` to determine the small rectangular region of output pixels that influence the input's final gradient value.
    - The thread iterates through this region, and for each output pixel in the region, it re-calculates the interpolation weights to determine the exact contribution to its specific input pixel.
    - All these contributions are accumulated into a private, per-thread register variable (`accscalar_t grad_sum = 0;`).
      - Without any global memory access, this accumulation is extremely fast.
    - When the loops are done, the thread performs a single, direct write (non-atomic) of the final summed gradient to its designated location in global memory (`idata[index] = static_cast<scalar_t>(grad_sum);`).
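
Before the performance discussion, here is a toy Python sketch of the two strategies above, using 1D linear interpolation instead of 2D bilinear and ignoring align_corners/edge details; the function names and simplified indexing are for exposition only, not the kernel's actual code:

```python
import torch

def scatter_add_backward(grad_out, n_in, scale):
    # old style: each output element scatters into the input gradient
    # (the CUDA kernel used atomicAdd for these writes)
    grad_in = torch.zeros(n_in)
    for j in range(grad_out.numel()):
        pos = j * scale
        i0 = int(pos)
        i1 = min(i0 + 1, n_in - 1)
        w = pos - i0
        grad_in[i0] += (1 - w) * grad_out[j]
        grad_in[i1] += w * grad_out[j]
    return grad_in

def gather_sum_backward(grad_out, n_in, scale):
    # new style: each input element gathers all of its contributions and does a
    # single non-atomic write (the real kernel prunes the search window with
    # compute_output_range instead of scanning every output element)
    grad_in = torch.zeros(n_in)
    for i in range(n_in):
        acc = 0.0
        for j in range(grad_out.numel()):
            pos = j * scale
            i0 = int(pos)
            i1 = min(i0 + 1, n_in - 1)
            w = pos - i0
            if i == i0:
                acc += (1 - w) * grad_out[j]
            if i == i1:
                acc += w * grad_out[j]
        grad_in[i] = acc
    return grad_in

grad_out, scale = torch.randn(8), 0.5   # upsample 4 -> 8, so scale = 4 / 8
assert torch.allclose(scatter_add_backward(grad_out, 4, scale),
                      gather_sum_backward(grad_out, 4, scale))
```
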
# Why performance gets boosted
- Analysis of the root cause of performance drop
  - Ref. (internal only) - https://amd.atlassian.net/wiki/spaces/~glencao2/pages/1140493327/PyTorch__upsample_bilinear2d_backward
- First and foremost, elimination of the contention of atomic operations
  - Many parallel threads called `atomicAdd` frequently attempting to update the exact same memory location in the input gradient tensor at the same time.
    - The GPU's memory controller has to serialize these operations, effectively nullifying the benefit of parallel execution at those contention points.
  - The MI300X/MI325X chiplet-based CDNA 3 architecture amplified the issue.
    - When contending threads reside on different XCDs, resolving the atomic operation requires high-latency coherence traffic across the Infinity Fabric interconnect.
  - The implementation change eliminates hardware-level serialization and cross-chiplet coherence traffic caused by many `atomicAdd`.
- Improved memory access pattern and locality
  - Write coalescing
    - The regular sum writes `idata[index] = static_cast<scalar_t>(grad_sum);` can be perfectly coalesced by GPUs.
  - Read locality
    - Even though there are many (potentially repeated) reads from the output tensor (`static_cast<accscalar_t>(odata[output_idx])`), these are highly cache-friendly, meaning the data for one thread is likely to be in the L1 or L2 cache already due to an access from a neighboring thread.
- Trade-off: computation for memory synchronization
  - The recalculation of interpolation weights fits well on high-computational-throughput modern GPUs like MI300X/MI325X.
  - Removal of atomic operations avoids expensive memory synchronization.

---

Optimizations of `grid_sampler_2d_backward` will be addressed in a separate PR.
Doc for reference: (internal only) https://amd.atlassian.net/wiki/spaces/~glencao2/pages/1162750701/PyTorch__grid_sampler_2d_backward

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164572
Approved by: https://github.com/jeffdaily

Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2025-10-25 02:39:24 +00:00
8aa465f18e [MPS] Migrate angle to Metal ops (#166210)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166210
Approved by: https://github.com/Skylion007
2025-10-25 01:52:33 +00:00
0a5d68d92d [dynamo] Remove unnecessary NAME_MATCH guard (#166112)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166112
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #166155
2025-10-25 01:27:42 +00:00
42bd210fff [dynamo] Avoid ID_MATCH on methods - use CLOSURE_MATCH on functions (#166155)
`id` on methods can change from invocation to invocation. Here we guard on `__code__` objects, which do not change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166155
Approved by: https://github.com/jansel
2025-10-25 01:27:42 +00:00
1d13c314b3 [OpenReg] Remove the Unnecessary Fallback Implementation for AutogradPrivate1 (#165316)
As the title stated.

The fallback for AutogradPrivateUse1 is built into PyTorch, so there is no need to register a general implementation for out-of-tree backends.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165316
Approved by: https://github.com/ezyang, https://github.com/albanD
ghstack dependencies: #165315
2025-10-25 01:27:27 +00:00
0c9763a5a0 [Autograd] Add Default Autograd Fallback for PrivateUse1 in PyTorch (#165315)
Please refer to this [link](https://github.com/pytorch/pytorch/issues/163979) for more background.

- Allow registering a fallback for AutogradPrivateUse1 multiple times.
- Add an Autograd fallback implementation for AutogradPrivateUse1.

PyTorch can provide a common implementation for AutogradPrivateUse1, and users can override it based on the needs of a specific accelerator.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165315
Approved by: https://github.com/albanD
2025-10-25 01:27:27 +00:00
79a4a9c02e Fix race condition and make CUDA kthvalue deterministic (#165762)
The gatherKthValue kernel had a race condition where multiple threads could write to the same output location without synchronization when duplicate k-th values exist, resulting in non-deterministic output.

Changes:
- aten/src/ATen/native/cuda/Sorting.cu: Use atomicMin with shared memory to deterministically find minimum index. Add early termination and remove redundant inRange checks. (We have to cast the index to `int32_t`, but this is already assumed to fit earlier in the kernel.)
- aten/src/ATen/native/cuda/Sorting.cpp: Remove non-deterministic alert since kthvalue is now deterministic on CUDA.
- torch/__init__.py: Remove kthvalue from non-deterministic operations list and remove kthvalue example from use_deterministic_algorithms() docstring.
- test/test_torch.py: Remove test_nondeterministic_alert_kthvalue since kthvalue no longer raises alerts on CUDA.

Benefits:
- Deterministic: always returns minimum index when duplicates exist
- Potential performance improvement on large arrays with repetitions

Test Results:
- All existing PyTorch tests pass (test_kthvalue)
- Custom determinism tests confirm consistent results
- Custom CUDA vs CPU correctness validated across 50+ scenarios
- Custom performance benchmarks show improvements with no visible regressions

Addresses #165227
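
A quick sketch of the new behaviour (assumes a CUDA device; the expected index follows the description above):

```python
import torch

x = torch.tensor([3.0, 1.0, 1.0, 2.0], device="cuda")
values, indices = torch.kthvalue(x, 2)
# The 2nd-smallest value is 1.0, which appears at indices 1 and 2; per the fix
# above, CUDA now deterministically reports the minimum index.
print(values.item(), indices.item())
```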

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165762
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-25 00:45:57 +00:00
9d0b77f4cd [10/N] Apply ruff UP035 rule (#165709)
This is a follow-up of #165515. ruff `UP035` rules are applied to  dynamo code to use Py 3.10+ typing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165709
Approved by: https://github.com/ezyang
2025-10-25 00:20:13 +00:00
d486eee234 Hide APIs in torch::headeronly (#166079)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166079
Approved by: https://github.com/malfet, https://github.com/cyyever
ghstack dependencies: #166076, #166077, #166078
2025-10-25 00:18:26 +00:00
cddd5f74ab Hide stable Library structs instead of using anon namespace (#166078)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166078
Approved by: https://github.com/malfet
ghstack dependencies: #166076, #166077
2025-10-25 00:18:26 +00:00
dfdb68e51f Hide all APIs in torch::stable (#166077)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166077
Approved by: https://github.com/malfet
ghstack dependencies: #166076
2025-10-25 00:18:26 +00:00
98c818320a Add HIDDEN_NAMESPACE_BEGIN and END macros for hiding header APIs (#166076)
Spurred by the conversation started in https://github.com/pytorch/pytorch/issues/163343.

Context:
* Header implementations may be inlined _but_ are not necessarily inlined, even when using the `inline` keyword.
* When someone wants to use multiple extensions in the same runtime, e.g., with FA3 and AO, then 2 `.so`s are loaded that may have been built with different libtorch versions. Thus, if an API is not inlined and is implemented differently, one implementation will be arbitrarily picked up and used across the runtime, depending on link order. This is bad!
* Consequently, we need to be very good at guaranteeing that we don't modify header implementations within a namespace. This is easy to mess up by accident, which would be a dire mistake.

Solution:
In essence, we want APIs in torch::headeronly and torch::stable to be visible in each individual extension only, and nowhere else. We want to hide these symbols! Thankfully, pybind already solved this problem (thanks @malfet for bringing that to my attention). This PR is heavily inspired by the code in pybind here: e6984c805e/include/pybind11/detail/pybind11_namespace_macros.h (L73-L82).

In this PR, we introduce the macros for defining hidden namespaces in PyTorch.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166076
Approved by: https://github.com/malfet
2025-10-25 00:18:26 +00:00
cc20b7ad72 [FlexFlash] update names (#166193)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166193
Approved by: https://github.com/BoyuanFeng
2025-10-25 00:07:11 +00:00
bc11a42b3f [inductor][ez] fix score fusion memory typo (#166029)
Fix https://github.com/pytorch/pytorch/issues/165724 .
The typo does not affect the compilation result. It just affects compilation time a little bit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166029
Approved by: https://github.com/eellison
2025-10-24 23:48:05 +00:00
4fc06f2e0a Use std::min for #165927 (#166199)
Summary: Like D85463674 (pr https://github.com/pytorch/pytorch/pull/166195) but for D85357351 (https://github.com/pytorch/pytorch/pull/165927)

Differential Revision: D85464917

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166199
Approved by: https://github.com/Camyll, https://github.com/malfet, https://github.com/Skylion007
2025-10-24 23:19:00 +00:00
82473c3d59 [torch.export] Add original module type to UnflattenedModule class (#166145)
Summary: Currently all submodules of an UnflattenedModule have the original type name. This diff adds the original type to the UnflattenedModule itself.

Test Plan:
```
buck test mode/opt caffe2/test:test_export
```
https://www.internalfb.com/intern/testinfra/testrun/17732923654320197

Differential Revision: D85373454

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166145
Approved by: https://github.com/angelayi
2025-10-24 22:47:29 +00:00
b6a4236e5d [label_to_label] minor updates (#166172)
vllm-compile implies "module: vllm" and "oncall: pt2".
The volume of issues in Flex -> HigherOrderOperators is too noisy,
plus we have a different set of folks looking at each, so I'm going to
make that not automatic anymore. We can still manually label flex issues
as higher order operator issues.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166172
Approved by: https://github.com/angelayi
2025-10-24 22:47:23 +00:00
b04173be9b [ONNX] Add a test to backed_size_oblivious patch in onnx (#166196)
Follow-up https://github.com/pytorch/pytorch/pull/166151

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166196
Approved by: https://github.com/justinchuby
2025-10-24 22:47:10 +00:00
32ac38f85d [lint] workflow consistency linter to look at all files instead of just changed files (#165171)
As in title

If you change only one workflow file, lintrunner (default arg, also the one in CI since it only inputs changed files) won't look at other files in the repo, but the sync-tag might come from those other files

This makes it so that it looks at all workflow files so it will catch those failures

Also change output line so it prints which file + which job it is different from

Pros:
catches errors

Cons:
unusual behavior (getting around what lintrunner says the linter should run on)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165171
Approved by: https://github.com/malfet, https://github.com/izaitsevfb, https://github.com/atalman
2025-10-24 21:43:18 +00:00
c9b49e506e [MPS] Add linalg.householder_product for MPS (#166090)
Fixes #166089
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166090
Approved by: https://github.com/malfet
2025-10-24 21:13:56 +00:00
6038e476e8 [Dynamo][Logging]Fix regression on stack adding to latest bytecode by… (#165946)
… adding verbose check (#165926)

[ghstack-poisoned]

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165946
Approved by: https://github.com/williamwen42
2025-10-24 20:36:50 +00:00
2c851c16e5 [FX][ez] fix the split_module tutorial code (#166154)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166154
Approved by: https://github.com/BoyuanFeng
2025-10-24 20:16:04 +00:00
31584f2d91 Add a Claude skill for writing docstrings. (#166175)
Generated with prompt:

> torch/_tensor_docs.py and torch/nn/functional.py contain the "gold standard" for docstrings in the PyTorch project. Write a skill describing how to write a docstring for a function/method in the PyTorch project. Note that add_docstring is specifically for C binded functions; a native Python function can just be a direct docstring. Sphinx is used to generate docs.

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166175
Approved by: https://github.com/Skylion007
2025-10-24 20:05:44 +00:00
0442125362 [Inductor] Restore original dtype for rank-0 CPU tensors (#166118)
# Problem
Inductor implicitly upcasts certain rank-0 kernel arguments from float16 to float32. Currently, this happens only on the `"cpu"` device, which appears to be related to float16 support in CPU Triton. However, it can also affect the behavior of GPU kernels, when a model contains tensors from multiple devices. Upcasting may be undesirable on some platforms, so users can typically disable it with the `config.triton.codegen_upcast_to_fp32` flag. However, this flag was not respected by the rank-0 kernel argument codepath.

Through an improbable series of events, float32 upcasting caused an internal model to fail compilation on MTIA. (Internal reviewers see T242444110.)

# Fix
If `config.triton.codegen_upcast_to_fp32` evaluates to `False`, cast the kernel argument to the original dtype.

# Test plan
Added a new CI test checking for the downcast iff the config flag is false. The test mixes GPU and CPU tensors to generate a GPU kernel with the implicit float32 upcast and explicit float16 downcast.
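
A minimal sketch of exercising the flag together with the mixed-device pattern described above (the specific op and shapes are assumptions; requires a CUDA build):

```python
import torch
import torch._inductor.config as inductor_config

# Keep rank-0 fp16 kernel arguments in their original dtype instead of upcasting to fp32.
inductor_config.triton.codegen_upcast_to_fp32 = False

def f(gpu_x, cpu_scalar):
    # a rank-0 CPU tensor mixed into a GPU op becomes a scalar kernel argument
    return gpu_x * cpu_scalar

compiled = torch.compile(f)
out = compiled(
    torch.randn(8, device="cuda", dtype=torch.float16),
    torch.tensor(2.0, dtype=torch.float16),  # rank-0 CPU tensor
)
```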

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166118
Approved by: https://github.com/jfix71, https://github.com/jansel, https://github.com/kundaMwiza
2025-10-24 19:59:25 +00:00
fdcf402d82 vllm test build (#166146)
Fix the vLLM test build; it's broken due to the flashinfer dependency.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166146
Approved by: https://github.com/huydhn
2025-10-24 19:18:10 +00:00
13cda9b89e Allow BlockDescriptorOptions classes to be overridden In TritonKernel (#165899)
By allowing the options classes (`BlockPtrOptions`/`TensorDescriptorOptions`) to be overridden in `TritonKernel`, subclasses with custom behaviour can be used in place of them, which provides greater flexibility.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165899
Approved by: https://github.com/jansel
2025-10-24 18:59:59 +00:00
fa6d911dda [MPS] Sparse mul enable tests and fix on MPS (#166164)
Apparently the mul tests in test_sparse were disabled. The dense representation (i.e., when nnz is not a scalar) was broken on MPS. This PR fixes it and enables the tests in test_sparse.py.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166164
Approved by: https://github.com/malfet
2025-10-24 18:30:30 +00:00
0db6bcc015 Fix accuracy for layernorm/rmsnorm benchmarking (#166005)
Example command:
    python benchmarks/dynamo/genai_layers/benchmark.py --exit-on-accuracy-failure --tolerance=1e-2 rmsnorm_backward

Fix the accuracy problem for layernorm/rmsnorm fwd/bwd.
Also fix some quack calls (maybe due to quack API change)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166005
Approved by: https://github.com/BoyuanFeng
2025-10-24 18:14:51 +00:00
60ac039998 [CUDA][Grouped Gemm] remove xFail on Group GEMM tests after fallback was added (#165378)
https://github.com/pytorch/pytorch/pull/162059 means we get unexpected successes now on e.g., SM 12.0

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165378
Approved by: https://github.com/Skylion007
2025-10-24 17:42:40 +00:00
380d440d1c Revert "inductor: avoid unrolling argmin/argmax reductions to preserve index … (#164040)"
This reverts commit 9038a30cee56e0d577a666fffa32e990732572d4.

Reverted https://github.com/pytorch/pytorch/pull/164040 on behalf of https://github.com/karthickai due to Kindly add the test case mentioned in the issue ([comment](https://github.com/pytorch/pytorch/pull/164040#issuecomment-3444137989))
2025-10-24 17:14:45 +00:00
9038a30cee inductor: avoid unrolling argmin/argmax reductions to preserve index … (#164040)
…semantics on views; add regression test for transposed mutation (fixes #163929)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164040
Approved by: https://github.com/ngimel, https://github.com/jansel
2025-10-24 16:37:43 +00:00
690c8c13b9 Revert "Export should use aot_export_joint_with_descriptors (#165931)"
This reverts commit 882b834082719afd8ee41769c2cb224bc9031632.

Reverted https://github.com/pytorch/pytorch/pull/165931 on behalf of https://github.com/clee2000 due to breaking internal tests D85084301 for test_auto_functionalize?  I checked that they did run on OSS CI so I'm not entirely sure whats going on, I assume its the IS_FBCODE stuff ([comment](https://github.com/pytorch/pytorch/pull/165931#issuecomment-3443887361))
2025-10-24 16:02:20 +00:00
28ee6b62ed Revert "[DeviceMesh] Implement a device mesh concatenate api for submesh and SPMD use case (#163358)"
This reverts commit 5a4997dcae47acf69c929ac5b081321143bfbf11.

Reverted https://github.com/pytorch/pytorch/pull/163358 on behalf of https://github.com/clee2000 due to probably need to revert this one  too, its stacked with https://github.com/pytorch/pytorch/pull/166003#issuecomment-3443668389 ([comment](https://github.com/pytorch/pytorch/pull/163358#issuecomment-3443874910))
2025-10-24 15:58:54 +00:00
81577bdb3f Revert "[DeviceMesh] Use _flatten_rank_map to replace _flatten_mesh_list so that we don't need to compare root mesh (#166003)"
This reverts commit 8625ffbd45884464f736cfc61300c14f47633641.

Reverted https://github.com/pytorch/pytorch/pull/166003 on behalf of https://github.com/clee2000 due to failing internal tests D85405179 I believe there are uses of _flatten_mesh_list internally that need to be updated ([comment](https://github.com/pytorch/pytorch/pull/166003#issuecomment-3443668389))
2025-10-24 15:14:23 +00:00
e67e3d95f3 Simplify the CUPTI CMake check for kineto (#161370)
Simplify the CUPTI check because kineto has used `CUDA::cupti`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161370
Approved by: https://github.com/Skylion007
2025-10-24 08:13:17 +00:00
27af8480ea Refactor api and configs of overlapping (#166130)
- pass important configs values directly into the class
- migrate those configs from `test_configs` to another class
- add an (off by default) config to enable inside inductor, instead of requiring a custom post pass

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166130
Approved by: https://github.com/bdhirsh
2025-10-24 07:03:54 +00:00
6494cdc40c [DebugMode] add nn.Module tracking (#165498)
Uses ModTracker to record nn.Module entries, much like CommDebugMode.

Can be switched on with `DebugMode(record_nn_module=True)`:
```
    [nn.Mod] Bar
      [nn.Mod] Bar.abc
        [nn.Mod] Bar.abc.l1
          aten::t(t: f32[4, 4])
          aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])
        [nn.Mod] Bar.abc.l2
          aten::t(t: f32[4, 4])
          aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])
      [nn.Mod] Bar.xyz
        aten::t(t: f32[4, 4])
        aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])"""
```
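
A sketch of how a trace like the one above might be produced; DebugMode is a private utility, so the import path and the `debug_string()` accessor are assumptions:

```python
import torch
from torch import nn
from torch.utils._debug_mode import DebugMode  # private API; path assumed

class Abc(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1, self.l2 = nn.Linear(4, 4), nn.Linear(4, 4)

    def forward(self, x):
        return self.l2(self.l1(x))

class Bar(nn.Module):
    def __init__(self):
        super().__init__()
        self.abc, self.xyz = Abc(), nn.Linear(4, 4)

    def forward(self, x):
        return self.xyz(self.abc(x))

with DebugMode(record_nn_module=True) as debug_mode:
    Bar()(torch.randn(4, 4))
print(debug_mode.debug_string())  # assumed accessor for the rendered [nn.Mod] log
```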

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165498
Approved by: https://github.com/SherlockNoMad
2025-10-24 05:08:33 +00:00
ac7074efa2 [CUDA][cuBLAS] Fix a compilation issue in #163955 when CUDA_VERSION < 12010 (#166137)
Summary:
This PR fixes a compilation issue when `CUDA_VERSION < 12010`. Even if we might drop old CUDA support, let's correct the code itself.

## Issue
When `CUDA_VERSION` is less than `12010`, the following does not compile.
```
      mat1_sizes[0] > 1 && mat1_sizes[1] > 1 &&
      mat2_sizes[0] > 1 && mat2_sizes[1] > 1
      #if !(defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM))
      // Note: missing "&&" at the end of the previous line
      mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 &&
```
This patch adds "&&"

Test Plan: CI

Differential Revision: D85356831

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166137
Approved by: https://github.com/ngimel, https://github.com/cyyever
2025-10-24 04:06:03 +00:00
263901cec4 [pytorch/kineto] Update Kineto Submodule (#166150)
Summary: Update to include some race condition fixes.

Test Plan: n/a

Differential Revision: D85390799

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166150
Approved by: https://github.com/sraikund16, https://github.com/cyyever
2025-10-24 04:03:13 +00:00
c12293dcbe [ONNX] Cover all FX passes into backed size oblivious (#166151)
Found a bug where, after `run_decompositions()`, the shape could be fixed to 1. It's caused by the fact that all FX graph surgery related to shape inference should happen inside the backed_size_oblivious patch.

```python
import torch
from transformers.models.phi3.modeling_phi3 import Phi3RMSNorm

# Previous to this PR, this will generate a fixed batch size
op = torch.onnx.export(
    Phi3RMSNorm(256).eval(),
    args=(),
    kwargs={"hidden_states": torch.rand((1, 32, 256))},
    dynamic_shapes={"hidden_states": {0: torch.export.Dim.DYNAMIC, 1: torch.export.Dim.DYNAMIC}},
)

# It is dynamic when it's only in torch.export
with torch.fx.experimental._config.patch(backed_size_oblivious=True):
    ep = torch.onnx.export(
    Phi3RMSNorm(256).eval(),
    args=(),
    kwargs={"hidden_states": torch.rand((1, 32, 256))},
    dynamic_shapes={"hidden_states": {0: torch.export.Dim.DYNAMIC, 1: torch.export.Dim.DYNAMIC}},
)
# But when run_decomposition is called outside of the patch, it is static.
# ep = ep.run_decompositions()
print(ep)

```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166151
Approved by: https://github.com/justinchuby
2025-10-24 03:25:16 +00:00
5a4997dcae [DeviceMesh] Implement a device mesh concatenate api for submesh and SPMD use case (#163358)
Today FSDP needs to slice out the SPMD mesh from the root mesh here: https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/_fully_shard/_fsdp_param.py#L301. But essentially, what users want is to concatenate some submeshes into a bigger mesh used as an SPMD mesh. This PR tentatively tries to implement this API for users.

One thing to note is that all submeshes need to be sliced/flattened or unflattened from the same root mesh; otherwise the indices make no sense when it comes to mesh indexing and device allocation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163358
Approved by: https://github.com/fegin
ghstack dependencies: #166003
2025-10-23 23:31:17 +00:00
47f638eae7 [ROCm] deserialize loads in planer sum portion of stats() of norm (#166021)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166021
Approved by: https://github.com/jeffdaily
2025-10-23 22:47:42 +00:00
882b834082 Export should use aot_export_joint_with_descriptors (#165931)
This diff moves export run_decompositions to use aot_export_joint_with_descriptors instead of aot_export_module. Doing so, I ran into 2 main bugs:
1) aot_export_joint_with_descriptors doesn't correctly pass in the record_nn_module_stack flag that is needed to populate nn_module_stack by switching the internal tracer.
2) When creating symints with negative inputs, we need to pass in positive=False. This didn't matter before because aot_autograd directly returned integer inputs instead of creating symints.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165931
Approved by: https://github.com/zhxchen17
2025-10-23 22:42:11 +00:00
b146ea411e Save GitHub env variables on ROCm (#165821)
As `.github/actions/setup-rocm/action.yml` is now used on `linux_job_v2` to setup ROCm, we need to have this step here to save the list of GitHub env variables.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165821
Approved by: https://github.com/atalman
2025-10-23 22:13:37 +00:00
8625ffbd45 [DeviceMesh] Use _flatten_rank_map to replace _flatten_mesh_list so that we don't need to compare root mesh (#166003)
Since we already share a flattened tensor `_rank_map` across all meshes from the same root mesh, we can just use a flattened list of it to replace the comparison of root_mesh and flattened_mesh_list (because with the same _rank_map and layout, the mesh tensor is guaranteed to be the same). This way we can also give back the CPU overhead added in https://github.com/pytorch/pytorch/pull/164510 and further simplify the code.

We do have a more ambitious universe-based change here: https://github.com/pytorch/pytorch/pull/165680, but it needs more discussion and would be BC-breaking. We might eventually merge that PR, but probably not now; this change is not BC-breaking and will help concatenate and the 2D integration with concatenate.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166003
Approved by: https://github.com/Skylion007, https://github.com/fegin
2025-10-23 20:49:59 +00:00
0977cc4474 [lint] Extend workflowsync linter to more files (#166082)
And fix the lint issues found
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166082
Approved by: https://github.com/izaitsevfb, https://github.com/atalman
2025-10-23 20:29:29 +00:00
d9a55faccc [Pytorch] Add NEON Vectorized<double> translation layers (#166092)
Summary:
Adding NEON specializations of Vectorized<double>

Correcness has been checked using test_ops.py and running torch test

Test Plan:
Correctness:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Performance:

Added torch.float64 as data type to test within binary_test.py

Reviewed By: mcfi

Differential Revision: D84924406

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166092
Approved by: https://github.com/malfet
2025-10-23 20:20:48 +00:00
75b8295868 Revert "Warn if AccumulateGrad stream does not match producer node stream (#165065)"
This reverts commit 12f742941d6aecb72c18d8e602f90ac9b4f00af0.

Reverted https://github.com/pytorch/pytorch/pull/165065 on behalf of https://github.com/clee2000 due to broke internal builds D85273204 usages of TORCH_API void add need to be updated? ([comment](https://github.com/pytorch/pytorch/pull/165065#issuecomment-3438061854))
2025-10-23 17:02:49 +00:00
defb6a80d8 Enable torch.Generator to support pytorch/xla generator implementation (#161369)
Currently, the implementation of `torch.Generator` only supports the "cpu" and "cuda" device types: https://github.com/pytorch/pytorch/blob/main/torch/csrc/Generator.cpp#L55-L61

This change enables `torch.Generator` to support more device types by allowing any device backend to register its own generator factory through a Generator Registry. This is similar to what the "DeviceGuardImpl registry" does today.

# Key Changes:

## New registry API:

* Added GeneratorRegistry.h and GeneratorRegistry.cpp in c10/core/impl.
* API supports registerGenerator(DeviceType, GeneratorFactory), unregisterGenerator(DeviceType), and getGeneratorFactory(DeviceType).
* Uses c10::DeviceType as the key and stores a factory function returning c10::intrusive_ptr<c10::GeneratorImpl>.

## Python/C++ integration:

* The registry is consulted in the torch.Generator constructor path for non-CPU/CUDA devices.
* If a factory is registered for the requested device, it constructs the appropriate generator; otherwise, raises an error.

## Backend extensibility:

* Out-of-tree backends (e.g., torch_xla, torch-directml, torch_npu) can now register their custom generator implementation at module load via a static registrar object.
Example usage:
```cpp
namespace {
  struct Registrar {
    Registrar() {
      at::detail::registerGenerator(c10::DeviceType::XLA, &CreateXlaGenerator);
    }
  } registrar_instance;
}
```

This allows torch.Generator(device='xla') to return an XlaGeneratorImpl when the torch_xla extension is imported.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161369
Approved by: https://github.com/FFFrog, https://github.com/qihqi, https://github.com/albanD
2025-10-23 16:49:28 +00:00
f8fccb1e48 [Code Clean] Clean asserts in torch/optim. (#165629)
Replaces 50 assert statements across 15 files in torch.optim with explicit if-checks raising AssertionError, to prevent assertions from being disabled with the Python -O flag.

Partially fixes #164878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165629
Approved by: https://github.com/albanD
2025-10-23 15:56:29 +00:00
5aac4cfce4 Use is rather than == to work around slow enum comparison in _ops.py (#165936)
This shows up (under _are_we_tracing) in DTensor dispatch. I have some work in flight to speed up enum comparison in pybind11, but `is` is just much faster and easy to use.
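
The pattern in question, shown on a plain Python enum (a generic sketch, not the actual `_ops.py` code):

```python
import enum

class Mode(enum.Enum):  # stand-in enum, not the real type compared in _ops.py
    EAGER = 0
    TRACING = 1

mode = Mode.TRACING

# Both checks are correct because enum members are singletons, but `is` is a
# plain identity check and skips the (comparatively slow) __eq__ dispatch,
# which matters on hot paths like DTensor dispatch.
assert mode == Mode.TRACING
assert mode is Mode.TRACING
```
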
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165936
Approved by: https://github.com/Skylion007, https://github.com/zou3519
2025-10-23 15:01:55 +00:00
baf91bbbfc Revert "[inductor][choices] lookup table choices 1/3 (#164978)"
This reverts commit ab9e466928e7a37844c4f2a8bf90c76d16ac3c34.

Reverted https://github.com/pytorch/pytorch/pull/164978 on behalf of https://github.com/malfet due to Looks like it broke slow tests, see cbcb4f7768/1 ([comment](https://github.com/pytorch/pytorch/pull/164978#issuecomment-3437424559))
2025-10-23 14:47:07 +00:00
cbcb4f7768 [pytorch][torchelastic] Duplicate stdout and stderr and apply custom filter in torchrun (#160712)
Summary:
Part of an effort to extract some important error logs (e.g. [#157996](https://github.com/pytorch/pytorch/pull/157996)) that were `tee`'d to `stdout` and `stderr`.

The general idea is to:

- Duplicate the `tee`s on `stdout` and `stderr` to a separate file, `filtered_stdout.log` and `filtered_stderr.log`, respectively.
- In these files, as its name suggests, only log lines matching a customizable filter.
- Later on in another PR, append the contents of these files to the reply file.

Outline of changes in this PR:

- Enhance `TailLog` to be able to 1) stream to a file, and 2) only write when the line matches the passed filter.
- Add `filtered_stdout` and `filtered_stderr` to `LogsDest` and have `LogsSpecs` `reify` them.
- In `start_processes()` and `PContext`, add params `duplicate_stdout_filters` and `duplicate_stderr_filters` to filter and write the duplicated stream to the files above. When no filters are passed in, no duplicated streams are created.

Test Plan:
```
$ buck test 'fbcode//mode/opt' caffe2/test/distributed/elastic/multiprocessing:api_test
```
```
Buck UI: https://www.internalfb.com/buck2/f5c6b7da-217d-4a0b-872a-c7cd3d05587f
Test UI: https://www.internalfb.com/intern/testinfra/testrun/4222124951617688
Network: Up: 398B  Down: 44MiB  (reSessionID-a489a961-b602-45be-b851-3490ebb7a26a)
Analyzing targets. Remaining     0/200
Executing actions. Remaining     0/12856                                                                                                                                        0.1s exec time total
Command: test.     Finished 1 local
Time elapsed: 17:37.9s
Tests finished: Pass 52. Fail 0. Fatal 0. Skip 0. Build failure 0
```
```
$ buck test 'fbcode//mode/opt' caffe2/test/distributed/elastic/multiprocessing:tail_log_test
```
```
Buck UI: https://www.internalfb.com/buck2/d6d5c1c1-db98-4d9c-b608-7ba6fbb5e3ee
Test UI: https://www.internalfb.com/intern/testinfra/testrun/13510798985149262
Network: Up: 94KiB  Down: 417MiB  (reSessionID-27b46fba-d31c-4c04-8ede-a506454e6922)
Analyzing targets. Remaining     0/3                                                                                                                                            536 actions, 555 artifacts declared
Executing actions. Remaining     0/186                                                                                                                                          1:05.5s exec time total
Command: test.     Finished 7 local, 1 remote, 115 cache (93% hit)                                                                                                              37.0s exec time cached (56%)
Time elapsed: 1:11.5s
Tests finished: Pass 7. Fail 0. Fatal 0. Skip 0. Build failure 0
```

Rollback Plan:

Differential Revision: D80188995

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160712
Approved by: https://github.com/fduwjj
2025-10-23 14:22:21 +00:00
2b93d5b450 [FlexAttention][CUDA] Add flex configs for Blackwell (#165760)
This PR fixes ULFs on `max_autotune` mode for high head-dim sizes on B200. Closes https://github.com/pytorch/torchtitan/issues/1791

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165760
Approved by: https://github.com/syed-ahmed, https://github.com/drisspg
2025-10-23 10:22:06 +00:00
6b7cd48e7e [ROCm] Deserialize loads in planer sum portion of reduce() of norm. (#165927)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165927
Approved by: https://github.com/jeffdaily
2025-10-23 09:45:01 +00:00
bf5aa9e42e [dynamo] Remove ID guard on method object (#166096)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166096
Approved by: https://github.com/tugsbayasgalan
2025-10-23 06:22:49 +00:00
b1eb6dede5 [vision hash update] update the pinned vision hash (#166046)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166046
Approved by: https://github.com/pytorchbot
2025-10-23 04:27:44 +00:00
673060beae [inductor] turn Inductor deterministic mode on with torch.use_deterministic_algorithms (#165950)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165950
Approved by: https://github.com/v0i0, https://github.com/eellison
2025-10-23 02:48:42 +00:00
2e8e9a59a8 Revert "[dynamo][easy] Support torch.accelerator.current_accelerator (#165734)" (#166094)
This reverts commit c18ddfc5721dd91bf29c769e850a99c4fdb6f380.

Discovers some latent issues causing internal failures. Will fix those issues first and resend the PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166094
Approved by: https://github.com/bdhirsh
2025-10-23 01:24:46 +00:00
fb277a5916 Enable new tracer by default (#165332)
Differential Revision: [D84516080](https://our.internmc.facebook.com/intern/diff/D84516080)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165332
Approved by: https://github.com/avikchaudhuri
ghstack dependencies: #165582, #163580
2025-10-23 00:40:29 +00:00
73fa0d0c63 test for #165446 (#165853)
Per title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165853
Approved by: https://github.com/drisspg
2025-10-23 00:08:18 +00:00
36c21cc84e state dict staging fixes (#166025)
Summary:
This PR contains three changes -
1. We were losing the non-blocking flag value and defaulting to False during the deep_copy. This introduces a CUDA synchronize after each tensor, which slows down staging.
2. Adding the capability to skip pinning for scalar tensors to reduce the initial staging buffer creation cost. The threshold defaults to 65 to avoid pinning small tensors.
3. Tensors share storage, but each storage needs to be processed only once in the deep_copy offloading logic, so use a memoization table to cache storage ids (see the sketch below).
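
A much-simplified sketch of the three ideas above (not the actual DCP staging code; it ignores views/offsets within shared storages, and treating the `65` threshold as an element count is an assumption):

```python
import torch

def stage_state_dict(state_dict, min_pin_numel=65):
    staged, seen_storages = {}, {}
    for name, t in state_dict.items():
        key = t.untyped_storage().data_ptr()  # process each storage only once
        if key not in seen_storages:
            host = torch.empty(
                t.shape, dtype=t.dtype, device="cpu",
                pin_memory=t.is_cuda and t.numel() >= min_pin_numel,  # skip pinning tiny tensors
            )
            host.copy_(t, non_blocking=True)  # keep the copy non-blocking
            seen_storages[key] = host
        staged[name] = seen_storages[key]
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # one sync for the whole staging pass, not per tensor
    return staged
```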

Test Plan:
1. Verified non-blocking copies via kineto profile.
2. ran A/B jobs old and new staging with fixes such that it crashes after ever 2 checkpoints and restarts for several hours and compared loss curves and they are exactly identical.
3. tests

Differential Revision: D85180484

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166025
Approved by: https://github.com/pradeepfn
2025-10-22 23:32:41 +00:00
0b68814b44 Forward fix to D80948073 (#166023)
Summary:
realize tensor before accessing layout.

Differential Revision: D85172267

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166023
Approved by: https://github.com/laithsakka
2025-10-22 22:00:53 +00:00
e64a814ae7 [CUDA] Add experimental green context support for SM carveout (#159104)
Low-level PyTorch APIs should be usable/stable enough at this point but we might move the underlying driver API usage a bit from here...

Built on top of @drisspg 's branch

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159104
Approved by: https://github.com/ngimel, https://github.com/malfet, https://github.com/kwen2501

Co-authored-by: drisspg <drisspguessous@gmail.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-10-22 21:38:52 +00:00
0b58d87aec [Submodule] Bump FBGEMM to latest (#165544)
Summary:

* FBGEMM submodule updated to main
* CMake updated to reflect necessary changes
* Notably pulls in NVFP4 grouped gemm kernels

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165544
Approved by: https://github.com/cyyever, https://github.com/jeffdaily
2025-10-22 20:57:15 +00:00
757975ad50 [export] Unified graph capture with fullgraph_capture. (#165562)
Summary:
_dynamo_graph_capture_for_export in its current form has a compatibility issue
with the main torch.compile() path despite reusing fullgraph_capture as the
bytecode tracer. The reason is that we flip on many export-specific flags
and even trace with a wrapped function, which will cause divergence from
torch.compile() again.

This PR instead creates a new implementation of dynamo_graph_capture_for_export
which 100% relies on fullgraph capture and post-processing on CaptureOutput so
that we can avoid the inversion of phases in PT2 compiler stack.

This also benefits the precompile workflow, since we want a feature that
only accepts pytree inputs and ships portable Python wrappers in the package.
In other words, I think the code here is shareable between export and
precompile for exporting portable graphs.

Test Plan:
===================================================================== test session starts =====================================================================
platform linux -- Python 3.12.11, pytest-7.3.2, pluggy-1.6.0
rootdir: /data/users/zhxchen17/pytorch
configfile: pytest.ini
plugins: xdoctest-1.1.0, hypothesis-5.35.1, xdist-3.3.1, subtests-0.13.1, rerunfailures-14.0, flakefinder-1.1.0, cpp-2.3.0, anyio-4.10.0
collected 9 items
Running 9 items in this shard

test/distributed/tensor/test_dtensor_export.py ........x                                                                                                [100%]

================================================================ 8 passed, 1 xfailed in 11.42s ================================================================

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165562
Approved by: https://github.com/tugsbayasgalan
2025-10-22 20:44:55 +00:00
291712026b [dynamo][user_defined] Replace UserFunctionVariable with VariableTracker build (#165706)
Audit: To prevent future issues with functools.partial or callable
objects.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165706
Approved by: https://github.com/Lucaskabela, https://github.com/williamwen42
2025-10-22 19:28:27 +00:00
3e77a2b478 [PyTorch] Improve aarch64 performance of bfloat16 ops (#166028)
Summary:
This PR allows the compiler to better optimize some bfloat16-based operations when run on NEON.

Benchmarks show measurable improvements:

Before:
bfloat16 add: 250.503us
bfloat16 sub: 245.674us
bfloat16 neg: 113.945us

After:
bfloat16 add: 203.862us ---> 23% higher throughput
bfloat16 sub: 201.526us ---> 22% higher throughput
bfloat16 neg: 74.986us ---> 52% higher throughput

Test Plan:
Correctness:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Performance:

binary_test.py has been updated to run bfloat16 benchmarks using basic arithmetic functions

Differential Revision: D85186786

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166028
Approved by: https://github.com/Skylion007
2025-10-22 19:25:33 +00:00
82ef1b5db3 [DebugMode] refactor logs into _DebugCalls (#165376)
Refactors `DebugMode.operators` into more structured `_DebugCall` objects instead of (op, args, kwargs, call_depth) tuples. Useful going forward for attaching more information (e.g. output info, call metadata).

This is BC-breaking, but it attaches an `__iter__` method to `_OpCall` and `_RedistributeCall` so previous tuple-style usage still works.
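
What the BC shim means for existing callers, sketched below (the import path is an assumption):

```python
import torch
from torch.utils._debug_mode import DebugMode  # private API; path assumed

with DebugMode() as debug_mode:
    torch.mm(torch.randn(4, 4), torch.randn(4, 4))

for call in debug_mode.operators:          # now _DebugCall objects
    op, args, kwargs, call_depth = call    # old tuple-style unpacking still works via __iter__
    print(op, call_depth)
```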

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165376
Approved by: https://github.com/yushangdi
2025-10-22 19:01:56 +00:00
5f370f5c42 inductor_provenance: Correctly handle null provenance (#166019)
Summary:
If the provenance is null, we're getting crashes of the form
```
[trainers0]:E1021 10:51:31.990525  2752 PythonApi.h:87] Exception caught in
GeneratedDynamoCompileLoggerConfig: <class
'dsi.logger.py3.GeneratedDynamoCompile.LogEntry.thrift_types.GeneratedDynamoCompileLogEntryThriftBase'>:
error initializing Thrift struct field 'inductor_provenance_thrift_safe':
Cannot create internal string data representation. Expected type <class 'str'>,
got: <class 'NoneType'>.
```

Also fixed a type signature that wasn't being enforced. (It's still not
enforced, but it's accurate).

Test Plan:
Added a new test which reproduces the logging issue

Differential Revision: D85173596

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166019
Approved by: https://github.com/ppanchalia, https://github.com/yushangdi
2025-10-22 18:21:57 +00:00
05b2e02cb4 Revert "[lint] workflow consistency linter to look at all files instead of just changed files (#165171)"
This reverts commit c746feb86a1459db5f6294730d1d72ed15f16dd3.

Reverted https://github.com/pytorch/pytorch/pull/165171 on behalf of https://github.com/clee2000 due to broke lint [GH job link](https://github.com/pytorch/pytorch/actions/runs/18723760085/job/53402955955) [HUD commit link](c746feb86a) ([comment](https://github.com/pytorch/pytorch/pull/165171#issuecomment-3433501457))
2025-10-22 17:47:29 +00:00
12f742941d Warn if AccumulateGrad stream does not match producer node stream (#165065)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165065
Approved by: https://github.com/ngimel
2025-10-22 17:33:27 +00:00
35180fafee Allow GraphPickler to pickle graph modules containing AOTCompiled subgraphs (#165844)
This PR allows GraphPickler to pickle aot_eager graph modules that have regional inductor bits in them, with a few exceptions:
- FlexAttentionBackward isn't marked cacheable, so those tests don't work immediately since we're not sure how to serialize it. But it's safe to serialize/cache, so the next PR fixes those unit tests.
- It seems that when reloading a GraphPickled object, we don't recompile subgraphs. Will investigate this in a future PR

All unit tests in test_regional_inductor are parameterized so that we try serializing and deserializing the returned graph module before returning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165844
Approved by: https://github.com/oulgen
ghstack dependencies: #165843
2025-10-22 17:03:49 +00:00
c746feb86a [lint] workflow consistency linter to look at all files instead of just changed files (#165171)
As in title

If you change only one workflow file, lintrunner (default arg, also the one in CI since it only inputs changed files) won't look at other files in the repo, but the sync-tag might come from those other files

This makes it so that it looks at all workflow files so it will catch those failures

Pros:
catches errors

Cons:
unusual behavior (getting around what lintrunner says the linter should run on)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165171
Approved by: https://github.com/malfet
2025-10-22 16:57:59 +00:00
c5f26db5bf fix #166057: add tmp ptr to avoid gcc internal compiler error (#165717)
Fixes #166057

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165717
Approved by: https://github.com/malfet
2025-10-22 16:38:26 +00:00
18e99b6d45 [dirsync] Switch to top-level xplat/third-party/pthreadpool (#165995)
Summary: `fbcode//xplat/third-party/pthreadpool:` just redirects to the xplat version. Switch to the real location

Test Plan: This should be a no-op, so CI?

Differential Revision: D83999534

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165995
Approved by: https://github.com/bigfootjon, https://github.com/Skylion007
2025-10-22 16:18:23 +00:00
ab9e466928 [inductor][choices] lookup table choices 1/3 (#164978)
\# why

- enable users to control which choices get used on which inputs
- reduce lowering time, and pin kernel selection, by selecting
  them for the inputs

\# what

- a new InductorChoices subclass that implements a lookup table
- a README explaining the usage
- corresponding testing

- currently only supports templates that go through
  `V.choices.get_template_configs`

\# testing

```
python3 -bb -m pytest test/inductor/test_lookup_table.py -v
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164978
Approved by: https://github.com/PaulZhang12, https://github.com/eellison
2025-10-22 16:11:31 +00:00
af4ba78543 [scan x vmap] support scan in vmap (#165580)
This is required by the chunked_with_scan work where two nested vmap(vmap) with chunk sizes > 1 are invoked, which produces a scan -> vmap -> scan -> vmap chain, and we need to handle the cases of vmap(scan) and scan(vmap).

The way we handle vmap(scan) is to turn it into scan(vmap(combine_fn)). The idea is that the combine_fn no longer combines a single slice; instead we vmap over the combine_fn and do multiple combines in one step. We need to know how combine_fn propagates the batched tensor and what the batched dims of the output are. For this purpose, we use restore_vmap to give us the out_dims information.
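
A toy illustration of that rewrite with a hand-rolled Python scan (torch.vmap is the real API; the scan helper, the `add` combine function, and the shapes are just for exposition):

```python
import torch

def scan(combine_fn, init, xs):
    # toy sequential scan over the leading (time) dim; returns the final carry
    carry = init
    for t in range(xs.shape[0]):
        carry = combine_fn(carry, xs[t])
    return carry

add = lambda carry, x: carry + x
xs = torch.randn(4, 5, 3)          # (time, batch, feature)

# vmap over the batch dim of a scan ...
out_a = torch.vmap(lambda x: scan(add, torch.zeros(3), x), in_dims=1)(xs)
# ... equals a single scan whose combine_fn is vmapped over the batch dim.
out_b = scan(torch.vmap(add), torch.zeros(5, 3), xs)

assert torch.allclose(out_a, out_b)
```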

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165580
Approved by: https://github.com/zou3519
ghstack dependencies: #165675
2025-10-22 09:46:00 +00:00
282f39a4bc [vmap][dynamo] use create_proxy instead of create_node in vmap increate nesting ctx manager (#165675)
create_node won't do the automatic closure lifting; this causes problems when the context manager is used in a HOP region. Switch to create_proxy instead.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165675
Approved by: https://github.com/zou3519, https://github.com/guilhermeleobas
2025-10-22 09:46:00 +00:00
a479769488 [dynamo] Clean up assert in dynamo [2/N] (#165745)
Extend from #165430
* #165903(Clean up for graph break)
* ->#165745
* #165430

One main refactor from the previous PR:
* For assertions like checking `len(args)` or `len(kwargs)`, use `raise_args_mismatch` instead of `raise_type_error_exc`

I am also considering moving `raise_type_error_exc` into `utils.py` for consistency.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165745
Approved by: https://github.com/Lucaskabela
2025-10-22 07:12:37 +00:00
26c7375477 Remove the branch of IS_CUSPARSE11_AVAILABLE is False (#166048)
This PR removes the branch when `IS_CUSPARSE11_AVAILABLE` is 0. Note that the condition `ROCM_VERSION >= 60300` holds currently as the minimum supported ROCm is 6.3 .
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166048
Approved by: https://github.com/Skylion007
2025-10-22 07:10:11 +00:00
d01f15152c Move toUnderlying to headeronly (#165694)
As in the title. Required in upper PRs of this ghstack.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165694
Approved by: https://github.com/janeyx99
2025-10-22 05:31:16 +00:00
4fae6968b1 Move toString(ScalarType) and ScalarType ostream operator to headeronly (#164405) (#166018)
This PR is created to replace the reverted PR https://github.com/pytorch/pytorch/pull/164405
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166018
Approved by: https://github.com/janeyx99
2025-10-22 05:16:58 +00:00
f9953e0f61 Enable PLC0414 on ruff (#165828)
This PR enables `PLC0414` that fixes redundant import aliases.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165828
Approved by: https://github.com/albanD
2025-10-22 04:56:52 +00:00
34ed7a8f0d [ROCm] Skip test_blockwise_nvfp4_with_global_scale (#165968)
Disable the fp4 global_scale test till the feature is enabled on ROCm.

Fixes #166027.
Not really, but we're trading an issue for a test skip decorator since the test is parameterized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165968
Approved by: https://github.com/jeffdaily, https://github.com/drisspg
2025-10-22 04:23:05 +00:00
2fde10d914 [ROCm] fix test_allocator_backend (#166035)
Fixes #165872.

Forward fix PR #165298. hipify was causing some symbols to be replaced.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166035
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-22 03:46:23 +00:00
0a93295da0 Update doc (#166024)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166024
Approved by: https://github.com/yiming0416
2025-10-22 03:41:31 +00:00
4b898b51b9 [12/n][take2] : Remove fbandroid_compiler_flags platform args (#165916)
Summary: This diff removes the `fbandroid_compiler_flags`, merges its content with `compiler_flags`, and wraps it in an Android select. My first attempt at this got reverted - D84626885.

Test Plan:
CI and failing builds are now passing
```
buck2 build --target-universe fbsource//fbandroid/apps/wearable/system/healthservices:healthservices_target30_mosnative_xhdpi_arm64_release_debug_keystore_redex_postprocessed_repack_resign @//fbandroid/mode/nosan @//fbandroid/mode/opt @//fbandroid/mode/milan_build_rdk @//fbandroid/mode/relr-relocations fbsource//fbandroid/apps/wearable/system/healthservices:healthservices_target30_mosnative_xhdpi_arm64_release_debug_keystore_redex_postprocessed_repack_resign fbsource//fbandroid/apps/wearable/system/healthservices:healthservices_target30_mosnative_xhdpi_arm64_release_debug_keystore_redex_genrule fbsource//fbandroid/apps/wearable/system/healthservices:healthservices_target30_mosnative_xhdpi_arm64_release_debug_keystore-mobileconfig-definition-resource-gen fbsource//fbandroid/apps/wearable/system/healthservices:healthservices_target30_mosnative_xhdpi_arm64_release_debug_keystore
File changed: fbsource//tools/build_defs/fb_xplat_cxx_library.bzl
Buck UI: https://www.internalfb.com/buck2/509c0b7b-ada3-421a-8c32-2f1d3a7babdd
Network: Up: 1.3MiB  Down: 293MiB  (reSessionID-17f73b81-3c34-4c01-9f6c-2b4f3c8332e3)
Loading targets.   Remaining     0/1311                                                                                                                                                                                                292986 targets declared
Analyzing targets. Remaining     0/13515                                                                                                                                                                                               216715 actions, 359204 artifacts declared
Executing actions. Remaining     0/40415                                                                                                                                                                                               6:33.3s exec time total
Command: build.    Finished 40 local, 790 remote
Time elapsed: 32.0s
BUILD SUCCEEDED
```

Reviewed By: jaejunku

Differential Revision: D84868234

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165916
Approved by: https://github.com/malfet
2025-10-22 03:01:55 +00:00
550e3e6efb [dynamo] Fix MATCH_KEYS for dict pattern matching (#165956)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165956
Approved by: https://github.com/guilhermeleobas, https://github.com/cyyever
2025-10-22 02:52:07 +00:00
715449ca76 [MPS] Fix parity between CPU and MPS on singular matrices in linalg.lu_factor (#165871)
Fixes #165870. Follow up from #165254.

This PR [a] removes the MPS specific version of `lu_factor` in favor of the version in BatchedLinearAlgebra.cpp which uses `lu_factor_ex`, and [b] updates `lu_factor_ex` error codes to match expectations.

When `lu_factor` was first implemented for MPS (#99269), it bypassed the implementation in BatchedLinearAlgebra.cpp since we did not have `lu_factor_ex`. Since #144651 implements `lu_factor_ex`, we can now remove the MPS specific wrapper.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165871
Approved by: https://github.com/kulinseth, https://github.com/albanD
2025-10-22 02:48:40 +00:00
84d8d06fc3 Fixes floating point exception in torch.nn.PixelShuffle (#163154)
Fixes #162251

**Previous Output:**
`Floating point exception (core dumped)`

**Now Output:**
`RuntimeError: upscale factor is too large, (upscale_factor)^2 overflowed: upscale_factor=545460846592`
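
A minimal reproducer in the spirit of the linked issue (the input shape is an assumption; the upscale factor is the one from the error message above):

```python
import torch

shuffle = torch.nn.PixelShuffle(upscale_factor=545460846592)
x = torch.randn(1, 1, 1, 1)
try:
    shuffle(x)
except RuntimeError as e:
    print(e)  # now raises instead of crashing with a floating point exception
```
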
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163154
Approved by: https://github.com/cyyever, https://github.com/albanD
2025-10-22 02:22:16 +00:00
60992d98b2 [dynamo][remaining] Replace UserFunctionVariable with VariableTracker build (#165896)
Audit: To prevent future issues with functools.partial or callable objects.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165896
Approved by: https://github.com/Lucaskabela
2025-10-22 02:13:00 +00:00
59e015e3a1 Remove outdated CUB macros (#164656)
This PR removes `CUB_SUPPORTS_NV_BFLOAT16` and `CUB_SUPPORTS_FUTURE_VALUE` because they are always true on CUDA >=12 installations with its CUB version. Their branches are also removed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164656
Approved by: https://github.com/albanD, https://github.com/eqy, https://github.com/jeffdaily
2025-10-22 02:02:50 +00:00
8904a5a7c9 Move allocation size config to AllocatorConfig for cross-allocator sharing (#159553)
# Motivation
Make CUDA and XPU share the same config and code. And allow the other backends to reuse them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159553
Approved by: https://github.com/albanD
ghstack dependencies: #160067
2025-10-22 01:48:56 +00:00
f5df9ca03a Fix creation of BINARY_SUBSCR in Python 3.14+ (#165864)
Python 3.14 replaced `BINARY_SUBSCR` with `BINARY_OP(NB_SUBSCR)`.
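
One way to observe the change from Python itself (a sketch; the expected opcode names follow the description above):

```python
import dis
import sys

def subscript(x):
    return x[0]

print(sys.version_info)
# On 3.13 and earlier this disassembles to BINARY_SUBSCR; on 3.14+ it shows
# BINARY_OP with a subscript argument instead, which is what Dynamo's
# bytecode generation had to account for.
dis.dis(subscript)
```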

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165864
Approved by: https://github.com/williamwen42
2025-10-22 01:43:03 +00:00
2998abd777 [Code Clean] Better error handling in torch/csrc/distributed (#165053)
Replace vanilla C++ runtime_error exceptions with TORCH_CHECK.
Including:

torch/csrc/distributed/*

Partially fixes #148114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165053
Approved by: https://github.com/FFFrog, https://github.com/albanD
2025-10-22 01:40:36 +00:00
e13580e41c [AMD] Run int4_mm tests only for compatible arch (#165630)
Such tests should be skipped on the rest, including gfx1100 (Navi3x).

Fixes for CI HUD for gfx1100

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165630
Approved by: https://github.com/jeffdaily

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2025-10-22 01:38:55 +00:00
f3b8e15f20 [AMD][gfx1100] test_decompose_mem_bound_mm.py tolerance increase (#165625)
test_decompose_mem_bound_mm.py tolerance increase for navi3x(gfx11x)

(cherry picked from commit 03c7da05f61890bbf5ae41e23c8df6d5f6805bac) from

Fixes for CI HUD for gfx1100

Signed-off-by: Artem Kuzmitckii <artem.kuzmitckii@amd.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165625
Approved by: https://github.com/jeffdaily

Co-authored-by: iupaikov-amd <Iurii.Paikov@amd.com>
Co-authored-by: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com>
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-22 01:38:48 +00:00
5211f4c108 [MPS] Fix SDPA fp16 overflow (#165961)
Do not cast the intermediate result back to the lower precision data type until the softmax is finished, otherwise it might produce NaN

Adjust the test to use 256 as filler value rather than 64
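
A hedged repro sketch of the overflow mode described in the linked issue (assumes a machine with an MPS device; the 256 filler value mirrors the adjusted test):

```python
import torch
import torch.nn.functional as F

# Large-magnitude fp16 inputs: if the intermediate attention scores are cast
# back to half before the softmax completes, they can overflow to inf and the
# output becomes NaN.
q = torch.full((1, 1, 8, 64), 256.0, dtype=torch.float16, device="mps")
out = F.scaled_dot_product_attention(q, q.clone(), q.clone())
print(torch.isnan(out).any())  # expected: False after the fix
```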

Fixes https://github.com/pytorch/pytorch/issues/160841
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165961
Approved by: https://github.com/dcci, https://github.com/Skylion007
ghstack dependencies: #165960
2025-10-22 01:29:42 +00:00
ad9027b80d [BE] Remove unused 'rows' parameter from spmm_bmm_coo_rows_grouped (#166041)
To fix following compilation warning
```
/Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/sparse/mps/kernels/Mul.metal:76:14: warning: unused variable 'B' [-Wunused-variable]
  const uint B = dims.x;
             ^
/Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/sparse/mps/kernels/Mul.metal:65:26: warning: unused parameter 'rows' [-Wunused-parameter]
    device const long*   rows      [[buffer(0)]],
                         ^
2 warnings generated.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166041
Approved by: https://github.com/Skylion007
2025-10-22 00:59:41 +00:00
a1005427bf [xpu] Support high stream for ProcessGroupXCCL (#163049)
Add high priority stream support for ProcessGroupXCCL. Just like CUDA, XPU streams also support execution with higher priority compared to other streams. The implementation is in https://github.com/intel/torch-xpu-ops/pull/1715; this PR adds the registration here.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163049
Approved by: https://github.com/guangyey, https://github.com/gujinghui, https://github.com/EikanWang, https://github.com/albanD
2025-10-22 00:54:25 +00:00
35153d0846 Simplify c10::guts::apply (#164566)
There is only one call site of `c10::guts::apply` that can be replaced by `std::apply` except for ROCm. This PR therefore simplifies the implementation of `c10::guts::apply`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164566
Approved by: https://github.com/Aidyn-A, https://github.com/albanD
2025-10-22 00:47:43 +00:00
7773a22cdb Revert "[AMP][Refactor] Autocast dtype handling to simplify device-specific c… (#165221)"
This reverts commit 4be1e3bf926b8e798fede3be6a3051560e9e00c5.

Reverted https://github.com/pytorch/pytorch/pull/165221 on behalf of https://github.com/clee2000 due to I think this broke test_openreg [GH job link](https://github.com/pytorch/pytorch/actions/runs/18698271058/job/53322459496) [HUD commit link](4be1e3bf92) note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/165221#issuecomment-3430012693))
2025-10-22 00:26:57 +00:00
7cb467a169 [CI] Update ONNX CI packages to latest (#165883)
This PR updates ONNX related packages to their latest versions used in CI environments.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165883
Approved by: https://github.com/justinchuby, https://github.com/albanD
2025-10-22 00:25:35 +00:00
12aac12b8d [Code Clean] Replace std::runtime_error with TORCH_CHECK (#165209)
Including:
1. `aten/src/ATen/core`
2. `c10/core`

Fixes part of #148114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165209
Approved by: https://github.com/FFFrog, https://github.com/albanD
2025-10-22 00:05:22 +00:00
2b748d0a56 Add operator name to output json (#164583)
For the benchmarks, model_name on the dashboard needs to be grouped with operator_name. This PR passes an additional operator_name argument to the output JSON for grouping.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164583
Approved by: https://github.com/yangw-dev
2025-10-21 23:58:39 +00:00
16745a882a [aoti][win] add support for a list of shim libraries (#165914)
As titled, support passing in a list of shim libraries when cross-compiling artifacts

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165914
Approved by: https://github.com/desertfire
2025-10-21 22:55:17 +00:00
8daef35cf1 Revert "[Code Clean] Clean asserts in torch/ao/quantization (root, quantizer, backend_config) (#165433)"
This reverts commit df64c0c4649984093bd1a46f1e9c658c72018200.

Reverted https://github.com/pytorch/pytorch/pull/165433 on behalf of https://github.com/clee2000 due to I think this broke some quantization tests ([comment](https://github.com/pytorch/pytorch/pull/165433#issuecomment-3429741770))
2025-10-21 22:10:19 +00:00
51319ca090 [Pytorch] Add NEON Vectorized<uint> family of translation layers (#165690)
Summary:
Adding NEON specializations of Vectorized<T> for uint8, uint16, uint32 and uint64.

Correctness has been checked using test_ops.py

operator_benchmark_test.py, which uses the PyTorch API, shows significant enhancements in some operations:

Before:

uint8 mul: 1460.751us
uint8 add: 2359.565us
uint8 lsl: 2151.206us

After:

uint8 mul: 194.792us ---> 650% higher throughput
uint8 add: 195.609us ---> 1100% higher throughput
uint8 lsl: 186.249us ---> 1055% higher throughput

Test Plan:
Correctness:

buck2 test mode/opt //caffe2/test:test_ops
buck2 test mode/opt //caffe2/test:torch

Performance:

buck2 run mode/opt //caffe2/benchmarks/operator_benchmark/fb:operator_benchmark_test

Reviewed By: mcfi

Differential Revision: D84770153

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165690
Approved by: https://github.com/malfet
2025-10-21 21:46:55 +00:00
d311a3d1dc A temporary fix to autotune out of range and related IMA (#165943)
Summary:
Autotune issue during lowering w/ AOTI:
```
setStorage: sizes [1536, 32, 8192], strides [8192, 8192, 1], storage offset 0, and itemsize 2 requiring a storage size of 25673728 are out of bounds for storage of size 25362432
```
We need a hack to create a new base tensor with sufficient storage

Test Plan: Finally able to see the e2e test pass on CI. See the detailed Test Plan in D83520844

Differential Revision: D84872792

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165943
Approved by: https://github.com/laithsakka
2025-10-21 21:40:20 +00:00
04adfe5ba9 Make Backend::setGroupUid virtual (#165957)
As titled, so that we may customize this function in custom backends

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165957
Approved by: https://github.com/d4l3k
2025-10-21 21:33:24 +00:00
4be1e3bf92 [AMP][Refactor] Autocast dtype handling to simplify device-specific c… (#165221)
This PR refactors the autocast context manager in autocast_mode.py to simplify and centralize the logic for checking supported dtypes for each device. The previous implementation repeated similar checks for multiple device types. Now, a single mapping device_supported_dtypes is used to associate device types with their supported dtypes, and the validation logic is unified.
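
A minimal sketch of the centralized check described above (the mapping name follows the PR description; its contents and the exact handling are illustrative, not the actual code in autocast_mode.py):

```python
import warnings
import torch

# One mapping from device type to supported autocast dtypes replaces the
# repeated per-device if/elif checks (illustrative entries only).
device_supported_dtypes = {
    "cpu": (torch.bfloat16, torch.float16),
    "cuda": (torch.bfloat16, torch.float16),
}

def _check_autocast_dtype(device_type: str, dtype: torch.dtype) -> bool:
    supported = device_supported_dtypes.get(device_type, ())
    if dtype not in supported:
        # Unified validation path instead of device-specific branches.
        warnings.warn(
            f"In {device_type} autocast, but the target dtype is not supported. "
            "Disabling autocast."
        )
        return False
    return True
```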

**The former PR #163446 was merged but reverted due to failed CI test on `openreg` related tests.**

This PR additionally modifies some test assertions slightly so that the CI tests pass. CI failed because it asserted on the exact error message. For example:
```
File "/var/lib/jenkins/workspace/test/cpp_extensions/open_registration_extension/torch_openreg/tests/test_autocast.py", line 9, in test_autocast_with_unsupported_type
    with self.assertWarnsRegex(
        AssertionError: "In openreg autocast, but the target dtype torch.float32 is not supported." does not match "In openreg autocast, but the target dtype is not supported. Disabling autocast."
```

Sorry for the inconvenience again.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165221
Approved by: https://github.com/FFFrog, https://github.com/albanD
2025-10-21 21:32:12 +00:00
e7592f4005 [CI] Move the periodic debug tests to newer runner (#165158)
Previously g3 = NVIDIA Tesla M60
Now g6 = NVIDIA L4
Also change cuda arch list accordingly

Pros:
More memory, newer GPU

Cons:
That was one of the few remaining tests on g3 runners, so we probably lost coverage?

We can probably run more tests in parallel now but I'm not going to do that here

Disabled a bunch of sparse and nestedtensor tests that were previously skipped, presumably due to not having sufficient hardware. They are now failing with
```
Traceback (most recent call last):
  File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3293, in wrapper
    method(*args, **kwargs)
  File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3292, in wrapper
    with policy():
  File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 2532, in __enter__
    self.beforeStreams[-1].synchronize()
  File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/cuda/streams.py", line 105, in synchronize
    super().synchronize()
torch.AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from stream_synchronize at /var/lib/jenkins/workspace/c10/cuda/CUDAFunctions.h:120 (most recent call first):
C++ CapturedTraceback:
#4 std::_Function_handler<std::shared_ptr<c10::LazyValue<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > const> (), c10::SetStackTraceFetcher(std::function<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) from Logging.cpp:0
#5 c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) from ??:0
#6 c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, unsigned int, bool) [clone .cold] from CUDAException.cpp:0
#7 THCPStream_synchronize(_object*, _object*) from Stream.cpp:0
#8 cfunction_vectorcall_NOARGS from /usr/local/src/conda/python-3.10.14/Objects/methodobject.c:489
#9 _PyObject_VectorcallTstate from /usr/local/src/conda/python-3.10.14/Include/cpython/abstract.h:114
#10 _PyEval_EvalFrame from /usr/local/src/conda/python-3.10.14/Include/internal/pycore_ceval.h:46
#11 _PyObject_VectorcallTstate from /usr/local/src/conda/python-3.10.14/Include/cpython/abstract.h:114
#12 _PyEval_EvalFrame from /usr/local/src/conda/python-3.10.14/Include/internal/pycore_ceval.h:46
```
when run with cuda launch blocking I got a ton of stuff like
```

/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [5,3,0], thread: [2,7,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [5,3,0], thread: [3,7,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [0,0,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [1,0,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [2,0,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [3,0,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [0,1,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [1,1,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [3,1,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [0,2,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [2,2,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [3,2,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [0,3,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [1,3,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [1,4,0] Assertion `value < upper_bound` failed.
/var/lib/jenkins/workspace/third_party/cutlass/include/cutlass/integer_subbyte.h:124: cutlass::integer_subbyte<Bits, Signed>::integer_subbyte(unsigned int) [with int Bits = 2; __nv_bool Signed = false]: block: [3,8,0], thread: [3,4,0] Assertion `value < upper_bound` failed.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165158
Approved by: https://github.com/seemethere
2025-10-21 21:28:12 +00:00
d334c3649d [CUDA] fix reflection padding for large batch size (#165942)
Fixes [#165861](https://github.com/pytorch/pytorch/issues/165861)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165942
Approved by: https://github.com/eqy
2025-10-21 21:07:38 +00:00
9f82535c5a [ROCm] [Normalization] Update block size (#165941)
* Seeing up to 6x improvement

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165941
Approved by: https://github.com/jeffdaily
2025-10-21 20:53:05 +00:00
5b35fc8777 Support multiple commits on push events in trunk tagging workflow (#165937)
Context:
* this workflow is used to create tags like `trunk/{sha}` for all `main` commits
* those tags are used by [autorevert](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md) to rerun selected workflows

Problem: currently the workflow creates only a single tag per push event, while ghstack pushes multiple commits per single push.

This PR supports tag creation for all commits in the push event.

Complimentary autorevert PR: https://github.com/pytorch/test-infra/pull/7291

---

### Testing

I created an identical copy of this workflow in my personal repo: https://github.com/izaitsevfb/pr-head-test/actions/workflows/trunk-tagging.yml

See action runs there.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165937
Approved by: https://github.com/huydhn
2025-10-21 20:52:34 +00:00
2f38eece7c [CUDA][cuBLAS] addmm -- some refactoring for easier navigation between the Lt and non-Lt paths (#163955)
As per title. Additionally, some Lt selection conditions are revisited, and some redundancy removed (especially in the ROCm vs non-ROCm paths).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163955
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-21 20:48:12 +00:00
830e789a55 [dynamo][annotate] Graph break cleanly on fx.traceback.annotate reconstruction (#166006)
This avoids generating bad bytecode, which leads to a really confusing error. I am not sure why we can't reconstruct cleanly; it has to do with the input being a dict, while other supported ctx managers take bools.

Fixing that is for another day. Let's give a good error message for now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166006
Approved by: https://github.com/yushangdi, https://github.com/SherlockNoMad
2025-10-21 20:48:04 +00:00
ad4dc52bf6 Revert "shrink_group implementation to expose ncclCommShrink API (#164518)"
This reverts commit 4e643422f63a3cdd71bd141615f98de6bb54d15f.

Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/albanD due to Breaks lint ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3429426503))
2025-10-21 20:24:14 +00:00
dac9ed9790 Bump uv from 0.8.6 to 0.9.5 in /.ci/lumen_cli (#166017)
Bumps [uv](https://github.com/astral-sh/uv) from 0.8.6 to 0.9.5.
- [Release notes](https://github.com/astral-sh/uv/releases)
- [Changelog](https://github.com/astral-sh/uv/blob/main/CHANGELOG.md)
- [Commits](https://github.com/astral-sh/uv/compare/0.8.6...0.9.5)

---
updated-dependencies:
- dependency-name: uv
  dependency-version: 0.9.5
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-10-21 13:16:30 -07:00
1c7fe8f861 [BugFix] chunk_size should always be int64_t (#165971)
Inspired by https://github.com/pytorch/pytorch/pull/156872
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165971
Approved by: https://github.com/albanD
2025-10-21 19:52:47 +00:00
4e643422f6 shrink_group implementation to expose ncclCommShrink API (#164518)
Closes #164529

To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch.

This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization.

For more info:  [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518
Approved by: https://github.com/kwen2501
2025-10-21 19:47:33 +00:00
3c3b278872 [reland][fx] Move Node._prepend/Node._remove_from_list to C++ (#165882)
Relands #148261 that was reverted by #150542

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165882
Approved by: https://github.com/ezyang
2025-10-21 19:43:55 +00:00
0bd12c1168 [CI] Extend test_transfomers to MPS (#165960)
Just skip grad_checks as they need float64
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165960
Approved by: https://github.com/Skylion007
2025-10-21 19:27:44 +00:00
ce8a7764e2 Revert "[dynamo][misc] Replace UserFunctionVariable with VariableTracker build (#165707)"
This reverts commit 1290b077f26543a34262587137ef64ca9ca5e17d.

Reverted https://github.com/pytorch/pytorch/pull/165707 on behalf of https://github.com/clee2000 due to failing internal tests D85160820 ([comment](https://github.com/pytorch/pytorch/pull/165707#issuecomment-3429084393))
2025-10-21 19:25:03 +00:00
d1269a0434 update fr trace analysis (#165994)
Summary:
- allow empty entries from ranks
- allow not all ranks to provide dump

---
[//]: # (BEGIN SAPLING FOOTER)
Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/pytorch/pytorch/pull/165994).
* #165638
* #165640
* #165642
* __->__ #165994
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165994
Approved by: https://github.com/fduwjj
2025-10-21 19:14:33 +00:00
c87cf1be32 Update workaround to old CUDA bug (#164354) (#165984)
The workaround cannot be removed because of BC. Here we update the PyTorch code base to not use the workaround.

See https://github.com/pytorch/pytorch/pull/164354 for the BC breakage issue.

Resolves https://github.com/pytorch/pytorch/issues/164348.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165984
Approved by: https://github.com/janeyx99
2025-10-21 19:09:43 +00:00
2fc5e45a41 better error message when there is no pytree impl (#165955)
Differential Revision: [D85117597](https://our.internmc.facebook.com/intern/diff/D85117597)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165955
Approved by: https://github.com/avikchaudhuri
2025-10-21 18:49:22 +00:00
f9022ba93b [PyTorch] Add user_metadata display to memory visualizer (#165939)
Summary: Enhanced the PyTorch CUDA memory visualizer to display user_metadata alongside stack frames when inspecting allocations. The user_metadata field is now shown in all views (Allocator State History, Active Memory Timeline, etc.) with consistent formatting. The implementation handles both string and object metadata types, displaying strings directly and objects as key-value pairs.

Test Plan:
1. Generate a memory snapshot with user_metadata
2. Open the memory visualizer in a browser
3. Load the snapshot file
4. Verify user_metadata appears
5. Test with both string metadata ("testing") and object metadata ({"key": "value"})
6. Verify formatting shows "User Metadata:\n  <value>" for strings

 {F1982860439}

Differential Revision: D85095152

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165939
Approved by: https://github.com/yushangdi
2025-10-21 18:48:33 +00:00
ff8be889ad Remove unused exception parameter from some files, to work with -Wunused-exception-parameter (#165770)
Summary: Address compiler complaints that were coming up, to unblock the build.

Test Plan:
before the change
```
aten/src/ATen/native/LinearAlgebra.cpp:3623:36: error: unused exception parameter 'e' [-Werror,-Wunused-exception-parameter]
 3623 |     } catch (const std::exception& e) {
      |
```

after: targets build with `-Wunused-exception-parameter`

Differential Revision: D84876246

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165770
Approved by: https://github.com/Skylion007, https://github.com/cyyever

Co-authored-by: Tony Targonski <tony.targonski@meta.com>
2025-10-21 18:30:29 +00:00
292454942e [CD] Introduce windows.12xlarge runners for CD Windows build (#165287)
Follows https://github.com/pytorch/test-infra/pull/7174. Windows CD build time cost comparison as below

|Runner|cpu|cuda|xpu|
|-|-|-|-|
|windows.4xlarge|1.5h| 4.0h| 5.5h|
|windows.12xlarge|0.5h|1.5h|2.5h|

Fixes #162962
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165287
Approved by: https://github.com/zxiiro, https://github.com/malfet, https://github.com/seemethere
2025-10-21 18:28:23 +00:00
6c4412f72b Revert "[Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316)"
This reverts commit e9d89734274a4a2640fa77b898c800a87d1d874e.

Reverted https://github.com/pytorch/pytorch/pull/163316 on behalf of https://github.com/clee2000 due to seems to have broken some no_gpu tests? test/inductor/test_cpu_repro.py::CPUReproTests::test_double_reduction_vec [GH job link](https://github.com/pytorch/pytorch/actions/runs/18689033019/job/53290772740) [HUD commit link](e9d8973427) ([comment](https://github.com/pytorch/pytorch/pull/163316#issuecomment-3428210509))
2025-10-21 17:44:42 +00:00
78bf6186f2 Revert "[Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324)"
This reverts commit e8cb34dd52c063a130f3e659576c313bbe4b4981.

Reverted https://github.com/pytorch/pytorch/pull/163324 on behalf of https://github.com/clee2000 due to seems to have broken some no_gpu tests? test/inductor/test_cpu_repro.py::CPUReproTests::test_double_reduction_vec [GH job link](https://github.com/pytorch/pytorch/actions/runs/18689033019/job/53290772740) [HUD commit link](e9d8973427) ([comment](https://github.com/pytorch/pytorch/pull/163316#issuecomment-3428210509))
2025-10-21 17:44:42 +00:00
c40048472c Remove AOTI cross compilation time from internal CI (#165935)
Summary: as title

Test Plan: CI

Differential Revision: D85088451

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165935
Approved by: https://github.com/desertfire
2025-10-21 16:58:28 +00:00
3dfd0c7584 Improve PATH hints in FindvecLib.cmake (#165881)
Change /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk to /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk in `cmake/Modules/FindvecLib.cmake`, which is more general (and MacOSX10.9 is no longer supported). Otherwise, vecLib can't be found on macOS 26.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165881
Approved by: https://github.com/ezyang
2025-10-21 16:44:12 +00:00
e6ba4d0725 Back out "Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939)" (#165910)
Summary:
Original commit changeset: d6d62d0c96dd

Original Phabricator Diff: D84468451 and D84613184

D84468451 caused CUDA OutOfMemoryError in model.

Test Plan:
D84468451 was found through bisect.  Also double checked on recent trunk 9866939225248c2adc307be7a804b26db0b9b555: f815887517

With this diff that backs out D84468451 and D84613184 : f816114560

Differential Revision: D85025378

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165910
Approved by: https://github.com/clee2000
2025-10-21 16:36:38 +00:00
bdf7cb9d9c Revert "[torch/utils][Code Clean] Clean asserts in torch/utils/*.py (#165410)"
This reverts commit e20c9bf2889b9252ac45ae6af35c93c795eab701.

Reverted https://github.com/pytorch/pytorch/pull/165410 on behalf of https://github.com/clee2000 due to sorry I'm going to revert this since I want to try to back out some other things that are conflicting with this, there is nothing wrong with this PR, rebasing and resolving the merge conflicts should be enough, sorry for the churn ([comment](https://github.com/pytorch/pytorch/pull/165410#issuecomment-3427532373))
2025-10-21 16:27:54 +00:00
6aed378958 [export] Handle kwargs better in aot_export_joint_with_descriptors (#165334)
fx.Interpreter doesn't handle kwargs... not sure how this code worked previously

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165334
Approved by: https://github.com/tugsbayasgalan, https://github.com/ezyang
2025-10-21 15:53:05 +00:00
8b3dc0d1b0 Better error handling in torch/csrc/jit/runtime/* (#165118)
Refactor error handling by using TORCH_CHECK for improved clarity in constants and scope management in some files in torch/csrc/jit/runtime/*

Fixes some parts of ISSUE https://github.com/pytorch/pytorch/issues/148114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165118
Approved by: https://github.com/FFFrog, https://github.com/albanD
2025-10-21 15:22:49 +00:00
06773663b5 Implement an AOT precompile mode for standalone_compile (#165843)
This PR introduces an `aot` flag to standalone_compile that uses BundledAOTAutogradCacheEntry, and then allows regional_inductor to use this so that we can start aot compiling regional compiler graphs. The diff above this will attempt to allow GraphPickler to fully serialize graphs that have regionally compiled subgraphs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165843
Approved by: https://github.com/oulgen
2025-10-21 15:02:45 +00:00
0bff65503c Move hardware_destructive_interference_size to c10/core/alignment.h (#160067)
# Motivation
Move `hardware_destructive_interference_size` to `c10/core/alignment.h`, which gives a chance to reuse it across different accelerators.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160067
Approved by: https://github.com/Skylion007, https://github.com/EikanWang
2025-10-21 14:39:46 +00:00
21131a2444 Revert "[ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481)"
This reverts commit ffa90d46e61650834d5f926008f48f50c6a7e87a.

Reverted https://github.com/pytorch/pytorch/pull/165481 on behalf of https://github.com/jeffdaily due to timeouts after merge ([comment](https://github.com/pytorch/pytorch/pull/165481#issuecomment-3426898171))
2025-10-21 14:15:55 +00:00
1009790ad8 [pytree][dynamo] trace on native optree functions for community pytree support (#165860)
Resolves #164972

- #164972

All `torch.utils._cxx_pytree` functions are based on `optree` functions with hardcoded `none_is_leaf=True` and `namespace="torch"`. This PR changes the polyfills to generic `optree` functions with those arguments unhardcoded. This means `torch.utils._cxx_pytree` functions are still traceable while the community `optree` usages can get dynamo support additionally.
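
A hedged usage sketch of what becomes traceable with this change (assumes `optree` is installed; the function and tree here are arbitrary):

```python
import optree
import torch

@torch.compile(backend="eager")
def fn(tree):
    # A plain optree call (not the torch.utils._cxx_pytree wrapper) with explicit
    # none_is_leaf/namespace arguments, now covered by the generic polyfills.
    return optree.tree_map(lambda t: t + 1, tree, none_is_leaf=True, namespace="torch")

print(fn({"a": torch.ones(2), "b": (torch.zeros(1),)}))
```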

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165860
Approved by: https://github.com/Lucaskabela
2025-10-21 14:13:08 +00:00
410e6a4321 Better error handling in torch/csrc/jit/frontend/* (#165213)
Refactor error handling by using TORCH_CHECK for improved clarity in constants and scope management in some files in torch/csrc/jit/frontend/*

Fixes some parts of ISSUE https://github.com/pytorch/pytorch/issues/148114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165213
Approved by: https://github.com/FFFrog, https://github.com/albanD
2025-10-21 13:54:59 +00:00
23c55c5b66 [Code Clean]Replace assert statements with explicit if/raise patterns (#165735)
Fix part of #164878

Replace 75 assert statements with explicit if/raise patterns in `torch/ao/ns`, including (a minimal sketch of the pattern follows the list):

- `torch/ao/ns/_numeric_suite_fx.py`  - 5 asserts

- `torch/ao/ns/fx/graph_matcher.py` - 6 asserts

- `torch/ao/ns/fx/graph_passes.py` - 12 asserts

- `torch/ao/ns/fx/n_shadows_utils.py` - 20 asserts

- `torch/ao/ns/fx/pattern_utils.py` - 2 asserts

- `torch/ao/ns/fx/utils.py` - 21 asserts

- `torch/ao/ns/fx/weight_utils.py` - 19 asserts
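
A minimal sketch of the pattern applied throughout these files (the function name and message are illustrative, not lifted from the diff):

```python
import torch.fx as fx

def _get_target_module_name(node: fx.Node) -> str:
    # before: assert node.op == "call_module", "expected a call_module node"
    if node.op != "call_module":
        raise ValueError(f"expected a call_module node, got {node.op}")
    return str(node.target)
```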

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165735
Approved by: https://github.com/albanD
2025-10-21 11:21:57 +00:00
1290b077f2 [dynamo][misc] Replace UserFunctionVariable with VariableTracker build (#165707)
Audit: To prevent future issues with functools.partial or callable
objects.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165707
Approved by: https://github.com/Lucaskabela
2025-10-21 09:27:41 +00:00
9f9ab881b2 [ROCm][inductor] heuristic improvements for reduction kernels (#161280)
Improvements to reduction kernel heuristics for MI350.

Contributions from several members of the AMD Inductor and Triton teams: @jataylo @iupaikov-amd @AmdSampsa @xiaohuguo2023

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161280
Approved by: https://github.com/jansel, https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/jeffdaily
2025-10-21 07:48:54 +00:00
f2bb22ff84 [Inductor-FX] Support Tensor.item (#165599)
# Feature
This PR supports compiling `Tensor.item` with Inductor's FX backend. This maps to a custom WrapperCodeGen method called `codegen_dynamic_scalar`.

# Implementation
The implementation is fairly mechanical, following the usual flow for these types of PRs.
1. Introduce a new Wrapper IR line for this, called `DynamicScalarLine`.
2. Split `PythonWrapperCodegen.codegen_dynamic_scalar` into 2 parts: a public method which generates the Wrapper IR line, and a private one generating Python from Wrapper IR.
3. Implement an FX codegen method for the wrapper IR line. This one calls `aten.where.Scalar` to handle code like `1 if x.item() else 0`, which is a bit tricky. It also calls `aten.item.default` to convert tensors to scalars.

# Test plan
Added CI tests mirroring the AOTI ones. They test float, int and bool types, the latter taking a distinct codegen path.
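
A hedged usage sketch of the kind of code this enables (generic `torch.compile` usage; selecting the FX wrapper backend itself is an Inductor config detail not shown here):

```python
import torch

# .item() produces a dynamic scalar in the graph; capture_scalar_outputs lets
# dynamo keep it in the graph instead of graph-breaking.
torch._dynamo.config.capture_scalar_outputs = True

@torch.compile
def f(x):
    return x.item() * 2.0

print(f(torch.tensor(3.5)))  # 7.0
```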

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165599
Approved by: https://github.com/angelayi, https://github.com/jansel
2025-10-21 07:09:56 +00:00
03f3f7899c [ATen] Add reduction tag to reduction operators (#165155)
Add a new 'reduction' tag to tags.yaml and apply it to 98 reduction
operator variants across 21 operator families (sum, mean, min, max,
argmin, argmax, amin, amax, aminmax, prod, all, any, norm, var, std,
std_mean, var_mean, nansum, logsumexp, count_nonzero, linalg_vector_norm).

This tag categorizes operators that perform reduction operations,
computing aggregate values across one or more dimensions of input
tensor(s).

Based on PR #153342 - co-written with @AlonSardas.

Just as we have pointwise tag - this can be useful for compiler passes, or for opting into sharding rules.
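
A hedged sketch of how a pass might consume the tag (assumes a build that includes this change, so that `torch.Tag.reduction` exists):

```python
import torch

# Operator tags are queryable on OpOverload objects, so a compiler pass can
# branch on the tag instead of maintaining a hard-coded list of reduction ops.
op = torch.ops.aten.sum.dim_IntList
print(torch.Tag.reduction in op.tags)  # expected: True with this PR
```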

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165155
Approved by: https://github.com/ezyang, https://github.com/zou3519, https://github.com/mlazos
2025-10-21 04:35:03 +00:00
771170807b [dynamo][nn_module] Replace UserFunctionVariable with VariableTracker build (#165708)
Audit: To prevent future issues with functools.partial or callable objects.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165708
Approved by: https://github.com/Lucaskabela
2025-10-21 04:13:12 +00:00
ffa90d46e6 [ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481)
* Moving rocm.yml from using persistent non-ARC runners from the combined MI2xx (MI210 + MI250) cluster to the ARC runners from the MI250 cluster. This halves the number of nodes, but provides access to approximately 4 times the runners, since every 8-GPU MI250 node now provides 8 1-GPU runners. This should help with concurrent capacity and queueing on the MI2xx jobs.

Tested here successfully: https://github.com/pytorch/pytorch/actions/runs/18620814622/job/53092469720

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165481
Approved by: https://github.com/jeffdaily

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2025-10-21 04:02:04 +00:00
0e083942cc Enable PLW0127 in ruff (#165851)
This PR enables `PLW0127` in ruff, which checks self-assignment of variables with the form `var=var`.
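
A one-line illustration of what the rule flags:

```python
x = 1
x = x  # PLW0127: self-assignment of variable `x` (has no effect)
```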

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165851
Approved by: https://github.com/Lucaskabela
2025-10-21 03:30:57 +00:00
ce1fcff03e [ROCm] Keep amdgpu-coerce-illegal-types flag if rocm version is less than 7.2 (#165789)
The `-amdgpu-coerce-illegal-types=1` flag is for the LLVM shipped in ROCm 6.3, 6.4, 7.0, and 7.1; it will not be in ROCm 7.2. It was added to enable performance improvements for composable kernel. ROCm 7.2 and newer changed the compiler so that the flag isn't needed to achieve those performance improvements, and keeping the flag with ROCm 7.2 breaks the PyTorch build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165789
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
2025-10-21 03:17:33 +00:00
a238a9a100 Add clang-tidy misc-definitions-in-headers check (#164959)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164959
Approved by: https://github.com/Skylion007, https://github.com/mikaylagawarecki
ghstack dependencies: #164882, #164956
2025-10-21 02:59:46 +00:00
fe69a2bbbd Move from/to to torch::stable::detail (#164956)
To not pollute the global namespace, we should move the `from`/`to` APIs into torch::stable::detail. We are also following our normal deprecation cycle and choosing to continue exposing the global `from`/`to` for the time being as people who onboard their extensions onto 2.9 would not be able to build with 2.10 otherwise.

Note that this means that within libtorch, we do not get the luxury of tacking on a `using torch::stable::detail::from` because then it leads to build time ambiguous calls --> both the global and namespace APIs are exposed, which one do I want? So that is why you see every local site is updated.

Note that the update is _not_ necessary from a custom op writer point of view. FA3 can continue to build on torch nightlies without changing any code. (Since this is a header change, this PR has no implication on runtime, a previously built FA3 ABI stable wheel will continue to work fine with newer torch versions after this PR.)

Once TORCH_BOX lands, we would be free to remove these global APIs when the deprecation cycle is up (April 2026) and encourage people to use TORCH_BOX and avoid from/to entirely.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164956
Approved by: https://github.com/malfet
ghstack dependencies: #164882
2025-10-21 02:59:46 +00:00
0be0de4ffa Add type suppressions to _inductor/runtime (#165918)
Original PR that did this was reverted due to merge conflicts.

Trying it again

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165918
Approved by: https://github.com/oulgen
2025-10-21 02:54:22 +00:00
7406d2e665 [DeviceMesh] Clean up the call into mesh_resouces to get root mesh (#165787)
We moved the method to get root mesh into class in https://github.com/pytorch/pytorch/pull/164510. This is to further clean code up.

Differential Revision: [D85090191](https://our.internmc.facebook.com/intern/diff/D85090191)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165787
Approved by: https://github.com/fegin
2025-10-21 02:54:04 +00:00
303c9cf048 Save Python refcount bump on each arg in maybe_handle_torch_function (#164625)
Pybind's API entails a small unnecessary overhead when working with args. (Similarly, we should probably be using vectorcall, but that's a bigger change for both us and pybind11.)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164625
Approved by: https://github.com/albanD
ghstack dependencies: #164624
2025-10-21 02:40:12 +00:00
d7d4bb7c51 Add XPU part for persons_of_interest (#165920)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165920
Approved by: https://github.com/albanD
2025-10-21 01:57:17 +00:00
0b1c462979 Making Numpy depedency in Local Tensor optional to fix broken Torchao CI (#165938)
A recent change introduced a NumPy dependency in LocalTensor and broke the Torchao CI.
This dependency can be made optional and required only when LocalTensor is used.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165938
Approved by: https://github.com/atalman
2025-10-21 01:46:53 +00:00
4a6cf0a93e Fix dynamo stack trace (#165930)
Fixes #165911

- Add a message to the AttributeError so we see `  Developer debug context: raised exception AttributeError(["'Linear' object has no attribute 'w'"])` instead of just `Developer debug context: raised exception AttributeError([])`
- Add a stack trace to `ObservedException` so we display the innermost error stack trace back to user code

Output:

```
/data/users/shangdiy/pytorch/torch/__init__.py:2641: UserWarning: You are calling torch.compile inside torch.export region. To capture an useful graph, we will implicitly switch to torch.compile(backend=eager)
  warnings.warn(
Traceback (most recent call last):
  File "/data/users/shangdiy/pytorch/torch/_dynamo/variables/user_defined.py", line 1385, in var_getattr
    subobj = self._getattr_static(name)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/shangdiy/pytorch/torch/_dynamo/variables/user_defined.py", line 1256, in _getattr_static
    subobj = type(self.value).__getattribute__(self.value, name)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'Linear' object has no attribute 'w'

During handling of the above exception, another exception occurred:

torch._dynamo.exc.ObservedAttributeError: 'Linear' object has no attribute 'w'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/data/users/shangdiy/pytorch/test.py", line 34, in <module>
    mod = torch._dynamo.functional_export._dynamo_graph_capture_for_export(Model())(x)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/shangdiy/pytorch/torch/_dynamo/functional_export.py", line 481, in inner
    out = fullgraph_capture(
          ^^^^^^^^^^^^^^^^^^
  File "/data/users/shangdiy/pytorch/torch/_dynamo/convert_frame.py", line 1053, in fullgraph_capture
    return _fullgraph_capture_frame(
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/shangdiy/pytorch/torch/_dynamo/convert_frame.py", line 1115, in _fullgraph_capture_frame
    raise e.with_traceback(None) from e.__cause__  # User compiler error
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch._dynamo.exc.Unsupported: Observed exception
  Explanation: Dynamo found no exception handler at the top-level compiled function when encountering an exception. Exception will propagate outside the compiled region.
  Hint: Dynamo has detected that tracing the code will result in an error when running in eager. Please double check that your code doesn't contain a similar error when actually running eager/uncompiled.
  Hint: It may be possible to write Dynamo tracing rules for this code. Please report an issue to PyTorch if you encounter this graph break often and it is causing performance issues.

  Developer debug context: raised exception AttributeError(["'Linear' object has no attribute 'w'"])

 For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0088.html

from user code:
   File "/data/users/shangdiy/pytorch/torch/_dynamo/functional_export.py", line 171, in forward
    res = self._export_root(*args, **kwargs)
  File "/data/users/shangdiy/pytorch/test.py", line 31, in forward
    weight = self.linear.w

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165930
Approved by: https://github.com/anijain2305
2025-10-21 01:32:23 +00:00
4c963a68d7 Use inline instead of anon namespace for stableivalue from/to (#164882)
Fixes https://github.com/pytorch/pytorch/issues/163343.

After some consideration, I propose we remove the anonymous namespace around from/to in favor of:
1. Adding inline to the function implementations, assuming that they will not change in the near future
2. If we decide to change them, we will wrap the code in inline versioned namespaces such that the implementations within any versioned namespace will be guaranteed identical.

Note that:
- We eventually intend to abstract away usage of `from`/`to` (related: @lw's TORCH_BOX work)
- The from/to implementations are now powered through class template specializations, where adding a specialization does not change the from/to signatures.

I do plan to subsequently deprecate the top-level from/to in favor of torch::stable::detail::from/to. This way we can stop polluting the global namespace.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164882
Approved by: https://github.com/lw, https://github.com/albanD
2025-10-21 00:12:15 +00:00
b20deec3d1 [PP] Add optional argument to not save outputs (#165822)
Fix https://github.com/pytorch/pytorch/issues/159251

Add an optional argument `return_outputs` to the schedule `step`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165822
Approved by: https://github.com/wconstab
2025-10-21 00:09:31 +00:00
51d0d8ee67 [ATen] Fix CUDA reduction warp shuffle order (#164790)
Typical warp shuffle reduction has the following pattern:
<img width="1138" height="501" alt="image" src="https://github.com/user-attachments/assets/3bd176dc-0ad2-4df6-90c7-06e467337166" />

which is exhibited in Triton generated by torch.compile:
<img width="663" height="403" alt="image" src="https://github.com/user-attachments/assets/7f9f36cd-b9eb-44c1-879e-b469668a2ea8" />

Switch the warp shuffle order to make bitwise equivalence between the 2 easier.
PTX difference between old and new, we see a few extra instructions: https://www.diffchecker.com/h6ly3INC/

Comparing the performance on different reduction operations, we see minimal differences. New represents the changes in this PR, old represents the past warp shuffle order:
```
Tensor Shape              Operation            New all dims (ms)       New dim=0 (ms)      New dim=1 (ms)     Old all dims (ms)    Old dim=0 (ms)      Old dim=1 (ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 1024)              mean                 0.015817             0.016259             0.013642             0.015990             0.016258             0.013631
(1024, 1024)              sum                  0.015917             0.015906             0.013359             0.015707             0.016266             0.013226
(1024, 1024)              min                  0.016021             0.024625             0.015631             0.015761             0.024485             0.015317
(1024, 1024)              max                  0.016349             0.024971             0.015972             0.015771             0.025001             0.015314
(1024, 1024)              argmin               0.018070             0.024448             0.015578             0.018135             0.025370             0.015322
(1024, 1024)              argmax               0.018427             0.024859             0.015932             0.018164             0.024452             0.015639
(1024, 1024)              var                  0.020078             0.026413             0.020295             0.020199             0.026381             0.020214
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(2048, 2048)              mean                 0.023826             0.023726             0.022273             0.023236             0.023776             0.022248
(2048, 2048)              sum                  0.023840             0.023355             0.021974             0.023294             0.023354             0.021884
(2048, 2048)              min                  0.024519             0.041263             0.024620             0.023292             0.041491             0.024358
(2048, 2048)              max                  0.024509             0.041670             0.024277             0.023334             0.041231             0.024395
(2048, 2048)              argmin               0.026125             0.041282             0.024567             0.026772             0.041773             0.024296
(2048, 2048)              argmax               0.026117             0.041487             0.024572             0.026412             0.041477             0.024273
(2048, 2048)              var                  0.026603             0.048581             0.031308             0.027587             0.048603             0.030860
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(4096, 4096)              mean                 0.053927             0.057070             0.054073             0.053028             0.057544             0.053935
(4096, 4096)              sum                  0.053604             0.057410             0.054451             0.053076             0.057033             0.054266
(4096, 4096)              min                  0.054293             0.109122             0.058363             0.053821             0.108689             0.058382
(4096, 4096)              max                  0.054258             0.108035             0.058703             0.053492             0.110552             0.058376
(4096, 4096)              argmin               0.056805             0.111167             0.058301             0.056836             0.112325             0.058292
(4096, 4096)              argmax               0.056488             0.110958             0.058636             0.056844             0.111000             0.057928
(4096, 4096)              var                  0.058936             0.141755             0.068693             0.059735             0.141284             0.068500
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 8192)              mean                 0.145552             0.148082             0.138647             0.145364             0.147818             0.138207
(8192, 8192)              sum                  0.145985             0.147900             0.138714             0.145755             0.148031             0.138616
(8192, 8192)              min                  0.146566             0.205359             0.192739             0.145611             0.205237             0.182335
(8192, 8192)              max                  0.146526             0.204844             0.193050             0.146073             0.205457             0.182697
(8192, 8192)              argmin               0.150190             0.206605             0.192543             0.150654             0.206847             0.182007
(8192, 8192)              argmax               0.150481             0.206368             0.192535             0.150845             0.206430             0.182022
(8192, 8192)              var                  0.150884             0.184546             0.203900             0.151594             0.184172             0.197983
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1, 1024, 128)            mean                 0.014293             0.008119             0.014533             0.013861             0.008022             0.014449
(1, 1024, 128)            sum                  0.014039             0.007877             0.014111             0.014219             0.008227             0.014045
(1, 1024, 128)            min                  0.014159             0.011354             0.023493             0.014271             0.010862             0.023644
(1, 1024, 128)            max                  0.014154             0.011027             0.023368             0.014259             0.011234             0.023692
(1, 1024, 128)            argmin               0.016403             0.005677             0.023328             0.016273             0.005683             0.024073
(1, 1024, 128)            argmax               0.016734             0.005675             0.023437             0.016580             0.005318             0.023331
(1, 1024, 128)            var                  0.018338             0.009549             0.025538             0.018528             0.009391             0.024777
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(5, 1024, 128)            mean                 0.014873             0.010131             0.015546             0.015123             0.010131             0.015481
(5, 1024, 128)            sum                  0.015334             0.009673             0.015824             0.014736             0.009671             0.015438
(5, 1024, 128)            min                  0.015047             0.013252             0.024573             0.014803             0.013163             0.024551
(5, 1024, 128)            max                  0.015050             0.013339             0.024197             0.014810             0.013525             0.024230
(5, 1024, 128)            argmin               0.017341             0.012737             0.024306             0.017471             0.012379             0.024991
(5, 1024, 128)            argmax               0.017345             0.012411             0.024421             0.017422             0.012471             0.024237
(5, 1024, 128)            var                  0.019973             0.011453             0.026188             0.020050             0.011438             0.026282
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(10, 1024, 128)           mean                 0.016976             0.011575             0.016831             0.016722             0.011927             0.017173
(10, 1024, 128)           sum                  0.017039             0.011841             0.017159             0.016385             0.011860             0.016753
(10, 1024, 128)           min                  0.017036             0.015331             0.026770             0.016944             0.015205             0.027166
(10, 1024, 128)           max                  0.017369             0.015348             0.027077             0.016531             0.015716             0.026819
(10, 1024, 128)           argmin               0.019203             0.014447             0.026813             0.018994             0.014497             0.027313
(10, 1024, 128)           argmax               0.019563             0.014795             0.027140             0.019460             0.014912             0.026733
(10, 1024, 128)           var                  0.020529             0.014316             0.030405             0.020719             0.013960             0.029964
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(100, 1024, 128)          mean                 0.045046             0.039168             0.046082             0.044839             0.039217             0.045782
(100, 1024, 128)          sum                  0.045094             0.039150             0.045777             0.044496             0.039542             0.046083
(100, 1024, 128)          min                  0.045768             0.054466             0.076244             0.044915             0.053943             0.076599
(100, 1024, 128)          max                  0.045748             0.054459             0.076188             0.044931             0.053949             0.076856
(100, 1024, 128)          argmin               0.048275             0.054046             0.076647             0.048694             0.054105             0.077004
(100, 1024, 128)          argmax               0.048267             0.054395             0.077401             0.048691             0.054131             0.076751
(100, 1024, 128)          var                  0.049710             0.043254             0.083077             0.050971             0.043251             0.082378
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1000, 1000, 100)         mean                 0.202312             0.196723             0.197765             0.201774             0.196641             0.197459
(1000, 1000, 100)         sum                  0.202651             0.196682             0.197736             0.202175             0.196313             0.197523
(1000, 1000, 100)         min                  0.203022             0.264762             0.269200             0.202729             0.264129             0.268694
(1000, 1000, 100)         max                  0.202864             0.264396             0.269388             0.202486             0.263896             0.268720
(1000, 1000, 100)         argmin               0.226727             0.263781             0.268651             0.226597             0.264676             0.268983
(1000, 1000, 100)         argmax               0.226412             0.264469             0.269090             0.226570             0.264595             0.269178
(1000, 1000, 100)         var                  0.243223             0.204079             0.216096             0.241942             0.204079             0.215925
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(10000, 100)              mean                 0.016193             0.020277             0.014316             0.016152             0.020324             0.013712
(10000, 100)              sum                  0.016289             0.020237             0.014034             0.016168             0.020265             0.013708
(10000, 100)              min                  0.016046             0.030872             0.019609             0.016208             0.030867             0.018627
(10000, 100)              max                  0.016369             0.030835             0.019257             0.016218             0.030861             0.018209
(10000, 100)              argmin               0.017957             0.031171             0.019517             0.018050             0.031556             0.018077
(10000, 100)              argmax               0.017961             0.031658             0.019521             0.018060             0.031564             0.018087
(10000, 100)              var                  0.020393             0.035652             0.019339             0.020144             0.035987             0.019171
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(100000, 10)              mean                 0.015718             0.016576             0.016555             0.015999             0.016246             0.014869
(100000, 10)              sum                  0.015833             0.016247             0.016572             0.016007             0.016627             0.014872
(100000, 10)              min                  0.015888             0.020510             0.023920             0.015671             0.020821             0.021417
(100000, 10)              max                  0.015889             0.020479             0.023918             0.016077             0.020386             0.021421
(100000, 10)              argmin               0.018233             0.020863             0.023647             0.017574             0.020864             0.021103
(100000, 10)              argmax               0.017896             0.020527             0.023296             0.017569             0.020447             0.021098
(100000, 10)              var                  0.020005             0.024198             0.024372             0.020075             0.024167             0.022415
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1023, 1023, 1023)        mean                 1.874816             1.963506             1.903909             1.873279             1.963859             1.903230
(1023, 1023, 1023)        sum                  1.875030             1.965716             1.902458             1.873566             1.960730             1.901642
(1023, 1023, 1023)        min                  1.878563             2.473455             2.179092             1.875174             2.482086             2.183027
(1023, 1023, 1023)        max                  1.879128             2.474803             2.178895             1.874831             2.482253             2.183884
(1023, 1023, 1023)        argmin               1.921800             2.476629             2.174831             1.923987             2.472641             2.170453
(1023, 1023, 1023)        argmax               1.922605             2.476688             2.177927             1.923366             2.472808             2.172979
(1023, 1023, 1023)        var                  1.972606             3.088695             2.758797             1.978679             3.095658             2.762243
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1023, 1023, 255)         mean                 0.489984             0.500954             0.492957             0.489891             0.500654             0.491971
(1023, 1023, 255)         sum                  0.490228             0.500764             0.492289             0.489624             0.501089             0.492824
(1023, 1023, 255)         min                  0.491457             0.563560             0.553334             0.490355             0.564709             0.554754
(1023, 1023, 255)         max                  0.491396             0.563628             0.553345             0.490017             0.565004             0.554947
(1023, 1023, 255)         argmin               0.503666             0.561512             0.551831             0.503845             0.560972             0.551017
(1023, 1023, 255)         argmax               0.503602             0.561185             0.551407             0.504328             0.561267             0.551448
(1023, 1023, 255)         var                  0.510844             0.709452             0.701630             0.512693             0.710365             0.701965
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1023, 1023, 377)         mean                 0.707439             0.727646             0.712019             0.706769             0.727101             0.711632
(1023, 1023, 377)         sum                  0.707780             0.727453             0.711554             0.706807             0.726656             0.711729
(1023, 1023, 377)         min                  0.709423             0.819809             0.794379             0.707847             0.822086             0.796664
(1023, 1023, 377)         max                  0.709297             0.819780             0.794308             0.707566             0.821913             0.796690
(1023, 1023, 377)         argmin               0.725028             0.817088             0.791695             0.726039             0.816445             0.790828
(1023, 1023, 377)         argmax               0.725301             0.817011             0.791420             0.726040             0.816917             0.791143
(1023, 1023, 377)         var                  0.740859             1.034165             1.006712             0.743413             1.035506             1.007638
```

Differential Revision: [D85022826](https://our.internmc.facebook.com/intern/diff/D85022826)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164790
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-10-21 00:09:13 +00:00
70592c6819 [ROCm][CI] Move gfx1100 workflows to own yaml file (#165699)
This should allow us to move the gfx1100 workflow to a lower frequency and also allow it to be triggered on PRs via a dedicated label, for any PRs that target Navi fixes such as [this](https://github.com/pytorch/pytorch/pull/165630) or [this](https://github.com/pytorch/pytorch/pull/165625).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165699
Approved by: https://github.com/jeffdaily
2025-10-20 23:52:48 +00:00
259cb945f5 [stage 2c] make autograd and inference functions (#165668)
Add final stage of aot_stage2_compile for autograd and inference.

Differential Revision: D84844699

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165668
Approved by: https://github.com/zhxchen17, https://github.com/tugsbayasgalan
2025-10-20 23:50:31 +00:00
e20c9bf288 [torch/utils][Code Clean] Clean asserts in torch/utils/*.py (#165410)
Including:
- `torch/utils/*.py`

Fixes part of #164878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165410
Approved by: https://github.com/albanD
2025-10-20 23:29:17 +00:00
99c8640b5d [1/N] Change C-style casts to static_cast or reinterpret_cast (#165750)
This series of changes converts C-style casts into their C++ alternatives (static_cast or reinterpret_cast).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165750
Approved by: https://github.com/Skylion007
2025-10-20 23:27:13 +00:00
96b0e7aaa6 [Code Clean] Clean asserts in torch/ao/quantization/experimental/* and torch/ao/quantization/pt2e/* (#165317)
Replace assert statements with explicit if/raise patterns in:
- torch/ao/quantization/experimental/* (11 errors)
- torch/ao/quantization/pt2e/* (68 errors)

fix partially #164878
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165317
Approved by: https://github.com/albanD
2025-10-20 23:07:11 +00:00
850ba8c96d [Code Clean] Clean asserts in torch/autograd. (#165627)
Replaces 78 assert statements across 10 files in torch.autograd with explicit if-checks raising AssertionError, so the checks cannot be disabled by the Python -O flag. This ensures error checking remains active in optimized builds.
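
As a rough illustration of the pattern (a minimal sketch; `_check_grad_shape` is a hypothetical helper, not code from this PR):

```python
# Before: the check disappears entirely when Python runs with -O.
def _check_grad_shape(grad, expected_shape):
    assert grad.shape == expected_shape, "gradient shape mismatch"

# After: the check always runs, even in optimized builds.
def _check_grad_shape(grad, expected_shape):
    if grad.shape != expected_shape:
        raise AssertionError("gradient shape mismatch")
```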

fix partially #164878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165627
Approved by: https://github.com/albanD
2025-10-20 23:03:47 +00:00
1bcd736f91 fix bad merge duplicate pre pass (#165917)
Fix for https://github.com/pytorch/pytorch/issues/165624 - we were applying the pre pass multiple times.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165917
Approved by: https://github.com/bdhirsh
2025-10-20 22:54:36 +00:00
df64c0c464 [Code Clean] Clean asserts in torch/ao/quantization (root, quantizer, backend_config) (#165433)
Replace assert statements with explicit if/raise patterns in:

- torch/ao/quantization/ (root)
- torch/ao/quantization/quantizer/
- torch/ao/quantization/backend_config/

fix partially #164878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165433
Approved by: https://github.com/albanD
2025-10-20 22:42:51 +00:00
1891239a1d [Graph Partition] fix graph partition input signature for fallback kernels (#165815)
The scheduler relies on node.last_usage to free buffers. `last_usage` may contain a buffer that was allocated in a previous graph partition AND is not directly accessed in the current graph partition.

## Example
```python
def f(x):
    y = x + 1
    z = torch.ops.aten.view.dtype(y, torch.float8_e4m3fn)
    z_cpu = z.cpu()
    u_cuda = z_cpu.cuda()
    return u_cuda
```

In the generated code, we have
```
def partition_0(args):
    ...
    # Topologically Sorted Source Nodes: [y, z], Original ATen: [aten.add, aten.view]
    buf1 = torch.ops.aten.view.dtype(buf0, torch.float8_e4m3fn) # < ------ buf1 is a view of buf0
    buf2 = buf1 # <------- buf2 is buf1
    assert_size_stride(buf2, (8, ), (1, ), 'torch.ops.aten.view.dtype')
    assert_alignment(buf2, 16, 'torch.ops.aten.view.dtype')
    return (buf2, )

def call(self, args):
    ...
    (buf2,) = self.partitions[0](partition0_args)
    ...
    buf3.copy_(buf2, False)
    del buf0
    del buf1
    del buf2  # <---- `del buf2` leads to `del buf0`. BUT `buf0` is not returned from partition_0.
    ...
```

Note: view is treated as a fallback kernel due to its special dtype.
de09bab4b6/torch/_inductor/lowering.py (L841-L843)

## Fix

This PR fixes the issue by also returning these buffers from the graph partition so they can be freed later.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165815
Approved by: https://github.com/eellison
2025-10-20 22:23:29 +00:00
cf280ca1e8 Revert "[Inductor] Naive foreach autotune support (#162053)"
This reverts commit 779296a3fce5db0829377c792f13a8eafe537b30.

Reverted https://github.com/pytorch/pytorch/pull/162053 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/162053#issuecomment-3423808492))
2025-10-20 21:36:44 +00:00
efc277cac7 [annotation] add logging for debugging annotation (#165797)
Add logging for debugging annotation bugs. Logs will show with `TORCH_LOGS="+annotation"`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165797
Approved by: https://github.com/ezyang, https://github.com/Skylion007, https://github.com/SherlockNoMad
2025-10-20 21:27:38 +00:00
4f7f43253d Revert "[ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481)"
This reverts commit 8700d68fef855850e2e0aa65056a77b8f80adbdb.

Reverted https://github.com/pytorch/pytorch/pull/165481 on behalf of https://github.com/malfet due to Broke lint somehow, see 8f06a1308f/1 ([comment](https://github.com/pytorch/pytorch/pull/165481#issuecomment-3423642456))
2025-10-20 20:39:56 +00:00
779296a3fc [Inductor] Naive foreach autotune support (#162053)
Initial autotuning support for foreach kernels, a 4x improvement for some kernels in an internal workload. More improvements can surely be made here in the future. Removing num_warps from the kernel definition to enable autotune support in the generated wrapper code. A usage sketch follows the before/after timings below.

Before:
triton_for_fused_18.kd 🔍 | 4.986 ms | 4.986 ms | 2.493 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.098 ms | 0.098 ms | 0.049 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.036 ms | 0.036 ms | 0.018 ms | 2 |

After:
triton_for_fused_18.kd 🔍 | 1.273 ms | 1.273 ms | 0.636 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.044 ms | 0.044 ms | 0.022 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.024 ms | 0.024 ms | 0.012 ms | 2 |
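
For context, a minimal sketch of a compiled foreach op that exercises these kernels (a hypothetical `sgd_step`, assuming a CUDA device; the kernel names above come from an internal workload):

```python
import torch

params = [torch.randn(1024, device="cuda") for _ in range(16)]
grads = [torch.randn(1024, device="cuda") for _ in range(16)]

@torch.compile
def sgd_step(params, grads, lr=0.01):
    # Lowered by Inductor to a foreach (combo) kernel over all tensors in the lists.
    torch._foreach_add_(params, grads, alpha=-lr)

sgd_step(params, grads)  # the generated foreach kernel is now autotuned
```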

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162053
Approved by: https://github.com/mlazos, https://github.com/naromero77amd
2025-10-20 20:39:04 +00:00
8f06a1308f [MPS] slightly faster cholesky (#165867)
Slightly faster cholesky, removed one redundant simdgroup_multiply
![Cholesky benchmark results (screenshot 2025-10-19)](https://github.com/user-attachments/assets/e3a9005b-9347-4e62-a24d-16ba5e28849a)

Benchmarks generated with the following script (measured on an M1 Pro):
```
import torch
import numpy as np
import time
import csv

matrix_sizes = [512, 1024, 2048, 4096]
batch_sizes = [1, 2, 4, 8, 16]
num_runs = 10
warmup_runs = 3

def create_spd_matrix(n, batch_size):
    torch.manual_seed(42)
    A = torch.randn(batch_size, n, n, dtype=torch.float32)
    return A @ A.transpose(-2, -1) + n * torch.eye(n).expand(batch_size, -1, -1)

def run_cholesky_mps(A):
    torch.mps.synchronize()
    start = time.perf_counter()
    b = torch.linalg.cholesky(A, upper=False)
    torch.mps.synchronize()
    end = time.perf_counter()
    return b, end - start

results = {
    'N': [],
    'batch_size': [],
    'mean_time': [],
    'std_time': []
}

for n in matrix_sizes:
    for batch_size in batch_sizes:
        print(f"\nBenchmarking N={n}, batch_size={batch_size}")

        try:
            A_cpu = create_spd_matrix(n, batch_size)
            A_mps = A_cpu.to("mps")

            for _ in range(warmup_runs):
                _, _ = run_cholesky_mps(A_mps)

            times = []
            for _ in range(num_runs):
                _, t = run_cholesky_mps(A_mps)
                times.append(t)

            mean_time = np.mean(times)
            std_time = np.std(times)

            results['N'].append(n)
            results['batch_size'].append(batch_size)
            results['mean_time'].append(mean_time)
            results['std_time'].append(std_time)

            print(f"Mean time: {mean_time:.4f}s ± {std_time:.4f}s")

        except RuntimeError as e:
            print(f"Error for N={n}, batch_size={batch_size}: {e}")
            continue

with open('cholesky_benchmark_times.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['N', 'batch_size', 'mean_time', 'std_time'])
    for i in range(len(results['N'])):
        writer.writerow([
            results['N'][i],
            results['batch_size'][i],
            results['mean_time'][i],
            results['std_time'][i]
        ])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165867
Approved by: https://github.com/malfet
2025-10-20 18:56:17 +00:00
240c13394e Revert "[inductor] require shape in TritonCSEVariable (#162275)"
This reverts commit 3af2f0c12accc6bd10ef2b76fb5c51aa0f6b73a3.

Reverted https://github.com/pytorch/pytorch/pull/162275 on behalf of https://github.com/clee2000 due to still failing due to the above D84932446 ([comment](https://github.com/pytorch/pytorch/pull/162275#issuecomment-3423153819))
2025-10-20 17:55:54 +00:00
150682ba7f Revert "Remove workaround to old CUDA bug (#164354)"
This reverts commit 26f38034332a99f2bdcc67ce1f4ba9403d420e52.

Reverted https://github.com/pytorch/pytorch/pull/164354 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164354#issuecomment-3423132083))
2025-10-20 17:48:08 +00:00
ca7360e996 Revert "Move toString(ScalarType) and ScalarType ostream operator to headeronly (#164405)"
This reverts commit ca8bd5dbedb5b46f78026e0378b0f47500ddba38.

Reverted https://github.com/pytorch/pytorch/pull/164405 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164354#issuecomment-3423132083))
2025-10-20 17:48:08 +00:00
0bf604320f Revert "[dynamo][user_defined] Replace UserFunctionVariable with VariableTracker build (#165706)"
This reverts commit 1dc9a05d0323ee3c7a20945c62463959d40f1a51.

Reverted https://github.com/pytorch/pytorch/pull/165706 on behalf of https://github.com/clee2000 due to breaking internal tests D84961097 ([comment](https://github.com/pytorch/pytorch/pull/165706#issuecomment-3423059867))
2025-10-20 17:28:58 +00:00
9875e70da8 Revert "[dynamo][misc] Replace UserFunctionVariable with VariableTracker build (#165707)"
This reverts commit 630520b346b8883db7821562e589ccde7d12687a.

Reverted https://github.com/pytorch/pytorch/pull/165707 on behalf of https://github.com/clee2000 due to breaking internal tests D84961097 ([comment](https://github.com/pytorch/pytorch/pull/165706#issuecomment-3423059867))
2025-10-20 17:28:58 +00:00
69a4bfe8bb Revert "Refactor out headeronly ArrayRef (#164991)"
This reverts commit 3806e9767b03d06edc317cb90a3a996abdf192a0.

Reverted https://github.com/pytorch/pytorch/pull/164991 on behalf of https://github.com/clee2000 due to breaking internal tests D84961075 ([comment](https://github.com/pytorch/pytorch/pull/164991#issuecomment-3423058017))
2025-10-20 17:26:42 +00:00
62a263b8d4 Revert "Widen ops support to take in IntHOArrayRef vs only std::vec (#165152)"
This reverts commit e4454947e2c692db1a249591121f8583fefe7df1.

Reverted https://github.com/pytorch/pytorch/pull/165152 on behalf of https://github.com/clee2000 due to breaking internal tests D84961075 ([comment](https://github.com/pytorch/pytorch/pull/164991#issuecomment-3423058017))
2025-10-20 17:26:42 +00:00
0da1f911dc Revert "[Submodule] Bump FBGEMM to latest (#165544)"
This reverts commit 23417ae50f5d9bc02e988d916c103ff3a03c5903.

Reverted https://github.com/pytorch/pytorch/pull/165544 on behalf of https://github.com/clee2000 due to failing in internal D84996252, probably needs some sort of update to fbgemm internally? ([comment](https://github.com/pytorch/pytorch/pull/165544#issuecomment-3422993703))
2025-10-20 17:06:07 +00:00
8700d68fef [ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481)
* Moving rocm.yml from the persistent non-ARC runners on the combined MI2xx (MI210 + MI250) cluster to the ARC runners on the MI250 cluster. This halves the number of nodes but provides access to approximately 4 times as many runners, since every 8-GPU MI250 node now provides eight 1-GPU runners. This should help with concurrent capacity and queueing for the MI2xx jobs.

Tested here successfully: https://github.com/pytorch/pytorch/actions/runs/18620814622/job/53092469720

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165481
Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony, https://github.com/albanD

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2025-10-20 16:06:37 +00:00
ab82456c16 Revert "[1/N] Change C-style casts to static_cast or reinterpret_cast (#165750)"
This reverts commit e1e8491b316df810388d9fa24f135cdba27ab40e.

Reverted https://github.com/pytorch/pytorch/pull/165750 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165750#issuecomment-3422413890))
2025-10-20 14:51:58 +00:00
b23f4687fd [Inductor][CuTeDSL] Move load_template up two directories (#165868)
Summary:
This is a reland of https://github.com/pytorch/pytorch/pull/165347

Moves the function used to load CuTeDSL Jinja templates up one level out of the flex attention folder. This way it can be used for more general Inductor templates in the future.

Test Plan: test/inductor/test_flex_flash

Differential Revision: D85013024

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165868
Approved by: https://github.com/jananisriram
2025-10-20 12:14:38 +00:00
2705937080 [CI] Add rocm CI back to trunk for pre-submit/PR jobs (#165674)
Only adding single-GPU shards for now, to observe how current capacity handles it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165674
Approved by: https://github.com/jeffdaily
2025-10-20 12:14:06 +00:00
c1eda348be [cuda] fix triu/tril int32 overflow for large matrices (#164705)
Fixes #136611

Cast blockIdx.x to int64_t before multiplication to prevent overflow when computing linear_idx for matrices with more than 2^31 elements.
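
A rough repro sketch of the overflow scenario (hypothetical sizes; needs a CUDA device with roughly 5 GB free):

```python
import torch

# 46341 * 46341 ≈ 2.15e9 elements, just past the int32 limit of 2^31 - 1.
# Before the fix, the blockIdx.x-based linear index could overflow for the
# tail elements and corrupt the upper-triangular result.
n = 46341
x = torch.ones((n, n), dtype=torch.uint8, device="cuda")
y = torch.triu(x)
# Spot-check a corner: the last row should be zero except for its diagonal entry.
assert y[-1, 0].item() == 0 and y[-1, -1].item() == 1
```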

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164705
Approved by: https://github.com/eqy, https://github.com/ngimel
2025-10-20 07:17:41 +00:00
ba93d5636e [cuda] fix nll_loss2d backward bounds check with reduction=none (#165247)
Fixes #49882

Add a missing bounds check in the nll_loss2d backward kernel with reduction=none. The forward kernel already had a CUDA_KERNEL_ASSERT for target bounds; now the backward kernel matches.
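
A minimal sketch of the scenario the check guards against (arbitrary shapes, assuming a CUDA device; with an out-of-range class index the kernels now trap with a device-side assert instead of reading out of bounds):

```python
import torch
import torch.nn.functional as F

# 2D NLL loss: input is (N, C, H, W), target is (N, H, W) with class indices in [0, C).
logits = torch.randn(2, 3, 4, 4, device="cuda", requires_grad=True)
log_probs = F.log_softmax(logits, dim=1)
target = torch.randint(0, 3, (2, 4, 4), device="cuda")
target[0, 0, 0] = 7  # invalid: only classes 0..2 exist

loss = F.nll_loss(log_probs, target, reduction="none")
loss.sum().backward()  # expected to fail loudly (device-side assert), not corrupt memory
```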

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165247
Approved by: https://github.com/ngimel
2025-10-20 06:25:11 +00:00
722b2b86c9 [dynamo] Remove duplicated guards (#165806)
This was found by looking at a tlparse of an internal job. We will need a deeper audit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165806
Approved by: https://github.com/jansel
2025-10-20 05:50:33 +00:00
e1e8491b31 [1/N] Change C-style casts to static_cast or reinterpret_cast (#165750)
This series of changes try to cover C style casts into C++ alternatives.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165750
Approved by: https://github.com/Skylion007
2025-10-20 04:36:19 +00:00
767199fd9b [flex_attention] replace sliced BlockMask noop with helpful error (#164702)
Fixes part of #163314

After slicing a BlockMask with `[]`, mask_mod was silently replaced with noop_mask. This caused silently incorrect results when users applied transformations to `sliced_mask.mask_mod`.

Replace noop with `_sliced_mask_mod_error` that raises RuntimeError with guidance to use `base_mask.mask_mod` instead.
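
A small sketch of the new behavior (a hypothetical causal mask_mod, assuming the public `create_block_mask` helper):

```python
import torch
from torch.nn.attention.flex_attention import create_block_mask

def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

# Block mask for batch=2, heads=4, 256 query/key tokens.
base_mask = create_block_mask(causal, B=2, H=4, Q_LEN=256, KV_LEN=256, device="cpu")

sliced_mask = base_mask[0]  # slice out batch 0
try:
    sliced_mask.mask_mod(0, 0, 0, 0)  # previously a silent noop; now raises
except RuntimeError as err:
    print(err)  # the error points users to base_mask.mask_mod

mask_mod = base_mask.mask_mod  # the supported way to recover the original mask_mod
```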

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164702
Approved by: https://github.com/drisspg, https://github.com/BoyuanFeng
2025-10-20 03:46:16 +00:00
602ace5eb4 Revert "[ATen] Fix CUDA reduction warp shuffle order (#164790)"
This reverts commit 36371b8ec7a1baed255c18451b2c716386a54c95.

Reverted https://github.com/pytorch/pytorch/pull/164790 on behalf of https://github.com/clee2000 due to was reverted due to failing internal tests after merge D84992607 ([comment](https://github.com/pytorch/pytorch/pull/164790#issuecomment-3420373755))
2025-10-20 03:06:52 +00:00
47804ce467 Revert "12/n : Remove fbandroid_compiler_flags (#165558)"
This reverts commit aead9270f56ebc7302c7f5fa7e5dff959f26608e.

Reverted https://github.com/pytorch/pytorch/pull/165558 on behalf of https://github.com/clee2000 due to Diff was actually reverted internally D84832629 ([comment](https://github.com/pytorch/pytorch/pull/165558#issuecomment-3420367955))
2025-10-20 03:03:13 +00:00
e8cb34dd52 [Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324)
**Summary:**
Support masked vectorization for the tail_loop for fp8 datatype.

**Example:**
```
import torch

def fn(
    x,
    scale,
    zero_point,
    quant_min,
    quant_max,
    dtype,
):
    x = torch.ops.quantized_decomposed.dequantize_per_tensor(
        x,
        scale,
        zero_point,
        quant_min,
        quant_max,
        dtype,
    )
    x = torch.relu(x)
    x = torch.ops.quantized_decomposed.quantize_per_tensor(
        x, scale, zero_point, quant_min, quant_max, dtype
    )
    return x

quant_min = -128
quant_max = 127
dtype = torch.float8_e4m3fn
x = torch.clamp(torch.randn((1, 7, 7, 9), dtype=torch.float32) * 100, quant_min, quant_max).to(dtype)
zero_point = 100
scale = 0.01

with torch.no_grad():
    compiled_fn = torch.compile(fn)
    compiled_fn(x, scale, zero_point, quant_min, quant_max, dtype)
```

**Generated code:**

- Before
```
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const at::Float8_e4m3fn* in_ptr0,
                       at::Float8_e4m3fn* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
                {
                    auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp2 = static_cast<float>(100.0);
                    auto tmp3 = at::vec::Vectorized<float>(tmp2);
                    auto tmp4 = tmp1 - tmp3;
                    auto tmp5 = static_cast<float>(0.01);
                    auto tmp6 = at::vec::Vectorized<float>(tmp5);
                    auto tmp7 = tmp4 * tmp6;
                    auto tmp8 = (tmp7);
                    auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
                    auto tmp10 = tmp9 * tmp3;
                    auto tmp11 = tmp10.round();
                    auto tmp12 = tmp11 + tmp3;
                    auto tmp13 = static_cast<float>(-128.0);
                    auto tmp14 = at::vec::Vectorized<float>(tmp13);
                    auto tmp15 = at::vec::maximum(tmp12, tmp14);
                    auto tmp16 = static_cast<float>(127.0);
                    auto tmp17 = at::vec::Vectorized<float>(tmp16);
                    auto tmp18 = at::vec::minimum(tmp15, tmp17);
                    auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
                    tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
                {
                    for (int64_t x0_tail = static_cast<int64_t>(432L);x0_tail < static_cast<int64_t>(441L); x0_tail++)
                    {
                        auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
                        auto tmp1 = c10::convert<float>(tmp0);
                        auto tmp2 = static_cast<float>(100.0);
                        auto tmp3 = float(tmp1 - tmp2);
                        auto tmp4 = static_cast<float>(0.01);
                        auto tmp5 = float(tmp3 * tmp4);
                        auto tmp6 = c10::convert<float>(tmp5);
                        auto tmp7 = std::max(tmp6, decltype(tmp6)(0));
                        auto tmp8 = float(tmp7 * tmp2);
                        auto tmp9 = std::nearbyint(tmp8);
                        auto tmp10 = float(tmp9 + tmp2);
                        auto tmp11 = static_cast<float>(-128.0);
                        auto tmp12 = max_propagate_nan(tmp10, tmp11);
                        auto tmp13 = static_cast<float>(127.0);
                        auto tmp14 = min_propagate_nan(tmp12, tmp13);
                        auto tmp15 = c10::convert<at::Float8_e4m3fn>(tmp14);
                        out_ptr0[static_cast<int64_t>(x0_tail)] = tmp15;
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
        buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
        # [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
        cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```
- After
```
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const at::Float8_e4m3fn* in_ptr0,
                       at::Float8_e4m3fn* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
                {
                    auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp2 = static_cast<float>(100.0);
                    auto tmp3 = at::vec::Vectorized<float>(tmp2);
                    auto tmp4 = tmp1 - tmp3;
                    auto tmp5 = static_cast<float>(0.01);
                    auto tmp6 = at::vec::Vectorized<float>(tmp5);
                    auto tmp7 = tmp4 * tmp6;
                    auto tmp8 = (tmp7);
                    auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
                    auto tmp10 = tmp9 * tmp3;
                    auto tmp11 = tmp10.round();
                    auto tmp12 = tmp11 + tmp3;
                    auto tmp13 = static_cast<float>(-128.0);
                    auto tmp14 = at::vec::Vectorized<float>(tmp13);
                    auto tmp15 = at::vec::maximum(tmp12, tmp14);
                    auto tmp16 = static_cast<float>(127.0);
                    auto tmp17 = at::vec::Vectorized<float>(tmp16);
                    auto tmp18 = at::vec::minimum(tmp15, tmp17);
                    auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
                    tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
                {
                    auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
                    auto tmp1 = at::vec::convert<float>(tmp0);
                    auto tmp2 = static_cast<float>(100.0);
                    auto tmp3 = at::vec::Vectorized<float>(tmp2);
                    auto tmp4 = tmp1 - tmp3;
                    auto tmp5 = static_cast<float>(0.01);
                    auto tmp6 = at::vec::Vectorized<float>(tmp5);
                    auto tmp7 = tmp4 * tmp6;
                    auto tmp8 = (tmp7);
                    auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
                    auto tmp10 = tmp9 * tmp3;
                    auto tmp11 = tmp10.round();
                    auto tmp12 = tmp11 + tmp3;
                    auto tmp13 = static_cast<float>(-128.0);
                    auto tmp14 = at::vec::Vectorized<float>(tmp13);
                    auto tmp15 = at::vec::maximum(tmp12, tmp14);
                    auto tmp16 = static_cast<float>(127.0);
                    auto tmp17 = at::vec::Vectorized<float>(tmp16);
                    auto tmp18 = at::vec::minimum(tmp15, tmp17);
                    auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
                    tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
        buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
        # [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
        cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163324
Approved by: https://github.com/Xia-Weiwen, https://github.com/mingfeima, https://github.com/jansel
ghstack dependencies: #163316
2025-10-20 01:56:00 +00:00
e9d8973427 [Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316)
**Summary:**
Support masked vectorization for the tail_loop for float64 datatype.

**Example:**
```
import torch

def fn(x):
    return x * x

x = torch.randn((22, 22), dtype=torch.double)
with torch.no_grad():
    compiled_fn = torch.compile(fn)
    compiled_fn(x)
```

**Generated code:**

- Before
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const double* in_ptr0,
                       double* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
                {
                    auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = tmp0 * tmp0;
                    tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
                {
                    for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++)
                    {
                        auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
                        auto tmp1 = double(tmp0 * tmp0);
                        out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1;
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (22, 22), (22, 1))
        buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
        # [Provenance debug handles] cpp_fused_mul_0:1
        cpp_fused_mul_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```
- After
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(const double* in_ptr0,
                       double* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
        {
            {
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
                {
                    auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                    auto tmp1 = tmp0 * tmp0;
                    tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
                {
                    auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
                    auto tmp1 = tmp0 * tmp0;
                    tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, = args
        args.clear()
        assert_size_stride(arg0_1, (22, 22), (22, 1))
        buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
        # [Provenance debug handles] cpp_fused_mul_0:1
        cpp_fused_mul_0(arg0_1, buf0)
        del arg0_1
        return (buf0, )
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316
Approved by: https://github.com/mingfeima, https://github.com/jansel
2025-10-20 01:41:38 +00:00
61d9a5180e [Fix XPU CI] [Inductor UT] Fix test cases broken by community. (#165714)
Fixes #165719, Fixes #165771

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165714
Approved by: https://github.com/jansel
2025-10-19 23:59:04 +00:00
8a8329b51f [ATen] Switch order of blocked reduce when vectorize loads (#165178)
Performance benchmarking, perf neutral:
```
================================================================================================================================================================================================================================================
Tensor Shape         Operation    Full reduce (ms)     Non-Contig dim (ms)    Contig dim (ms)      Full reduce (ms)     Non-Contig dim (ms)    Contig dim (ms)      Full diff %     Non-Contig diff %    Contig diff %
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(256, 256)           mean         0.015684             0.017056               0.008287             0.016015             0.016929               0.008170                      -2.07%               +0.75%          +1.43%
(256, 256)           sum          0.015774             0.016638               0.007926             0.015811             0.016935               0.008330                      -0.23%               -1.75%          -4.85%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(512, 512)           mean         0.013385             0.025742               0.008629             0.013046             0.026005               0.008924                      +2.60%               -1.01%          -3.31%
(512, 512)           sum          0.013390             0.026059               0.009116             0.013054             0.025696               0.008952                      +2.57%               +1.41%          +1.83%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 1024)         mean         0.014213             0.015467               0.010334             0.013862             0.015082               0.010318                      +2.53%               +2.55%          +0.16%
(1024, 1024)         sum          0.014179             0.015446               0.010774             0.014132             0.015073               0.010350                      +0.33%               +2.47%          +4.10%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(2048, 2048)         mean         0.018234             0.019487               0.014812             0.018482             0.019397               0.014802                      -1.34%               +0.46%          +0.07%
(2048, 2048)         sum          0.018202             0.019529               0.015195             0.018122             0.019485               0.015129                      +0.44%               +0.23%          +0.44%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(4096, 4096)         mean         0.033582             0.039378               0.030751             0.033810             0.039673               0.031019                      -0.67%               -0.74%          -0.86%
(4096, 4096)         sum          0.033604             0.039777               0.030809             0.033530             0.039386               0.031113                      +0.22%               +0.99%          -0.98%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 8192)         mean         0.085824             0.091133               0.084200             0.085431             0.091364               0.084303                      +0.46%               -0.25%          -0.12%
(8192, 8192)         sum          0.085763             0.091442               0.084180             0.085508             0.091419               0.084595                      +0.30%               +0.03%          -0.49%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 16384)        mean         0.146480             0.147666               0.138807             0.146515             0.147987               0.138930                      -0.02%               -0.22%          -0.09%
(8192, 16384)        sum          0.146446             0.147593               0.138559             0.146151             0.147982               0.139120                      +0.20%               -0.26%          -0.40%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 32768)        mean         0.266047             0.265386               0.253837             0.265648             0.265885               0.253652                      +0.15%               -0.19%          +0.07%
(8192, 32768)        sum          0.266093             0.265421               0.253890             0.265458             0.265591               0.253567                      +0.24%               -0.06%          +0.13%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 65536)        mean         0.498632             0.508976               0.481865             0.498237             0.508777               0.481476                      +0.08%               +0.04%          +0.08%
(8192, 65536)        sum          0.498917             0.508202               0.481883             0.498104             0.508016               0.481972                      +0.16%               +0.04%          -0.02%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 131072)       mean         0.957633             0.968519               0.938172             0.956766             0.968267               0.938196                      +0.09%               +0.03%          -0.00%
(8192, 131072)       sum          0.956972             0.968140               0.937741             0.957365             0.968404               0.938056                      -0.04%               -0.03%          -0.03%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 262144)       mean         1.906661             1.928377               1.861846             1.907327             1.928811               1.862083                      -0.03%               -0.02%          -0.01%
(8192, 262144)       sum          1.905976             1.928362               1.862399             1.907098             1.928844               1.861782                      -0.06%               -0.02%          +0.03%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(4096, 262144)       mean         0.956852             0.970101               0.936524             0.957263             0.969809               0.936965                      -0.04%               +0.03%          -0.05%
(4096, 262144)       sum          0.957117             0.969933               0.936247             0.956675             0.969451               0.936395                      +0.05%               +0.05%          -0.02%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(2048, 262144)       mean         0.498813             0.511299               0.483415             0.498567             0.511482               0.483376                      +0.05%               -0.04%          +0.01%
(2048, 262144)       sum          0.498813             0.510834               0.483641             0.498875             0.511036               0.483338                      -0.01%               -0.04%          +0.06%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 262144)       mean         0.266157             0.276751               0.255192             0.265966             0.276808               0.255544                      +0.07%               -0.02%          -0.14%
(1024, 262144)       sum          0.266133             0.276709               0.255528             0.265658             0.276685               0.255287                      +0.18%               +0.01%          +0.09%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(512, 131072)        mean         0.085941             0.081184               0.087931             0.085591             0.080832               0.088008                      +0.41%               +0.44%          -0.09%
(512, 131072)        sum          0.085962             0.081107               0.088045             0.085882             0.081160               0.088024                      +0.09%               -0.07%          +0.02%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1000, 1000)         mean         0.014203             0.045859               0.010310             0.013885             0.046132               0.010621                      +2.29%               -0.59%          -2.93%
(1000, 1000)         sum          0.014180             0.046165               0.010756             0.013893             0.046109               0.010338                      +2.07%               +0.12%          +4.04%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 129)          mean         0.012953             0.016751               0.008536             0.012977             0.016714               0.008916                      -0.18%               +0.22%          -4.26%
(1024, 129)          sum          0.013356             0.016806               0.008722             0.013003             0.017071               0.008611                      +2.71%               -1.55%          +1.29%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 257)          mean         0.013075             0.016787               0.009102             0.013116             0.016769               0.008679                      -0.31%               +0.11%          +4.87%
(1024, 257)          sum          0.013092             0.016842               0.008786             0.013126             0.017128               0.008771                      -0.26%               -1.67%          +0.17%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 587)          mean         0.013662             0.017412               0.010055             0.013659             0.017019               0.010033                      +0.02%               +2.31%          +0.22%
(1024, 587)          sum          0.013636             0.017473               0.010163             0.013642             0.017363               0.010101                      -0.04%               +0.63%          +0.61%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(2048, 977)          mean         0.015276             0.027873               0.012531             0.015241             0.027783               0.012467                      +0.23%               +0.32%          +0.51%
(2048, 977)          sum          0.015345             0.027949               0.012192             0.015255             0.027839               0.012485                      +0.59%               +0.40%          -2.35%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 128)          mean         0.012806             0.014020               0.008291             0.013137             0.014309               0.007908                      -2.52%               -2.02%          +4.84%
(1024, 128)          sum          0.012769             0.014308               0.007924             0.012788             0.014236               0.008038                      -0.15%               +0.51%          -1.42%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 128)          mean         0.014145             0.023049               0.009143             0.014104             0.023298               0.009501                      +0.29%               -1.07%          -3.77%
(8192, 128)          sum          0.014132             0.023082               0.009638             0.014107             0.023331               0.009244                      +0.18%               -1.07%          +4.26%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(1024, 130)          mean         0.013420             0.025834               0.008949             0.013368             0.025724               0.008918                      +0.39%               +0.43%          +0.35%
(1024, 130)          sum          0.013300             0.025940               0.009113             0.013266             0.025419               0.008922                      +0.26%               +2.05%          +2.14%
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
(8192, 130)          mean         0.013993             0.017883               0.009661             0.014275             0.018220               0.009596                      -1.98%               -1.85%          +0.68%
(8192, 130)          sum          0.014026             0.018297               0.010066             0.014326             0.018257               0.009659                      -2.09%               +0.22%          +4.21%
================================================================================================================================================================================================================================================
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165178
Approved by: https://github.com/ngimel
ghstack dependencies: #165494, #164790, #165055
2025-10-19 23:39:05 +00:00
6b80c94901 [FlexAttention] Fix dynamic shaped heads flex_flash check (#165866)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165866
Approved by: https://github.com/BoyuanFeng
ghstack dependencies: #165729
2025-10-19 23:10:16 +00:00
8951df03de test_scaled_matmul_cuda: fix infer_scale_swizzle (#165788)
Extend #165747 fix to other cases.
Add parentheses to clarify operator precedence.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165788
Approved by: https://github.com/jeffdaily, https://github.com/slayton58
2025-10-19 21:42:01 +00:00
8139f33fa5 [dynamo] Add recompile reason for set_stance fail_on_recompile (#165445)
Fixes #163500

### Summary:
With `set_stance("fail_on_recompile")`, failures now report the reason why the recompilation was triggered

### Impacts:
module: dynamo
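
A minimal sketch of the behavior described above (the shapes and the exact error text are illustrative, not taken from this PR):

```python
import torch

@torch.compile
def f(x):
    return x + 1

f(torch.randn(4))  # first call compiles and caches a graph

# A rank change forces a recompile; under "fail_on_recompile" this now raises
# an error that includes the recompile reason instead of a bare failure.
with torch.compiler.set_stance("fail_on_recompile"):
    f(torch.randn(4, 4))
```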

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165445
Approved by: https://github.com/williamwen42
2025-10-19 21:12:19 +00:00
a88587348b [dynamo] Clean up assert in dynamo [1/N] (#165430)
Fixes part of #162852 and #164878; the two issues are related.

* __->__ #165430

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165430
Approved by: https://github.com/Lucaskabela, https://github.com/williamwen42

Co-authored-by: Lucas Kabela <lucasakabela@gmail.com>
2025-10-19 21:00:05 +00:00
633a3b7f67 Revert "shrink_group implementation to expose ncclCommShrink API (#164518)"
This reverts commit fa0db212e717b6cb225159cb32ea3d83baa52381.

Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3419893217))
2025-10-19 19:20:45 +00:00
fa0db212e7 shrink_group implementation to expose ncclCommShrink API (#164518)
Closes #164529

To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch.

This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization.

For more info:  [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518
Approved by: https://github.com/kwen2501
2025-10-19 18:00:08 +00:00
15ff1cd28b Remove E721 suppression in flake8 (#165855)
Currently all files pass the E721 check.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165855
Approved by: https://github.com/albanD
2025-10-19 17:51:12 +00:00
c73f5080de Migrating some more callsites (#163580)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163580
Approved by: https://github.com/avikchaudhuri
ghstack dependencies: #165582
2025-10-19 15:52:17 +00:00
22ae059d32 AOTI util deprecated flow using the new tracer (#165582)
Reapply of https://github.com/pytorch/pytorch/pull/163260

AOTI utils sometimes expect a free function, so the export API is adjusted to handle that; I haven't seen any methods getting exported. Some AOTI flows also require that we populate dynamo_flat_name_to_original_fqn, so I just copy how it is done in eval_frame.py. I also cleaned up how we get rid of export_root and fixed some overcomplicated nn_module_stack handling in export code. The logic is simpler now, thanks to @anijain2305.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165582
Approved by: https://github.com/anijain2305
2025-10-19 15:52:16 +00:00
1b121d636e Fix AllocatorConfig parse roundup division bug (#165304)
* #165288
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165304
Approved by: https://github.com/albanD
ghstack dependencies: #165288, #165289, #165291, #165298
2025-10-19 15:34:44 +00:00
1ba808dd97 Refine CUDA BackendStaticInitializer for allocator select (#165298)
* #165288
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165298
Approved by: https://github.com/albanD
ghstack dependencies: #165288, #165289, #165291
2025-10-19 15:34:44 +00:00
b2f5c25b27 Introduce a generic API torch._C._accelerator_setAllocatorSettings (#165291)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165291
Approved by: https://github.com/albanD
ghstack dependencies: #165288, #165289
2025-10-19 15:34:36 +00:00
a1114beed2 Deprecate overlapped functions in CUDAAllocatorConfig (#165289)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165289
Approved by: https://github.com/albanD
ghstack dependencies: #165288
2025-10-19 15:34:26 +00:00
4888ed440e Refine Allocator Config error message friendly (#165288)
* __->__ #165288
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165288
Approved by: https://github.com/albanD
2025-10-19 15:34:17 +00:00
5d62b63a76 [BE] Use Python-3.14 GE build (#165804)
3.14 reached general availability on Oct 7th 2025, so we can remove all pre-release workarounds
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165804
Approved by: https://github.com/yangw-dev, https://github.com/Skylion007, https://github.com/cyyever
2025-10-19 11:45:10 +00:00
57ba575242 [BE][Ez]: Update torch.is_tensor documentation (#165841)
`TypeIs` propagates the `isinstance` check through the typing system, so the two are now equivalent.
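
A small illustration of the equivalence (only the typing note is specific to this PR):

```python
import torch

x = torch.randn(3)

# Runtime behavior: identical to an isinstance check.
assert torch.is_tensor(x) == isinstance(x, torch.Tensor)

# Static typing: because is_tensor is annotated with TypeIs, type checkers
# narrow `obj` to torch.Tensor inside the branch, just like isinstance would.
def maybe_numel(obj: object) -> int:
    if torch.is_tensor(obj):
        return obj.numel()
    return 0
```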

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165841
Approved by: https://github.com/albanD
2025-10-19 09:24:11 +00:00
ceb11a584d [BE]: Update kleidai submodule to v1.15.0 (#165842)
This mostly adds a few new kernels, fixes some IMAs, and improves the performance of previous kernels. It also improves compiler support.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165842
Approved by: https://github.com/albanD
2025-10-19 08:25:03 +00:00
33adb276fe [BE][Ez]: Update Eigen to 5.0.0. C++14 support and more! (#165840)
Update the Eigen pin to 5.0.0. Tons of new features and perf improvements. Most importantly, it raises the minimum standard from C++03 to C++14, which enables a ton of performance optimizations like properly implemented move operators, simplified code, etc. It also improves vectorization, particularly on ARM. We really only use this library as a fallback for sparse operators, but it is still useful to update it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165840
Approved by: https://github.com/albanD
2025-10-19 08:00:06 +00:00
e939651972 [audio hash update] update the pinned audio hash (#165807)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165807
Approved by: https://github.com/pytorchbot
2025-10-19 04:45:20 +00:00
3255e7872b Enable all flake8-logging-format rules (#164655)
These rules are enabled by removing existing suppressions.
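
For context, a representative pattern this rule family flags (G004 shown here; the commit message does not list the individual rule codes, so treat this as illustrative):

```python
import logging

logger = logging.getLogger(__name__)
count = 3

logger.info(f"loaded {count} items")   # flagged: f-string formats eagerly
logger.info("loaded %d items", count)  # preferred: lazy %-style arguments
```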

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655
Approved by: https://github.com/janeyx99, https://github.com/mlazos
2025-10-19 00:59:28 +00:00
c4f6619330 Enable more DTensor tests in local tensor mode and fix more integration issues (#165716)
- During op dispatch, local tensor is supposed to collect the rng state from the CPU and CUDA
devices so that it can be reset before the op is executed for each rank, ensuring that ops
with randomness produce the same result for all ranks (note that we are planning a
separate change to add support for per-rank rng state). Previously we relied on
op input arguments to deduce which devices to get the rng state from, which doesn't work
for factory functions such as torch.randn. Hence this change switches to unconditionally
collecting the rng state from all devices.

- Fixes per-rank-specific computations in the _MaskedPartial and Shard placements, discovered
during test enablement.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165716
Approved by: https://github.com/ezyang
2025-10-18 23:33:24 +00:00
f18041cca8 Fix missing closing quote in __init__.py documentation (#165827)
Title says it all.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165827
Approved by: https://github.com/Skylion007
2025-10-18 22:09:18 +00:00
35e51893bd Remove CUDA 11 workarounds for CUB_SUPPORTS_SCAN_BY_KEY and CUB_SUPPORTS_UNIQUE_BY_KEY (#164637)
`CUB_SUPPORTS_SCAN_BY_KEY` and `CUB_SUPPORTS_UNIQUE_BY_KEY` are true since CUDA 12. This PR removes the old branches and source files.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164637
Approved by: https://github.com/ezyang
2025-10-18 20:05:54 +00:00
1f43d17ce6 Fix self assignment (#165816)
This PR removes assignments of the form `var=var`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165816
Approved by: https://github.com/jansel
2025-10-18 18:51:52 +00:00
032bed95cd Various C++ code fixes in LSAN integration (#165818)
This PR extracts the C++ code fixes from #154584, which are fixes in enabling LSAN.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165818
Approved by: https://github.com/ezyang
2025-10-18 17:59:23 +00:00
d14cbb4476 Add NVFP4 two-level scaling to scaled_mm (#165774)
Summary:

* Add second-level scaling dispatch to scaled_mm, tying into optional `alpha` passing
* Add two-level tests

Test Plan:

```
pytest -svv -k "nvfp4_global_scale" test/test_scaled_matmul_cuda.py
```

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165774
Approved by: https://github.com/drisspg
2025-10-18 13:06:04 +00:00
f510d0dbc0 Clarifying input/output angle unit in the docs for trigonometric functions (#161248)

Fixes #[160995](https://github.com/pytorch/pytorch/issues/160995)

Modified the docs to clarify that input tensor values for torch.sin, torch.cos and torch.tan should be in radians, and that the output tensor values for torch.acos, torch.asin and torch.atan are in radians.
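
A short example of the convention being documented:

```python
import torch

deg = torch.tensor([0.0, 90.0, 180.0])
rad = torch.deg2rad(deg)             # inputs to sin/cos/tan are radians
print(torch.sin(rad))                # ~[0., 1., 0.] up to float error

out = torch.asin(torch.tensor(1.0))  # outputs of asin/acos/atan are radians
print(torch.rad2deg(out))            # 90.0
```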
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161248
Approved by: https://github.com/isuruf

Co-authored-by: Isuru Fernando <isuruf@gmail.com>
2025-10-18 11:53:48 +00:00
beb6b62e8c Revert "Enable more DTensor tests in local tensor mode and fix more integration issues (#165716)"
This reverts commit 1b397420f22b22f90a1093233ecd9167656e50cb.

Reverted https://github.com/pytorch/pytorch/pull/165716 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165716#issuecomment-3418083391))
2025-10-18 09:15:49 +00:00
4740ce7787 [CP] Fix load balancer incorrectly assuming batch dimension exists (#165792)
https://github.com/pytorch/pytorch/pull/163617 removes the if/else statement to check if the input buffers have the batch dimension.

This PR fixes the issue and also adds a test.

In the future, we should explicitly ask users to unsqueeze the batch dimension. That would be a BC-breaking change to the existing contract, but implicitly inferring the existence of the batch dimension is not safe.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165792
Approved by: https://github.com/XilunWu
2025-10-18 09:11:16 +00:00
ad67170c8b [MPS] sparse matmuls (#165232)
Implements matmuls for sparse tensors. With this commit most of the core sparse operations should be implemented. Fixes:
https://github.com/pytorch/pytorch/issues/156540
https://github.com/pytorch/pytorch/issues/129842

Should be merged after:
https://github.com/pytorch/pytorch/pull/165102

To compare MPS and CPU, you can use this script:
```python
import torch
import time
import matplotlib.pyplot as plt

B, I, J, K = 8, 20000, 20000, 20000
num_iterations = 500

nnz_values = [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 100000]
speedups = []

for nnz in nnz_values:
    indices = torch.stack([
        torch.randint(0, B, (nnz,)),
        torch.randint(0, I, (nnz,)),
        torch.randint(0, J, (nnz,)),
    ])
    values = torch.rand(nnz)

    sparse = torch.sparse_coo_tensor(indices, values, size=(B, I, J), device="mps").coalesce()
    dense = torch.randn(B, J, 200, device="mps")

    t1 = time.time()
    for _ in range(num_iterations):
        result = torch.bmm(sparse, dense)
    torch.mps.synchronize()
    t2 = time.time()
    mps_time = (t2 - t1) / num_iterations

    sparse_cpu = sparse.cpu()
    dense_cpu = dense.cpu()
    t1 = time.time()
    for _ in range(num_iterations):
        result_cpu = torch.bmm(sparse_cpu, dense_cpu)
    t2 = time.time()
    cpu_time = (t2 - t1) / num_iterations

    speedup = cpu_time / mps_time
    speedups.append(speedup)
    print(f"nnz={nnz}: MPS={mps_time:.6f}s, CPU={cpu_time:.6f}s, Speedup={speedup:.2f}x")

plt.figure(figsize=(10, 6))
plt.plot(nnz_values, speedups, marker='o', linewidth=2, markersize=8)
plt.xlabel('Number of Non-Zero Elements (nnz)', fontsize=12)
plt.ylabel('Speedup (CPU time / MPS time)', fontsize=12)
plt.title('MPS vs CPU Speedup for Sparse-Dense BMM', fontsize=14)
plt.grid(True, alpha=0.3)
plt.axhline(y=1, color='r', linestyle='--', alpha=0.5)
plt.xscale('log')
plt.tight_layout()
plt.show()
```

## Tested on M1 Pro
<img width="1000" height="600" alt="Figure_1" src="https://github.com/user-attachments/assets/4a2402ec-3dc4-402d-8196-a0426906ca3d" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165232
Approved by: https://github.com/malfet
2025-10-18 09:04:42 +00:00
fdab48a7c1 Enable all PIE rules on ruff (#165814)
This PR enables all PIE rules on ruff, there are already some enabled rules from this family, the new added rules are
```
PIE796  Enum contains duplicate value: {value}
PIE808  Unnecessary start argument in range
```
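
For reference, the kind of code the new rules flag:

```python
import enum

# PIE808: the explicit 0 start argument to range() is redundant
for i in range(0, 10):  # flagged
    pass
for i in range(10):     # preferred
    pass

# PIE796: duplicate Enum values usually indicate a copy/paste mistake
class Color(enum.Enum):
    RED = 1
    BLUE = 1  # flagged: duplicate value (becomes an alias of RED)
```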

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165814
Approved by: https://github.com/ezyang
2025-10-18 07:36:18 +00:00
a0948d4d23 [ROCm][inductor] autotune support for persistent reduction kernels (#163908)
After the removal of want_no_x_dim for persistent reduction kernels, we can improve the autotuning setup for persistent reduction kernels.

Currently, even with tuning enabled, filtering will only try a single config in many cases. Avoid filtering in autotune mode, and override the MAX_BLOCK limit. Also always include tiny_config when autotuning is enabled.

Contributions from several members of the AMD Inductor and Triton teams: @jataylo @iupaikov-amd @AmdSampsa @xiaohuguo2023

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163908
Approved by: https://github.com/jansel, https://github.com/PaulZhang12
2025-10-18 07:33:24 +00:00
0bbdd6b8db [ROCm][inductor] heuristic improvements for pointwise kernels (#163197)
Heuristic improvements for pointwise kernels for MI350.

Contributions from several members of the AMD Inductor and Triton teams:
@jataylo @AmdSampsa @iupaikov-amd @xiaohuguo2023

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163197
Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/jansel

Co-authored-by: AmdSampsa <sampsa.riikonen@amd.com>
Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com>
2025-10-18 07:23:41 +00:00
24520b8386 Revert "Enable all PIE rules on ruff (#165814)"
This reverts commit c79dfdc6550e872783aa5cb5fc9e86589bf18872.

Reverted https://github.com/pytorch/pytorch/pull/165814 on behalf of https://github.com/cyyever due to Need to cover more files ([comment](https://github.com/pytorch/pytorch/pull/165814#issuecomment-3417931863))
2025-10-18 07:21:08 +00:00
c79dfdc655 Enable all PIE rules on ruff (#165814)
This PR enables all PIE rules on ruff, there are already some enabled rules from this family, the new added rules are
```
PIE796  Enum contains duplicate value: {value}
PIE808  Unnecessary start argument in range
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165814
Approved by: https://github.com/ezyang
2025-10-18 06:40:12 +00:00
e595136187 Enable PLC1802 on ruff (#165813)
This PR enables ruff check `PLC1802`, which detects len calls on sequences in a boolean test context.
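
An example of the pattern PLC1802 flags:

```python
items = [1, 2, 3]

if len(items):  # flagged: len() used only for its truthiness
    print("non-empty")

if items:       # preferred: rely on the sequence's own truthiness
    print("non-empty")
```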

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165813
Approved by: https://github.com/ezyang
2025-10-18 05:44:14 +00:00
aaac8cb0f5 [1/N] Add strict parameter to Python zip calls (#165531)
Add `strict=True/False` to zip calls in test utils. `strict=True` is passed when possible.
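
For context, what the flag changes (Python 3.10+):

```python
names = ["a", "b", "c"]
values = [1, 2]

print(list(zip(names, values)))            # [('a', 1), ('b', 2)] -- "c" silently dropped

try:
    list(zip(names, values, strict=True))  # length mismatch becomes an error
except ValueError as e:
    print(e)
```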

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165531
Approved by: https://github.com/Skylion007
2025-10-18 05:26:33 +00:00
0f0b4bf029 [1/N] Remove unused header inclusion (#165763)
This PR removes unused header inclusion in C++ files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165763
Approved by: https://github.com/Skylion007
2025-10-18 05:23:11 +00:00
b8194268a6 Remove unnecessary noqa suppressions (#164106)
This PR removes unused `noqa` suppressions in Python code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164106
Approved by: https://github.com/albanD
2025-10-18 04:52:41 +00:00
f02e3947f6 Expand type checking to mypy strict files (#165697)
Expands Pyrefly type checking to check the files outlined in the mypy-strict.ini configuration file:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165697
Approved by: https://github.com/ezyang
2025-10-18 04:34:45 +00:00
9095a9dfae [CD] Apply the fix from #162455 to aarch64+cu129 build (#165794)
When trying to bring cu129 back in https://github.com/pytorch/pytorch/pull/163029, I mainly looked at https://github.com/pytorch/pytorch/pull/163029 and missed another tweak coming from https://github.com/pytorch/pytorch/pull/162455

I discovered this issue when testing aarch64+cu129 builds in https://github.com/pytorch/test-infra/actions/runs/18603342105/job/53046883322?pr=7373. Surprisingly, there is no test running for the aarch64 CUDA build from what I see in 79a37055e7.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165794
Approved by: https://github.com/malfet
2025-10-18 04:16:24 +00:00
d9f94e0d7d [dynamo] Support fx.traceback.annotate as decorator (#165805)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165805
Approved by: https://github.com/Lucaskabela, https://github.com/SherlockNoMad, https://github.com/yushangdi
2025-10-18 03:58:11 +00:00
23417ae50f [Submodule] Bump FBGEMM to latest (#165544)
Summary:

* FBGEMM submodule updated to main
* CMake updated to reflect necessary changes
* Notably pulls in NVFP4 grouped gemm kernels

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165544
Approved by: https://github.com/cyyever, https://github.com/jeffdaily
2025-10-18 03:58:08 +00:00
e4d6c56ffb Improve dynamo graph capture stack trace for custom ops (#165693)
For a custom op
```
@torch.library.custom_op("my_lib::foo", mutates_args={})
def foo(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return x + y
```
People could call `torch.ops.my_lib.foo()` or directly call `foo()` in the `forward` of an `nn.Module`.

These two calling conventions will lead to the same node in the output graph, but different stack traces.

When directly calling `foo()`, the displayed stack_trace in the graph will be
```
# File: .../pytorch/torch/_library/custom_ops.py:687 in __call__, code: return self._opoverload(*args, **kwargs)
```
This is not useful so we filter it out.

```
python test/functorch/test_aot_joint_with_descriptors.py -k test_custom_op_stack_trace
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165693
Approved by: https://github.com/SherlockNoMad, https://github.com/williamwen42
2025-10-18 03:48:18 +00:00
017d2985f3 set unbacked bindings in reinplace pass for newly created nodes during generalize_scatter decomp (#164948)
Two fixes:
1. In the reinplace pass, set unbacked bindings for newly created nodes.
2. In inductor, ComputeBuffer used to miss detecting some used symbols; fixed that.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164948
Approved by: https://github.com/bobrenjc93
ghstack dependencies: #164341
2025-10-18 03:20:30 +00:00
c6a8db0b9a Fix issues with generalized_scatter and setitem allocated unbacked symbols. (#164341)
Three fixes:
1. When doing t[u0] += 1, if u0 is unbacked we could allocate a new unbacked symbol during the indexing of t[u0] (when we fake-trace setitem), namely because meta_select does allocate a new unbacked symbol for the storage offset when we do not know if u0>=0 or u0<0. But the output size/stride of setitem() does not depend on that new symbol; it is self-consumed in setitem, so we shall ignore it.

2. Also, when we trace through generalized_scatter, the applications of the views could allocate unbacked symints, but those do not affect the final output, so we shall ignore them as well.

3. Before accessing strides in lowering, we shall materialize.

Address  https://github.com/pytorch/pytorch/issues/114293 and https://github.com/pytorch/pytorch/issues/131911

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164341
Approved by: https://github.com/bobrenjc93
2025-10-18 03:20:30 +00:00
de09bab4b6 [BE]: Update cudnn frontend submodule to 1.15.0 (#165776)
Update cudnn frontend submodule to 1.15.0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165776
Approved by: https://github.com/eqy
2025-10-18 02:23:27 +00:00
c137e222d4 .venv/ in .gitignore (#165418)
`uv venv` creates the virtual environment in a `.venv/` directory, so it's useful to have `.venv/` in `.gitignore`, since more people are now using `uv` in their work. As per comment 3592f5f4e5 (diff-bc37d034bad564583790a46f19d807abfe519c5671395fd494d8cce506c42947)

The uv docs confirm it: https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165418
Approved by: https://github.com/ezyang
2025-10-18 02:00:52 +00:00
cf3a787bbc [annotate] Annotate bw nodes before eliminate dead code (#165782)
Fixes https://github.com/pytorch/torchtitan/pull/1907

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165782
Approved by: https://github.com/SherlockNoMad
2025-10-18 01:54:31 +00:00
de3da77cf7 Thread deterministic config vars to subproc compilation (#165729)
# Summary

TIL (AFTER WAYYYY TOO MUCH INSANITY), that we do not serialize the full set of configs for the subproc compilation.

I found this while working on Flex-attention determinism: https://github.com/meta-pytorch/attention-gym/pull/168

might be good to audit if we need to thread through any more

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165729
Approved by: https://github.com/shunting314, https://github.com/eellison
2025-10-18 01:25:50 +00:00
543ddbf44c [ONNX] Support renaming in dynamic axes to shapes conversion (#165769)
Discovered in #165748

This PR also deprecates the conversion. The ONNX exporter team does not intend to maintain the conversion in the long term.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165769
Approved by: https://github.com/justinchuby
2025-10-18 01:11:20 +00:00
e9f4999985 [Code Clean] Replace std::runtime_error with TORCH_CHECK (#165305)
Fixes part of #148114

Including:

- torch/csrc/distributed

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165305
Approved by: https://github.com/FFFrog, https://github.com/albanD
2025-10-18 01:08:44 +00:00
29b029648e Fixed issue with GradTrackingTensor not properly propagating sparse layout (#165765)
Fixes #164286

Fixed issue with GradTrackingTensor not properly propagating sparse layout.

@ezyang @jcaip
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165765
Approved by: https://github.com/ezyang
2025-10-18 01:00:53 +00:00
a25a649e70 [Mem Snapshot] Add Metadata Field (#165490)
Summary:
The implementation adds the ability to:

- Set custom metadata strings that will be attached to all subsequent allocations
- Clear or change the metadata at any point
- View the metadata in memory snapshots via `_dump_snapshot()`

Test Plan: Added a test in test_cuda.py and checked manually in a snapshot to see that the metadata was added.

Differential Revision: D84654933

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165490
Approved by: https://github.com/yushangdi
2025-10-17 23:46:02 +00:00
69c33898fa Revert "[Inductor][CuTeDSL] Move load_template up two directories (#165347) (#165576)"
This reverts commit febb60323018948b2b9d2cff35b3cc4e0d0c55c8.

Reverted https://github.com/pytorch/pytorch/pull/165576 on behalf of https://github.com/seemethere due to This was actually reverted internally, current PR is linked to a stale diff so diff train tools think that this is landed via co-dev when it was actually reverted ([comment](https://github.com/pytorch/pytorch/pull/165576#issuecomment-3417510146))
2025-10-17 23:33:17 +00:00
1b397420f2 Enable more DTensor tests in local tensor mode and fix more integration issues (#165716)
- During op dispatch, local tensor is supposed to collect the rng state from the CPU and CUDA
devices so that it can be reset before the op is executed for each rank, ensuring that ops
with randomness produce the same result for all ranks (note that we are planning a
separate change to add support for per-rank rng state). Previously we relied on
op input arguments to deduce which devices to get the rng state from, which doesn't work
for factory functions such as torch.randn. Hence this change switches to unconditionally
collecting the rng state from all devices.

- Fixes per-rank-specific computations in the _MaskedPartial and Shard placements, discovered
during test enablement.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165716
Approved by: https://github.com/ezyang
2025-10-17 23:28:22 +00:00
fe80f03726 Add B200 files to labeler and update codeowners (#165767)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165767
Approved by: https://github.com/slayton58
2025-10-17 23:24:17 +00:00
e50dc40d28 Revert "Update gm.print_readable to include Annotation (#165397)"
This reverts commit 7a657700131f31577544e93587eb339618677e97.

Reverted https://github.com/pytorch/pytorch/pull/165397 on behalf of https://github.com/malfet due to I don't know how/why, but it breaks windows tests, see 2e22b1a61e/1 ([comment](https://github.com/pytorch/pytorch/pull/165397#issuecomment-3417428128))
2025-10-17 22:35:50 +00:00
2e22b1a61e [pytorch] Composite backend potential fix for is_backend_available (#165061)
Summary: `is_backend_available` takes in a string and expects it to be only a backend; if it is given a composite (device:backend) string, it fails.
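
A sketch of the call pattern being fixed (the composite-string example is an assumption based on the summary, not a test case from this diff):

```python
import torch.distributed as dist

print(dist.is_backend_available("gloo"))      # backend-only string: already worked
print(dist.is_backend_available("cpu:gloo"))  # composite "device:backend" string:
                                              # previously failed, handled after this fix
```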

Reviewed By: prashrock

Differential Revision: D81886736

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165061
Approved by: https://github.com/H-Huang
2025-10-17 22:06:36 +00:00
616c6bdf8f [dynamo][ac] Config flag to allow eager and compile AC divergence for side-effects (#165775)
Eager AC/SAC reapplies mutations (like global dict mutations) in the backward during the recomputation of the forward. torch.compile has no easy way to reapply Python mutations in the backward, but many users might be OK with skipping the reapplication of side effects in the backward. They can set this config flag to accept this divergence between eager and compile.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165775
Approved by: https://github.com/zou3519
ghstack dependencies: #165734
2025-10-17 22:04:19 +00:00
c18ddfc572 [dynamo][easy] Support torch.accelerator.current_accelerator (#165734)
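A small sketch of the API that dynamo can now trace (the compiled function is illustrative):

```python
import torch

@torch.compile(fullgraph=True)
def scale(x):
    # current_accelerator() returns the active accelerator device (e.g. cuda)
    # or None on a CPU-only build; the branch is resolved at trace time.
    acc = torch.accelerator.current_accelerator()
    return x * 2 if acc is not None else x

print(scale(torch.ones(3)))
```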
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165734
Approved by: https://github.com/Skylion007
2025-10-17 22:04:19 +00:00
86ebce1766 [precompile] Pass tensor_to_context to backend. (#165702)
Summary:

Fixing a VLLM issue https://github.com/vllm-project/vllm/issues/27040 where
aot precompile fails on some models using symbolic shapes in inductor.

Test Plan:
pp HF_HUB_DISABLE_XET=1 VLLM_ENABLE_V1_MULTIPROCESSING=0 VLLM_USE_AOT_COMPILE=1 vllm bench latency --model microsoft/DialoGPT-small --input-len 128 --output-len 256 --num-iters 50 --dtype float16

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165702
Approved by: https://github.com/tugsbayasgalan
2025-10-17 21:52:04 +00:00
8cb2fb44f2 [Inductor] Support fallback for all gemm like ops (#165755)
Summary: Fill op_override field for bmm aten ops so they can be converted properly in the wrapper_fxir backend

Reviewed By: StellarrZ

Differential Revision: D84840948

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165755
Approved by: https://github.com/blaine-rister
2025-10-17 21:08:29 +00:00
ab65498d71 Fix _StridedShard incorrect split (#165533)
https://github.com/pytorch/pytorch/pull/164820 introduced a bug where `_StridedShard` calls the parent class `Shard`'s `split_tensor` method, which results in incorrect data locality. (I think @ezyang spotted this issue, but we have no test to capture it.)

Meanwhile, I noticed another bug: when we normalize a `_StridedShard`'s placement, it also triggers the parent class `Shard`'s `split_tensor` method because it creates a Shard class [here](0c14f55de6/torch/distributed/tensor/_api.py (L783)). I don't think we ever tested `distribute_tensor` for `_StridedShard` before, so I added a test here to compare against the ordered shard.

Using a classmethod because the _split_tensor logic is different between `Shard` and `_StridedShard`. Basically I want to shard on local tensors without initializing the Shard object:
```
local_tensor = _StridedShard._make_shard_tensor(dim, tensor, mesh, mesh_dim, split_factor=split_factor)
local_tensor = Shard._make_shard_tensor(dim, tensor, mesh, mesh_dim)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165533
Approved by: https://github.com/XilunWu
2025-10-17 20:54:46 +00:00
06d324365c Revert "Escaped html tags name and target to appear as strings (#165543)"
This reverts commit 080365b7d82a3c99c995cab6dc912b7dfe22aa41.

Reverted https://github.com/pytorch/pytorch/pull/165543 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165543#issuecomment-3417102048))
2025-10-17 20:45:48 +00:00
6c9c6e0936 Enable C407 of flake8 (#165046)
This PR enables C407 on flake8. The description is `C407` is `Unnecessary list comprehension - ‘<builtin>’ can take a generator`.
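
An example of what C407 flags:

```python
values = [3, 1, 2]

total = sum([v * v for v in values])  # flagged: builds a throwaway list
total = sum(v * v for v in values)    # preferred: pass the generator directly
```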
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165046
Approved by: https://github.com/albanD
2025-10-17 20:15:39 +00:00
2bcd892c86 [distributed] Replace assert statements in distributed checkpoint with explicit checks (#165256)
Fixes partially #164878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165256
Approved by: https://github.com/albanD
2025-10-17 20:14:35 +00:00
75e2a9fae3 [annotate] add annotate_fn function decorator (#165703)
Example usage:

```
        @fx_traceback.annotate_fn({"pp_stage": 1})
        def example_function(x):
            return x * x

        class SimpleLinear(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(3, 2)

            def forward(self, x):
                with fx_traceback.annotate({"pp_stage": 0}):
                    y = self.linear(x)
                y = example_function(y)
                return y - 1
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165703
Approved by: https://github.com/SherlockNoMad
2025-10-17 20:10:53 +00:00
a16fd6b488 [NVSHMEM][Triton] Fix NVSHMEM triton test for wacky world sizes (#165704)
The test currently assumes a world size divisible by 4(?).

Not as slick as the old setup code, but more general.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165704
Approved by: https://github.com/Skylion007, https://github.com/kwen2501
2025-10-17 19:33:26 +00:00
382b0150de [docs] Add usage examples to ConvTranspose1d docstring (#165618)
Fixes #165615

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165618
Approved by: https://github.com/mikaylagawarecki
2025-10-17 19:11:57 +00:00
a664b299ac Update docs for torch.mode (#165614)
Currently the docs for `torch.mode` include a note:

`This function is not defined for torch.cuda.Tensor yet.`

However with `torch==2.7.1+cu126` when I try to get the mode of a Tensor that is in cuda memory, I do not face any issues:

```
>>> a = torch.tensor([0, 2, 1, 1, 1, 3, 3])
>>> a.mode()
torch.return_types.mode(
values=tensor(1),
indices=tensor(4))
>>> a.cuda().mode()
torch.return_types.mode(
values=tensor(1, device='cuda:0'),
indices=tensor(4, device='cuda:0'))
```

Am I misunderstanding the note? If not, I suggest removing it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165614
Approved by: https://github.com/mikaylagawarecki
2025-10-17 19:06:33 +00:00
9c12651417 Improve error message for non-positive groups in convolution (#165669)
Prevents a segmentation fault for invalid groups values in convolution.

Fixes #142835
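
A minimal repro sketch of the guarded case (the exact error type and message are assumptions, not quoted from this PR):

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 4, 8, 8)
w = torch.randn(4, 4, 3, 3)

try:
    F.conv2d(x, w, groups=0)  # previously could segfault; now expected to raise
except Exception as e:        # error type assumed, hence the broad except
    print(type(e).__name__, e)
```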

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165669
Approved by: https://github.com/mikaylagawarecki
2025-10-17 19:06:05 +00:00
08c97b4a1f Don't run compile inside kernel invocation (#165687)
When we call torch.compile during fake tensor prop, we shouldn't actually compile, because we can't guarantee that the compiled artifact can be fake-tensor-propagated (for example, with the inductor backend). Instead we should just skip compiling. The inner compile will still be triggered when the function is executed at runtime.

Fixes: https://github.com/pytorch/pytorch/issues/151328

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165687
Approved by: https://github.com/zou3519
2025-10-17 19:03:57 +00:00
fae74cd52f Revert "shrink_group implementation to expose ncclCommShrink API (#164518)"
This reverts commit a032510db38e8331afa08f7635d146f9cefdd0ab.

Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3416718767))
2025-10-17 18:55:53 +00:00
7a65770013 Update gm.print_readable to include Annotation (#165397)
Sample output
```
[rank0]:        # Annotation: {'compile_with_inductor': 'flex_attention'} File: /data/users/bahuang/pytorch/torch/nn/attention/flex_attention.py:1490 in flex_attention, code: out, lse, max_scores = flex_attention_hop(
[rank0]:        score_mod_2 = self.score_mod_2
[rank0]:        mask_fn_2 = self.mask_fn_2
[rank0]:        flex_attention_1 = torch.ops.higher_order.flex_attention(xq_5, xk_5, xv_3, score_mod_2, (2048, 2048, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_indices, 128, 128, mask_fn_2), 0.25, {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False}, (), (g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___mask_mod___closure___0_cell_contents,));  xq_5 = xk_5 = xv_3 = score_mod_2 = mask_fn_2 = None
[rank0]:        out_2: "bf16[8, 4, 2048, 16]" = flex_attention_1[0];  flex_attention_1 = None
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165397
Approved by: https://github.com/yushangdi, https://github.com/anijain2305
2025-10-17 18:35:18 +00:00
e4454947e2 Widen ops support to take in IntHOArrayRef vs only std::vec (#165152)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165152
Approved by: https://github.com/mikaylagawarecki
ghstack dependencies: #164991
2025-10-17 18:32:39 +00:00
3806e9767b Refactor out headeronly ArrayRef (#164991)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164991
Approved by: https://github.com/swolchok
2025-10-17 18:32:39 +00:00
b08d8c2e50 Revert "[DebugMode][2/N] add nn.Module tracking (#165498)"
This reverts commit 45afaf08a14ab760d86ea80dea6d50cec8626513.

Reverted https://github.com/pytorch/pytorch/pull/165498 on behalf of https://github.com/seemethere due to First part of the stack was reverted so will need to revert this too ([comment](https://github.com/pytorch/pytorch/pull/165498#issuecomment-3416618198))
2025-10-17 18:22:48 +00:00
ca5b7f8ded torch.compile: populate compiler_config (#165581)
Summary: This starts writing the compiler_config metadata into the logger.

Test Plan:
Modified an existing test case to make sure this is not null.
(Also eyeballed what we're logging to make sure it's reasonable.)

Reviewed By: masnesral

Differential Revision: D84014636

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165581
Approved by: https://github.com/masnesral
2025-10-17 18:21:18 +00:00
9a71d96256 Revert "[DebugMode][1/N] refactor logs into _DebugCalls (#165376)"
This reverts commit 556fc09a9f67f24ca5591ec049c5d0c347c5f62a.

Reverted https://github.com/pytorch/pytorch/pull/165376 on behalf of https://github.com/seemethere due to This is failing for internal tests, see D84877379 for more context ([comment](https://github.com/pytorch/pytorch/pull/165376#issuecomment-3416570407))
2025-10-17 18:08:59 +00:00
0d4c2b71e8 [DeviceMesh] Simplify unflatten method (#165556)
By adding a few small helpers (e.g., a `splice` method to `_MeshLayout`, and making `_init_process_groups` static and thus stateless) we can substantially shorten the definition of the unflatten method, and help readability.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165556
Approved by: https://github.com/fduwjj
ghstack dependencies: #165554, #165555
2025-10-17 17:57:51 +00:00
d659bbde62 [DeviceMesh] Introduce private constructor instead of _create_mesh_from_ranks (#165555)
The refactoring of DeviceMesh is heavily constrained by the signature of its constructor, which is a public API which contains some "legacy" concepts which we'd love to get rid of, such as an explicit/materialized `mesh` Tensor.

In other languages the solution to this would be to add a private overload of the constructor. Python doesn't natively allow this, but in this PR I managed to build something that approximates it.

This new private constructor basically only takes `_layout`, `_global_rank_permutation`, and `mesh_dim_names`.

With such a constructor we can effectively simplify a lot of callsites and get rid of the `_create_mesh_from_ranks` helper method. That's a good thing because it was instantiating many DeviceMeshes in a for loop, which always felt unnecessary.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165555
Approved by: https://github.com/fduwjj, https://github.com/fegin
ghstack dependencies: #165554
2025-10-17 17:57:51 +00:00
58879bfafa [DeviceMesh] Prefer using _layout over _mesh for all sorts of things (#165554)
The goal of this PR is to avoid storing the explicit `mesh` Tensor inside each DeviceMesh, and instead compute it on-the-fly when the end user needs it, and try to replace all of its internal usages with `_layout` and the newly-introduced `_global_rank_permutation` Tensor. The name of this attribute is up for debate. The advantage of the `_global_rank_permutation` Tensor is that it is _the same_ Tensor for the root mesh and all its children, so it doesn't need to be copied/reallocated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165554
Approved by: https://github.com/fduwjj
2025-10-17 17:57:51 +00:00
a032510db3 shrink_group implementation to expose ncclCommShrink API (#164518)
Closes #164529

To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch.

This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization.

For more info:  [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518
Approved by: https://github.com/Skylion007, https://github.com/syed-ahmed, https://github.com/kwen2501
2025-10-17 17:55:03 +00:00
39e0a832c9 Fix B200 test fails in scaled_mm (#165747)
Summary:

PR #165528 changes some scale/swizzle inference behavior in scaled_mm
tests - mxfp8 tests on Blackwell can get incorrectly classified,
resulting in failures.

Fix the scale/swizzle inference code to prevent this.

Fixes https://github.com/pytorch/pytorch/issues/165743

Test Plan:

```
pytest -svv test/test_scaled_matmul_cuda.py
```

Reviewers:

@jagadish-amd @jeffdaily @drisspg

Subscribers:

@Aidyn-A

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165747
Approved by: https://github.com/eqy, https://github.com/drisspg, https://github.com/jeffdaily
2025-10-17 17:52:19 +00:00
dd3b48e85d Fix bug with serialization after AOTAutogradCache hit (#165474)
Fixes #165447

On AOTAutogradCache load, the serialization function we pick is just lambda: self, because the object itself is an AOTAutogradCacheEntry. However, this isn't safe, because `wrap_post_compile` will make `self` unserializable, since it needs to load triton kernels and stuff!

So instead, on AOTAutogradCache load, we preserve the bytes that were used to load the object to begin with, and return that object on a call to serialize(). This effectively makes it so that we save a copy of the pre-hydrated artifact, without needing to do an eager copy until someone actually calls `serialize`.

Test Plan:

Run

```py
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 4)
        self.relu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(4, 8)
    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

device = "cuda"
m = M().to(device)
sample_inputs = (torch.randn(2, 2, device=device),)
eager_out = m(*sample_inputs)

with torch._dynamo.config.patch("enable_aot_compile", True):
    compiled_fn_path = "./m.pt"
    compiled_fn = torch.compile(
        m,
        fullgraph=True
    ).forward.aot_compile((sample_inputs, {}))

    compiled_fn.save_compiled_function(compiled_fn_path)
    torch._dynamo.reset()
    with torch.compiler.set_stance("fail_on_recompile"):
        with open(compiled_fn_path, "rb") as f:
            loaded_fn = torch.compiler.load_compiled_function(f)

assert loaded_fn is not None

compiled_out = loaded_fn(m, *sample_inputs)

assert torch.allclose(eager_out, compiled_out)
```

twice, see that it succeeds.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165474
Approved by: https://github.com/yiming0416, https://github.com/zhxchen17
2025-10-17 17:47:24 +00:00
cff1b20771 Patch the flex_attention._get_mod_type to not use inspect.signature when computing num_positional_args (an alternative fix for flex attention graph break on create_block_mask) (#164923)
The initial fix for inspect.signature did not take the right approach (https://github.com/pytorch/pytorch/pull/164349#pullrequestreview-3306614010). As @williamwen42 suggests (https://github.com/pytorch/pytorch/pull/164349#issuecomment-3379222885), we can, for now, just get rid of the `inspect.signature` call in flex_attention to resolve this high-priority issue (https://github.com/pytorch/pytorch/issues/164247#issuecomment-3378673179). In this PR I did exactly that: I limited the scope of the fix to computing `num_positional_args` in `flex_attention._get_mod_type` based on properties returned by `NestedUserFunctionVariable.const_getattr` (some were missing, so I added them).

Fixes #164247

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164923
Approved by: https://github.com/williamwen42
2025-10-17 17:44:45 +00:00
da8517fa63 [ROCm][CI] upgrade wheels to 7.0.2 and 6.4.4 patch release (#165756)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165756
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-10-17 17:41:19 +00:00
45afaf08a1 [DebugMode][2/N] add nn.Module tracking (#165498)
Uses ModTracker to record nn.Module entries, much like CommDebugMode.

Can be switched on with `DebugMode(record_nn_module=True)`:
```
    [nn.Mod] Bar
      [nn.Mod] Bar.abc
        [nn.Mod] Bar.abc.l1
          aten::t(t: f32[4, 4])
          aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])
        [nn.Mod] Bar.abc.l2
          aten::t(t: f32[4, 4])
          aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])
      [nn.Mod] Bar.xyz
        aten::t(t: f32[4, 4])
        aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])"""
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165498
Approved by: https://github.com/SherlockNoMad
ghstack dependencies: #165376
2025-10-17 17:39:48 +00:00
080365b7d8 Escaped html tags name and target to appear as strings (#165543)
Fixes small typo in markdown documentation file - Added escape characters to precede tag pattern.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165543
Approved by: https://github.com/mikaylagawarecki
2025-10-17 17:35:18 +00:00
2928c5c572 Revert "Pyrefly suppressions 2 (#165692)"
This reverts commit 43d78423ac224cce432bf34ed9627035169d5433.

Reverted https://github.com/pytorch/pytorch/pull/165692 on behalf of https://github.com/seemethere due to This is causing merge conflicts when attempting to land internally, see D84890919 for more details ([comment](https://github.com/pytorch/pytorch/pull/165692#issuecomment-3416397240))
2025-10-17 17:13:04 +00:00
630520b346 [dynamo][misc] Replace UserFunctionVariable with VariableTracker build (#165707)
Audit: To prevent future issues with functools.partial or callable
objects.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165707
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #165683, #165706
2025-10-17 17:02:18 +00:00
1dc9a05d03 [dynamo][user_defined] Replace UserFunctionVariable with VariableTracker build (#165706)
Audit: To prevent future issues with functools.partial or callable
objects.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165706
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #165683
2025-10-17 17:02:18 +00:00
bfcdbd0a97 fix wrong accuracy_status when exception. (#165731)
While debugging an `XPU` accuracy issue, I found that the script outputs the wrong accuracy_status.
When the `try` block raises an exception, we should process the exception rather than return `fail_accuracy`.

Before the fix, it returned `fail_accuracy`:
<img width="1109" height="216" alt="image" src="https://github.com/user-attachments/assets/385c354f-fbf6-48e4-a1be-3e37e987341b" />

After the fix, it returns the exception message:
<img width="1101" height="292" alt="image" src="https://github.com/user-attachments/assets/f18c0e3c-8358-4ec7-a6bb-c2e01b69d27f" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165731
Approved by: https://github.com/Stonepia, https://github.com/chuanqi129, https://github.com/Lucaskabela
2025-10-17 16:37:06 +00:00
faff826a46 Revert "[ROCm] new implementation of upsample_bilinear2d_backward (#164572)"
This reverts commit 53f9ae0e50d4dcc47f2ca4bf854803f9d4f875ae.

Reverted https://github.com/pytorch/pytorch/pull/164572 on behalf of https://github.com/seemethere due to Looks like this is failing in our internal builds, will post a suggestion for a fix but want you to double verify that this behavior is correct ([comment](https://github.com/pytorch/pytorch/pull/164572#issuecomment-3416262676))
2025-10-17 16:27:59 +00:00
85c5433d38 Revert "Fix _StridedShard incorrect split (#165533)"
This reverts commit dfc8a1c5ddc8401197e9ab546e03b0f745edc27b.

Reverted https://github.com/pytorch/pytorch/pull/165533 on behalf of https://github.com/seemethere due to Causing a merge conflict internally, see D84829161 ([comment](https://github.com/pytorch/pytorch/pull/165533#issuecomment-3416143176))
2025-10-17 15:57:01 +00:00
935ccdbe75 [MPS] Fix internal assertion in torch.linalg.solve for singular matrices (#165254)
Fixes #163962 by special casing MPS in the negative status code branch in `_linalg_check_errors`.

Checks if info is [`MPSMatrixDecompositionStatus.singular`](https://developer.apple.com/documentation/metalperformanceshaders/mpsmatrixdecompositionstatus/singular) (which has a raw value of -2). I didn't find an official Apple source with this raw value (besides printing the enum value), so I'm not sure if we can (or should) depend on it? Is there a way to directly get the Objective-C enum value in C++?
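
A sketch of the user-visible effect (requires a machine with MPS; the raised error type is an assumption, not confirmed by this PR):

```python
import torch

if torch.backends.mps.is_available():
    A = torch.zeros(2, 2, device="mps")  # singular matrix
    b = torch.ones(2, 1, device="mps")
    try:
        torch.linalg.solve(A, b)
    except Exception as e:  # previously an internal assert; now a regular error
        print(type(e).__name__, e)
```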
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165254
Approved by: https://github.com/malfet
2025-10-17 15:35:49 +00:00
3af2f0c12a [inductor] require shape in TritonCSEVariable (#162275)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275
Approved by: https://github.com/mlazos
ghstack dependencies: #164158
2025-10-17 14:47:45 +00:00
fa7898d389 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-12 21:38:16 -07:00
392ad268f3 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-12 21:38:16 -07:00
2f24de182c Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 13:39:40 -07:00
0fa1bb03d5 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 13:39:40 -07:00
acfcdd4630 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 12:41:31 -07:00
078e124a61 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 12:41:31 -07:00
c4b608a7a6 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:57:05 -07:00
4cde65db1e Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:57:05 -07:00
7b23c1f143 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:55:39 -07:00
c4e78db566 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful in the future if we need to break the BC of any type). #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:55:39 -07:00
5f64a36653 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:54:41 -07:00
861d1b5cc5 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:54:41 -07:00
0f6fa5e93e Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:31:27 -07:00
b1ff91c021 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:31:27 -07:00
c2faf46755 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:28:36 -07:00
253bfd133f Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:28:36 -07:00
fbb539612e Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:21:48 -07:00
f37eb45d37 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:21:47 -07:00
4773945fc2 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:11:40 -07:00
f3c154dd5a Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC, which we won't land, of how a type would actually use this system




[ghstack-poisoned]
2025-10-01 07:11:40 -07:00
2ce847e94a Add scaffolding for StableIValue FC/BC (no PoC)
[ghstack-poisoned]
2025-09-30 21:09:40 -07:00
a6ff7071d2 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-30 20:47:57 -07:00
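Similarly, here is a small hypothetical sketch of the adapter idea in the entry above: a v2 dispatcher call that takes the extension's ABI version and consults a registry of StableIValue-stack to IValue-stack adapters when an operator's schema has changed. The registry, names, and types below are illustrative assumptions, not the actual `aoti_torch_call_dispatcher` code.

```cpp
// Hypothetical sketch of schema-change adapters -- illustrative only.
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using StableIValue = uint64_t;           // stand-in for the ABI-stable wire type
struct IValue { int64_t payload; };      // stand-in for the libtorch IValue

// An adapter rewrites the caller's StableIValue stack (built against an older
// schema) into the IValue stack the current native op expects.
using StackAdapter = std::function<std::vector<IValue>(
    const std::vector<StableIValue>&, uint64_t /*extension_abi_version*/)>;

std::map<std::string, StackAdapter>& adapter_registry() {
  static std::map<std::string, StackAdapter> registry;
  return registry;
}

void register_stack_adapter(const std::string& op_name, StackAdapter adapter) {
  adapter_registry()[op_name] = std::move(adapter);
}

// "v2" entry point: the extension reports the ABI version it was built against.
std::vector<IValue> call_dispatcher_v2(const std::string& op_name,
                                       const std::vector<StableIValue>& stack,
                                       uint64_t extension_abi_version) {
  auto it = adapter_registry().find(op_name);
  if (it != adapter_registry().end()) {
    return it->second(stack, extension_abi_version);  // schema changed: adapt
  }
  std::vector<IValue> out;                            // schema unchanged: 1:1 copy
  for (StableIValue v : stack) out.push_back(IValue{static_cast<int64_t>(v)});
  return out;
}

int main() {
  // Illustrative case: an op grew a trailing argument; extensions built against
  // the old schema (ABI version 1) omit it, so the adapter appends a default.
  register_stack_adapter("aten::demo_op",
      [](const std::vector<StableIValue>& s, uint64_t extension_abi_version) {
        std::vector<IValue> out;
        for (StableIValue v : s) out.push_back(IValue{static_cast<int64_t>(v)});
        if (extension_abi_version < 2) out.push_back(IValue{0});  // default for new arg
        return out;
      });

  auto ivalues = call_dispatcher_v2("aten::demo_op", {1, 2, 3}, /*extension_abi_version=*/1);
  std::cout << "adapted stack size: " << ivalues.size() << "\n";  // 4
  return 0;
}
```

The point being illustrated is that older extensions keep calling with their original stack layout, and the adapter registered alongside the schema change upgrades the stack before it reaches the native op.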
ea12786bd8 Update base for Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-30 20:47:56 -07:00
1d18e6ba52 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-30 09:19:04 -07:00
13c89116d0 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-29 20:49:08 -07:00
9c88079132 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-29 20:22:49 -07:00
eb7d78f707 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
[ghstack-poisoned]
2025-09-23 13:29:19 -07:00
f2ce819026 Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops
[ghstack-poisoned]
2025-09-23 13:28:07 -07:00
1455 changed files with 12935 additions and 34874 deletions

View File

@ -8,8 +8,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi

View File

@ -20,7 +20,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -m pip install cmake==3.18.4 && \
python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake3
RUN rm -rf /usr/local/cuda-*

View File

@ -113,7 +113,6 @@ case "$tag" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INSTALL_MINGW=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
@ -182,7 +181,7 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
@ -345,7 +344,7 @@ docker build \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
@ -362,7 +361,6 @@ docker build \
--build-arg "OPENBLAS=${OPENBLAS:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
--build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -25,7 +25,7 @@ function install_torchbench() {
python install.py --continue_on_fail
echo "Print all dependencies after TorchBench is installed"
python -m pip freeze
python -mpip freeze
popd
chown -R jenkins torchbench

View File

@ -1,10 +0,0 @@
#!/bin/bash
set -ex
# Install MinGW-w64 for Windows cross-compilation
apt-get update
apt-get install -y g++-mingw-w64-x86-64-posix
echo "MinGW-w64 installed successfully"
x86_64-w64-mingw32-g++ --version

View File

@ -8,8 +8,8 @@ MKLROOT=/opt/intel
mkdir -p ${MKLROOT}
pushd /tmp
python3 -m pip install wheel
python3 -m pip download -d . mkl-static==${MKL_VERSION}
python3 -mpip install wheel
python3 -mpip download -d . mkl-static==${MKL_VERSION}
python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}

View File

@ -20,7 +20,7 @@ pip_install \
pip_install coloredlogs packaging
pip_install onnxruntime==1.23.0
pip_install onnxscript==0.5.4
pip_install onnxscript==0.5.3
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -11,5 +11,5 @@ ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
python -m venv /var/lib/jenkins/ci_env
source /var/lib/jenkins/ci_env/bin/activate
python -m pip install --upgrade pip
python -m pip install -r /opt/requirements-ci.txt
python -mpip install --upgrade pip
python -mpip install -r /opt/requirements-ci.txt

View File

@ -14,7 +14,7 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -m pip install cmake==3.18.4 && \
python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake3
FROM base as openssl
@ -135,7 +135,7 @@ RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# cmake-3.18.4 from pip; force in case cmake3 already exists
RUN yum install -y python3-pip && \
python3 -m pip install cmake==3.18.4 && \
python3 -mpip install cmake==3.18.4 && \
ln -sf /usr/local/bin/cmake /usr/bin/cmake3
FROM cpu_final as cuda_final
@ -157,7 +157,7 @@ ENV ROCM_PATH /opt/rocm
# cmake-3.28.4 from pip to get enable_language(HIP)
# and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
RUN python3 -m pip install --upgrade pip && \
python3 -m pip install cmake==3.28.4
python3 -mpip install cmake==3.28.4
# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
@ -174,7 +174,7 @@ FROM cpu_final as xpu_final
ENV XPU_DRIVER_TYPE ROLLING
# cmake-3.28.4 from pip
RUN python3 -m pip install --upgrade pip && \
python3 -m pip install cmake==3.28.4
python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.2
RUN bash ./install_xpu.sh && rm install_xpu.sh

View File

@ -113,7 +113,7 @@ RUN dnf install -y \
RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -m pip install cmake==3.28.0
RUN python3 -mpip install cmake==3.28.0
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

View File

@ -10,6 +10,11 @@ BAD_SSL = "https://self-signed.badssl.com"
print("Testing SSL certificate checking for Python:", sys.version)
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
print("This version never checks SSL certs; skipping tests")
sys.exit(0)
EXC = OSError
print(f"Connecting to {GOOD_SSL} should work")

View File

@ -103,11 +103,6 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
ARG INSTALL_MINGW
COPY ./common/install_mingw.sh install_mingw.sh
RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi
RUN rm install_mingw.sh
ARG TRITON
ARG TRITON_CPU

View File

@ -143,7 +143,7 @@ def sample_vllm_test_library():
"pytest -v -s compile/test_decorator.py",
],
},
"vllm_language_model_test_extended_generation_28_failure_test": {
"vllm_languagde_model_test_extended_generation_28_failure_test": {
"title": "Language Models Test (Extended Generation) 2.8 release failure",
"id": "vllm_languagde_model_test_extended_generation_28_failure_test",
"package_install": [

View File

@ -63,7 +63,7 @@ class VllmBuildParameters:
# DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True"
use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
dockerfile_path: Path = env_path_field(
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile"
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)
# the cleaning script to remove torch dependencies from pip

View File

@ -187,22 +187,19 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
export USE_CUFILE=0
else
DEPS_LIST+=(
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
"/usr/local/cuda/lib64/libcublas.so.12"
"/usr/local/cuda/lib64/libcublasLt.so.12"
"/usr/local/cuda/lib64/libcudart.so.12"
"/usr/local/cuda/lib64/libnvrtc.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
DEPS_SONAME+=(
"libnvToolsExt.so.1"
"libcublas.so.12"
"libcublasLt.so.12"
"libcudart.so.12"
"libnvrtc.so.12"
"libcupti.so.12")
if [[ $CUDA_VERSION != 12.9* ]]; then
DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
DEPS_SONAME+=("libnvToolsExt.so.1")
fi
fi
else
echo "Using nvidia libs from pypi."

View File

@ -233,9 +233,7 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then
export CMAKE_BUILD_TYPE=Debug
elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi
@ -288,7 +286,7 @@ else
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
python -m pip install numpy==2.0.2
python -mpip install numpy==2.0.2
WERROR=1 python setup.py clean
@ -301,11 +299,6 @@ else
python -m build --wheel --no-isolation
fi
pip_install_whl "$(echo dist/*.whl)"
if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then
# Regression test for https://github.com/pytorch/pytorch/issues/164297
# Torch should be importable and that's about it
pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
install_torchvision

View File

@ -67,13 +67,13 @@ function pip_install_whl() {
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -m pip install --no-index --no-deps "$path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -m pip install --no-index --no-deps "$path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
}

View File

@ -182,7 +182,7 @@ checkout_install_torchbench() {
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -m pip freeze
python -mpip freeze
}
torchbench_setup_macos() {
@ -211,7 +211,7 @@ torchbench_setup_macos() {
}
pip_benchmark_deps() {
python -m pip install --no-input requests cython scikit-learn six
python -mpip install --no-input requests cython scikit-learn six
}

View File

@ -337,13 +337,13 @@ test_python() {
test_python_smoke() {
# Smoke tests for H100/B200
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
@ -485,22 +485,6 @@ test_inductor_aoti() {
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
test_inductor_aoti_cross_compile_for_windows() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Set WINDOWS_CUDA_HOME environment variable
WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
export WINDOWS_CUDA_HOME
echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
echo "Contents:"
ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true
python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
}
test_inductor_cpp_wrapper_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@ -916,7 +900,7 @@ test_inductor_set_cpu_affinity(){
export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
if [[ "$(uname -m)" != "aarch64" ]]; then
if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
# Use Intel OpenMP for x86
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
@ -930,7 +914,7 @@ test_inductor_set_cpu_affinity(){
cores=$((cpus / thread_per_core))
# Set number of cores to 16 on aarch64 for performance runs
if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
cores=16
fi
export OMP_NUM_THREADS=$cores
@ -1434,7 +1418,7 @@ EOF
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
python -m pip install base_dist/*.whl
python -mpip install base_dist/*.whl
echo "::endgroup::"
pushd test/forward_backward_compatibility
@ -1631,7 +1615,6 @@ test_operator_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
TEST_DIR=$(pwd)
ARCH=$(uname -m)
test_inductor_set_cpu_affinity
@ -1646,7 +1629,7 @@ test_operator_benchmark() {
pip_install pandas
python check_perf_csv.py \
--actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
--expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
}
test_operator_microbenchmark() {
@ -1683,7 +1666,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
fi
python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
@ -1734,8 +1717,6 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
test_inductor_aoti_cross_compile_for_windows
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))

View File

@ -15,35 +15,37 @@ if errorlevel 1 exit /b 1
if not errorlevel 0 exit /b 1
cd %TMP_DIR_WIN%\build\torch\test
:: Enable delayed variable expansion to make the list
setlocal enabledelayedexpansion
set EXE_LIST=
for /r "." %%a in (*.exe) do (
if "%%~na" == "c10_intrusive_ptr_benchmark" (
@REM NB: This is not a gtest executable file, thus couldn't be handled by
@REM pytest-cpp and is excluded from test discovery by run_test
call "%%~fa"
call :libtorch_check "%%~na" "%%~fa"
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
) else (
if "%%~na" == "verify_api_visibility" (
@REM Skip verify_api_visibility as it is a compile-level test
) else (
set EXE_LIST=!EXE_LIST! cpp/%%~na
)
)
)
goto :eof
:libtorch_check
cd %CWD%
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
:: Run python test\run_test.py on the list
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Skip verify_api_visibility as it a compile level test
if "%~1" == "verify_api_visibility" goto :eof
goto :eof
echo Running "%~2"
if "%~1" == "c10_intrusive_ptr_benchmark" (
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
call "%~2"
goto :eof
)
python test\run_test.py --cpp --verbose -i "cpp/%~1"
if errorlevel 1 (
echo %1 failed with exit code %errorlevel%
goto fail
)
if not errorlevel 0 (
echo %1 failed with exit code %errorlevel%
goto fail
)
:eof
exit /b 0

View File

@ -173,7 +173,7 @@ esac
PINNED_PACKAGES=(
"numpy${NUMPY_PINNED_VERSION}"
)
python -m venv ~/${desired_python}-build
python -mvenv ~/${desired_python}-build
source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

View File

@ -12,7 +12,7 @@ ignore =
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!

View File

@ -8,7 +8,6 @@ assignees: ''
---
> NOTE: Remember to label this issue with "`ci: sev`"
> If you want autorevert to be disabled, keep the ci: disable-autorevert label
<!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open -->

View File

@ -1,7 +1,7 @@
---
name: "D❌\U0001F519 ISABLE AUTOREVERT"
name: DISABLE AUTOREVERT
about: Disables autorevert when open
title: "[DISABLE AUTOREVERT]"
title: "❌​\U0001F519 [DISABLE AUTOREVERT]"
labels: 'ci: disable-autorevert'
assignees: ''

View File

@ -65,7 +65,7 @@ runs:
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=10)"
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
# Split the comma-separated list and build each target

View File

@ -274,6 +274,8 @@ runs:
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

View File

@ -111,16 +111,3 @@ runs:
# This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries.
# The group name corresponding to group ID 1 can change depending on the OS, so both are necessary.
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

View File

@ -33,6 +33,10 @@ runs:
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
# Generate test script

View File

@ -1 +1 @@
1b013f5b5a87a1882eb143c26d79d091150d6a37
87ff22e49ed0e92576c4935ccb8c143daac4a3cd

View File

@ -1 +1 @@
faffd5cf673615583da6517275e361cb3dbc77e6
966da7e46f65d6d49df3e31214470a4fe5cc8e66

View File

@ -1 +1 @@
e5192819208c4d68194844b7dfafbc00020d0dea
0ad9951c416d33c5da4f7a504fb162cbe62386f5

View File

@ -1 +1 @@
0fa6e3129e61143224663e1ec67980d12b7ec4eb
2a9138a26ee257fef05310ad3fecf7c55fe80d73

View File

@ -1,41 +1,59 @@
# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo
# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine,
# by default, it uses the torch-nightly-base stage from this docker image
ARG BUILD_BASE_IMAGE=torch-nightly-base
# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer,
# by default, it uses devel-ubuntu22.04 official image.
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
#################### TORCH NIGHTLY BASE IMAGE ####################
# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG GET_PIP_URL
# Install system dependencies and uv, then create Python virtual environment
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
if command -v apt-get >/dev/null; then \
if [ "$current_gcc_version" -lt 10 ]; then \
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
apt-get update \
&& apt-get install -y gcc-10 g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \
&& update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
else \
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
fi \
fi \
&& gcc --version && g++ --version
# Install uv for faster pip installs
# install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
@ -43,32 +61,36 @@ ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
#################### TORCH NIGHTLY BASE IMAGE ####################
#################### BASE BUILD IMAGE ####################
# A base image for building vLLM with torch nightly or torch wheels
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
USER root
ARG CUDA_VERSION
ARG PYTHON_VERSION
# Only work with PyTorch manylinux builder
# TODO (huydhn): Only work with PyTorch manylinux builder
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
# Install some system dependencies and double check python version
RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git wget sudo vim; \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git wget sudo; \
dnf install -y git curl wget sudo; \
fi \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
if ! python3 -m uv --version >/dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
@ -76,15 +98,15 @@ ENV UV_LINK_MODE=copy
WORKDIR /workspace
# Install build and runtime dependencies
# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# Install build and runtime dependencies without stable torch version
# install build and runtime dependencies without stable torch version
RUN python3 use_existing_torch.py
# Default mount file as placeholder, this just avoid the mount error
# default mount file as placeholder, this just avoid the mount error
# change to a different vllm folder if this does not exist anymore
ARG TORCH_WHEELS_PATH="./requirements"
ARG PINNED_TORCH_VERSION
@ -116,36 +138,56 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# Must put before installing xformers, so it can install the correct version of xfomrers.
ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a'
ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list}
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
git clone https://github.com/facebookresearch/xformers.git
RUN echo ${TORCH_CUDA_ARCH_LIST}
RUN echo ${MAX_JOBS}
RUN pip freeze | grep -E 'ninja'
pushd xformers
git checkout v0.0.32.post2
git submodule update --init --recursive
python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose
popd
# Build xformers with cuda and torch nightly/wheel
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2
ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468
ENV CCACHE_DIR=/root/.cache/ccache
rm -rf xformers
BASH
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl
uv pip install --system xformers-dist/*.whl --verbose
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
# Image used to build vllm wheel
FROM base AS build
ARG TARGETPLATFORM
COPY . .
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
@ -155,17 +197,20 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8
ARG nvcc_threads=4
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Use sccache to speed up compilation
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
@ -190,9 +235,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& sccache --show-stats; \
fi
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
@ -206,10 +248,17 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
fi
RUN echo "[INFO] Listing current directory:" && \
ls -al && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
#################### WHEEL BUILD IMAGE ####################
################### VLLM INSTALLED IMAGE ####################
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
FROM ${FINAL_BASE_IMAGE} AS vllm-base
USER root
@ -217,7 +266,7 @@ ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG GET_PIP_URL
# Only work with PyTorch manylinux builder
# TODO (huydhn): Only work with PyTorch manylinux builder
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
# prepare for environment starts
@ -226,19 +275,20 @@ WORKDIR /workspace
# Install Python and other dependencies
RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git sudo vim python3-pip; \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git wget sudo; \
dnf install -y git curl wget sudo; \
fi \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version
# Get the torch versions, and whls used in previous stage
# Get the torch versions, and whls used in previous stagtes for consistency
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
@ -247,29 +297,33 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Install build and runtime dependencies, this is needed for flashinfer install
COPY requirements/build.txt requirements/build.txt
COPY use_existing_torch.py use_existing_torch.py
RUN python3 use_existing_torch.py
RUN cat requirements/build.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
# Default mount file as placeholder, this just avoid the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision. If TORCH_WHEELS_PATH is default
# to ./requirements, it will pull the nightly versions using pip. Otherwise,
# it will use the local wheels from TORCH_WHEELS_PATH
# Install torch, torchaudio and torchvision
# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
@ -290,14 +344,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build FlashInfer from source
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
@ -309,7 +367,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& cd .. \
&& rm -rf flashinfer
# Install FlashInfer
# install flashinfer python
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system wheels/flashinfer/*.whl --verbose
@ -319,6 +377,49 @@ RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm
################### VLLM INSTALLED IMAGE ####################
#################### UNITTEST IMAGE #############################
FROM vllm-base as test
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
COPY tests/ tests/
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# Install build and runtime dependencies without stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
RUN python3 use_existing_torch.py
# install packages
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
#################### UNITTEST IMAGE #############################
#################### EXPORT STAGE ####################
FROM scratch as export-wheels

View File

@ -3,7 +3,6 @@ ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
@ -16,8 +15,7 @@ ciflow_push_tags:
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-perf-compare
- ciflow/inductor-perf-test-nightly-rocm-mi300
- ciflow/inductor-perf-test-nightly-rocm-mi355
- ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-periodic
- ciflow/inductor-rocm
@ -32,7 +30,6 @@ ciflow_push_tags:
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@ -512,8 +512,6 @@ def perform_misc_tasks(
"keep-going",
branch == MAIN_BRANCH
or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
# Pattern for tags created via manual run on HUD
or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag))
or check_for_setting(labels, pr_body, "keep-going"),
)
set_output(

View File

@ -16,18 +16,16 @@ from typing import Optional
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
CUDA_ARCHES = ["12.6", "12.8", "13.0"]
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
"12.9": "12.9.1",
"13.0": "13.0.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
"12.6": "9",
"12.8": "9",
"12.9": "9",
"13.0": "9",
}
@ -40,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -78,23 +76,6 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
@ -241,11 +222,7 @@ def generate_libtorch_matrix(
arches += CUDA_ARCHES
arches += ROCM_ARCHES
elif os == "windows":
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
# in 2.10
windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")
arches += windows_cuda_arches
arches += CUDA_ARCHES
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -309,11 +286,7 @@ def generate_wheels_matrix(
if os == "linux":
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
# in 2.10
windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")
arches += windows_cuda_arches + XPU_ARCHES
arches += CUDA_ARCHES + XPU_ARCHES
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts
@ -349,7 +322,7 @@ def generate_wheels_matrix(
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["13.0", "12.9", "12.8", "12.6"]
arch_version in ["13.0", "12.8", "12.6"]
and os == "linux"
or arch_version in CUDA_AARCH64_ARCHES
):
@ -413,6 +386,5 @@ def generate_wheels_matrix(
validate_nccl_dep_consistency("13.0")
validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")

View File

@ -24,7 +24,7 @@ change_wheel_version() {
local t_version=$4
# Extract the wheel
${PYTHON_EXECUTABLE} -m wheel unpack $wheel
${PYTHON_EXECUTABLE} -mwheel unpack $wheel
mv "${package}-${f_version}" "${package}-${t_version}"
# Change the version from f_version to t_version in the dist-info dir
@ -47,7 +47,7 @@ change_wheel_version() {
popd
# Repack the wheel
${PYTHON_EXECUTABLE} -m wheel pack "${package}-${t_version}"
${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
# Clean up
rm -rf "${package}-${t_version}"
@ -85,7 +85,7 @@ repackage_wheel() {
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -m pip install wheel==0.45.1
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do

View File

@ -1092,7 +1092,7 @@ class GitHubPR:
editor = node["editor"]
return GitHubComment(
body_text=node["bodyText"],
created_at=node.get("createdAt", ""),
created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"],
author_url=node["author"].get("url", None),
author_association=node["authorAssociation"],
@ -2042,6 +2042,10 @@ def validate_revert(
f"[{', '.join(allowed_reverters)}], but instead is {author_association}."
)
# Raises exception if matching rule is not found, but ignores all status checks
find_matching_merge_rule(
pr, repo, skip_mandatory_checks=True, skip_internal_checks=True
)
commit_sha = get_pr_commit_sha(repo, pr)
return (author_login, commit_sha)

View File

@ -177,9 +177,6 @@ jobs:
runs-on: linux.rocm.gpu.mi250
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm

View File

@ -72,7 +72,7 @@ jobs:
# Let's try to figure out how this can be improved
timeout-minutes: 360
- docs_type: python
runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)

View File

@ -37,7 +37,7 @@ on:
runner:
required: false
type: string
default: "linux.c7i.2xlarge"
default: "linux.2xlarge"
description: |
Label of the runner this job should run on.
test-matrix:

View File

@ -224,46 +224,6 @@ jobs:
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Download Windows torch wheel for cross-compilation
if: matrix.win_torch_wheel_artifact != ''
uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
with:
name: ${{ matrix.win_torch_wheel_artifact }}
path: win-torch-wheel
- name: Extract Windows wheel and setup CUDA libraries
if: matrix.win_torch_wheel_artifact != ''
shell: bash
run: |
set -x
# Find the wheel file
WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1)
if [ -z "$WHEEL_FILE" ]; then
echo "Error: No wheel file found in win-torch-wheel directory"
exit 1
fi
echo "Found wheel file: $WHEEL_FILE"
# Unzip the wheel file
unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted
echo "Extracted wheel contents"
# Setup CUDA libraries (cuda.lib and cudart.lib) directory
mkdir -p win-torch-wheel-extracted/lib/x64
if [ -f "win-torch-wheel/cuda.lib" ]; then
mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/"
fi
if [ -f "win-torch-wheel/cudart.lib" ]; then
mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/"
fi
# Verify CUDA libraries are present
echo "CUDA libraries:"
ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found"
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
@ -429,6 +389,8 @@ jobs:
"${DOCKER_IMAGE}" \
${DOCKER_SHELL_CMD}
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then

View File

@ -211,7 +211,7 @@ jobs:
$tool --version
done
python3 -m pip install --no-index --no-deps dist/*.whl
python3 -mpip install --no-index --no-deps dist/*.whl
set +e
pushd "${RUNNER_TEMP}"
@ -222,7 +222,7 @@ jobs:
popd
if [ "${RC}" -ne 0 ]; then
python3 -m pip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
fi
set -e

View File

@ -102,6 +102,19 @@ jobs:
exit 1
fi
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main

View File

@ -168,31 +168,6 @@ jobs:
run: |
.ci/pytorch/win-build.sh
# Collect Windows torch libs and CUDA libs for cross-compilation
- name: Collect Windows CUDA libs for cross-compilation
if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu'
shell: bash
run: |
set -ex
# Create directory structure if does not exist
mkdir -p /c/${{ github.run_id }}/build-results
# Copy CUDA libs
CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}"
if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/
fi
if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/
fi
# List collected files
echo "Collected CUDA libs:"
ls -lah /c/${{ github.run_id }}/build-results/*.lib
# Upload to github so that people can click and download artifacts
- name: Upload artifacts to s3
if: steps.build.outcome != 'skipped'

View File

@ -204,7 +204,7 @@ jobs:
run: |
pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
# shellcheck disable=SC2046,SC2102
python3 -m pip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
popd
.ci/pytorch/win-test.sh

View File

@ -1,62 +0,0 @@
name: CI for distributed tests on B200
on:
pull_request:
paths:
- .github/workflows/b200-distributed.yml
workflow_dispatch:
push:
tags:
- ciflow/b200-distributed/*
schedule:
- cron: 46 8 * * * # about 1:46am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
with:
timeout-minutes: 1200
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@@ -46,12 +46,10 @@ jobs:
fail-fast: false
matrix:
include: [
{ name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },

View File

@@ -27,8 +27,9 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129', 'cu130' ]
device: [ 'cu128', 'cu129' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
@@ -38,10 +39,6 @@ jobs:
device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu130
manylinux-image: 'pytorch/manylinux2_28-builder:cuda13.0'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
@@ -50,11 +47,6 @@ jobs:
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
exclude:
# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
# xformers is updated to support 13.0
- platform: manylinux_2_28_aarch64
device: cu130
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
@@ -126,13 +118,13 @@ jobs:
"${MANYLINUX_IMAGE}"
)
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install \
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
# I wonder if there is a command to both download and install the wheels
# in one go
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip download \
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
@@ -177,12 +169,7 @@ jobs:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129', 'cu130' ]
exclude:
# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
# xformers is updated to support 13.0
- platform: manylinux_2_28_aarch64
device: cu130
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}

View File

@@ -204,52 +204,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -453,52 +407,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -702,52 +610,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -951,52 +813,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1200,52 +1016,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1449,52 +1219,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1698,52 +1422,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@@ -248,74 +248,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-cuda12_9-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
build_environment: linux-binary-libtorch
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -426,9 +358,6 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -544,9 +473,6 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm

View File

@@ -241,72 +241,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -413,9 +347,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -528,9 +459,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.10"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -907,72 +835,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1079,9 +941,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.11"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -1194,9 +1053,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.11"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -1573,72 +1429,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1745,9 +1535,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.12"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -1860,9 +1647,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.12"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -2239,72 +2023,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -2411,9 +2129,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.13"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -2526,9 +2241,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.13"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -2905,72 +2617,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -3077,9 +2723,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.13t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -3192,9 +2835,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.13t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -3571,72 +3211,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -3743,9 +3317,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.14"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -3858,9 +3429,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.14"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@@ -4237,72 +3805,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -4409,9 +3911,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.14t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -4524,9 +4023,6 @@ jobs:
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
DESIRED_PYTHON: "3.14t"
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm

@ -106,7 +106,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -216,7 +216,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -326,7 +326,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -436,7 +436,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -546,7 +546,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -656,7 +656,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -766,7 +766,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -m venv test_venv
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -37,7 +37,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.c7i.12xlarge"
runner: "linux.12xlarge"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

@ -130,7 +130,7 @@ jobs:
name: test-periodically
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0 * * 1-6'
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true

@ -1,132 +0,0 @@
name: inductor-perf-nightly-rocm-mi300
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-rocm-mi300/*
schedule:
- cron: 15 0 * * *
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_rocm_mi300,inductor_timm_perf_rocm_mi300,inductor_torchbench_perf_rocm_mi300
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
linux-jammy-rocm-py3_10-inductor-benchmark-build:
if: github.repository_owner == 'pytorch'
name: rocm-py3_10-inductor-benchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
name: rocm-py3_10-inductor-benchmark-test
uses: ./.github/workflows/_rocm-test.yml
needs: linux-jammy-rocm-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-rocm-py3_10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit

@ -1,11 +1,11 @@
name: inductor-perf-nightly-rocm-mi355
name: inductor-perf-nightly-rocm
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-rocm-mi355/*
- ciflow/inductor-perf-test-nightly-rocm/*
schedule:
- cron: 15 0 * * *
- cron: 0 7 * * 0,3
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
@ -59,7 +59,7 @@ on:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_rocm_mi355,inductor_timm_perf_rocm_mi355,inductor_torchbench_perf_rocm_mi355
default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -88,27 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit

@ -12,7 +12,6 @@ on:
- landchecks/*
tags:
- ciflow/pull/*
- ciflow/trunk/*
workflow_dispatch:
permissions: read-all
@ -33,12 +32,10 @@ jobs:
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') || github.event_name == 'push' }}
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
# Needed to prevent deduping on HUD
name: lintrunner-clang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to clangtidy / clangformat
if: |
@ -78,7 +75,6 @@ jobs:
# fails to find types when it should
lintrunner-mypy:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
name: lintrunner-mypy-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to mypy
if: |
@ -103,7 +99,6 @@ jobs:
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
name: lintrunner-noclang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files]
with:
timeout: 120
@ -118,9 +113,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:

@ -7,11 +7,9 @@ on:
workflow_dispatch:
inputs:
test_mode:
type: choice
options:
- 'short'
- 'long'
- 'all'
required: false
type: string
default: 'short'
description: tag filter for operator benchmarks, options from long, short, all
schedule:
# Run at 07:00 UTC every Sunday
@ -30,49 +28,38 @@ permissions:
contents: read
jobs:
x86-opbenchmark-build:
opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: x86-opbenchmark-build
name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode || 'short' }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
x86-opbenchmark-test:
name: x86-opbenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: x86-opbenchmark-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }}
secrets: inherit
aarch64-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: aarch64-opbenchmark-build
opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-aarch64-py3.10
runner: linux.arm64.m7g.4xlarge
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
aarch64-opbenchmark-test:
name: aarch64-opbenchmark-test
opbenchmark-test:
name: opbenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: aarch64-opbenchmark-build
needs: opbenchmark-build
with:
build-environment: linux-jammy-aarch64-py3.10
docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit

@ -182,11 +182,11 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit

@ -127,7 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

@ -1,9 +1,6 @@
name: rocm-mi355
on:
push:
tags:
- ciflow/rocm-mi355/*
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
@ -45,12 +42,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
]}
secrets: inherit
@ -67,7 +64,5 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi355
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda'
|| '' }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit

@ -140,7 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

@ -56,7 +56,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
build-generates-artifacts: false
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.c7i.4xlarge"
runner: "linux.4xlarge"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -180,13 +180,13 @@ jobs:
disable-monitor: false
secrets: inherit
win-vs2022-cuda12_8-py3-build:
name: win-vs2022-cuda12.8-py3
win-vs2022-cuda12_6-py3-build:
name: win-vs2022-cuda12.6-py3
uses: ./.github/workflows/_win-build.yml
needs: get-label-type
with:
build-environment: win-vs2022-cuda12.8-py3
cuda-version: "12.8"
build-environment: win-vs2022-cuda12.6-py3
cuda-version: "12.6"
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
@ -200,23 +200,6 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build
- get-label-type
- win-vs2022-cuda12_8-py3-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
]}
secrets: inherit
verify-cachebench-cpu-build:
name: verify-cachebench-cpu-build
uses: ./.github/workflows/_linux-build.yml
@ -266,14 +249,3 @@ jobs:
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-gcc11-full-debug-build-only:
name: linux-jammy-py3.10-gcc11-full-debug-build-only
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.2xlarge.memory
build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
secrets: inherit

@ -46,7 +46,7 @@ jobs:
runner: linux.24xlarge.memory
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
@ -54,7 +54,7 @@ jobs:
{ config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"},
{ config: "vllm_language_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"},
{ config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"},
{ config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },

@ -35,7 +35,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-1-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
runner: linux.c7i.12xlarge
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
@ -56,7 +56,7 @@ jobs:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.c7i.12xlarge
runner: linux.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },

.gitignore (vendored)

@ -395,4 +395,3 @@ android/pytorch_android_torchvision/.cxx
CLAUDE.local.md
/test_*.py
/debug_*.py
CLAUDE_CONTEXT/

@ -209,46 +209,6 @@ command = [
'@{{PATHSFILE}}'
]
[[linter]]
code = 'PYREFLY'
include_patterns = [
'torch/**/*.py',
'torch/**/*.pyi',
'torchgen/**/*.py',
'torchgen/**/*.pyi',
'functorch/**/*.py',
'functorch/**/*.pyi',
]
exclude_patterns = []
command = [
'python3',
'tools/linter/adapters/pyrefly_linter.py',
'--config=pyrefly.toml',
]
init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'pyrefly==0.36.2',
'sympy==1.13.3',
'types-requests==2.27.25',
'types-pyyaml==6.0.2',
'types-tabulate==0.8.8',
'types-protobuf==5.29.1.20250403',
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.18.0',
'junitparser==2.1.1',
'rich==14.1.0',
'optree==0.17.0',
'types-openpyxl==3.1.5.20250919',
'types-python-dateutil==2.9.0.20251008'
]
[[linter]]
code = 'CLANGTIDY'
include_patterns = [

@ -388,9 +388,9 @@ cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
# on Windows and AArch64.
# on Windows.
option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
if(WIN32 OR (CPU_AARCH64 AND NOT APPLE))
if(WIN32)
set(USE_MIMALLOC ON)
# Not enable USE_MIMALLOC_ON_MKL due to it caused issue:

@ -39,7 +39,7 @@ RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/python -m pip install -r requirements.txt && \
/opt/conda/bin/python -mpip install -r requirements.txt && \
/opt/conda/bin/conda clean -ya
FROM dev-base as submodule-update

@ -28,19 +28,4 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
return stream << BlasBackendToString(backend);
}
namespace blas {
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
} // namespace blas
} // namespace at

@ -256,7 +256,6 @@ endif()
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_CUDA)
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
@ -293,64 +292,58 @@ IF(USE_FBGEMM_GENAI)
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
)
target_include_directories(fbgemm_genai PRIVATE
target_include_directories(fbgemm_genai PUBLIC
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${fbgemm_genai_mx8mx8bf16_grouped}
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
else()
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add FBGEMM_GENAI include directories for torch_ops.h
list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
elseif(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PRIVATE
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
# Add FBGEMM_GENAI include directories for torch_ops.h
list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
endif()
endif()
@ -699,6 +692,12 @@ if(USE_CUDA AND NOT USE_ROCM)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
# Add FBGEMM_GENAI include directories for torch_ops.h
if(USE_FBGEMM_GENAI)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
endif()
if($ENV{ATEN_STATIC_CUDA})
if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS

@ -226,15 +226,15 @@ class TORCH_API Context {
bool userEnabledMkldnn() const;
void setUserEnabledMkldnn(bool e);
bool benchmarkCuDNN() const;
void setBenchmarkCuDNN(bool /*b*/);
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int /*b*/);
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool /*b*/);
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool /*b*/);
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
void setDeterministicMkldnn(bool /*b*/);
void setDeterministicMkldnn(bool);
bool userEnabledNNPACK() const;
void setUserEnabledNNPACK(bool e);
@ -252,32 +252,32 @@ class TORCH_API Context {
void setSDPPriorityOrder(const std::vector<int64_t>& order);
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
void setSDPUseFlash(bool /*e*/);
void setSDPUseFlash(bool);
bool userEnabledFlashSDP() const;
void setSDPUseMemEfficient(bool /*e*/);
void setSDPUseMemEfficient(bool);
bool userEnabledMemEfficientSDP() const;
void setSDPUseMath(bool /*e*/);
void setSDPUseMath(bool);
bool userEnabledMathSDP() const;
void setSDPUseCuDNN(bool /*e*/);
void setSDPUseCuDNN(bool);
bool userEnabledCuDNNSDP() const;
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
void setAllowFP16BF16ReductionMathSDP(bool);
bool allowFP16BF16ReductionMathSDP() const;
void setSDPUseOverrideable(bool /*e*/);
void setSDPUseOverrideable(bool);
bool userEnabledOverrideableSDP() const;
at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
void setLinalgPreferredBackend(at::LinalgBackend);
at::BlasBackend blasPreferredBackend();
void setBlasPreferredBackend(at::BlasBackend /*b*/);
void setBlasPreferredBackend(at::BlasBackend);
at::ROCmFABackend getROCmFAPreferredBackend();
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
void setROCmFAPreferredBackend(at::ROCmFABackend);
// Note [Enabling Deterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -310,9 +310,9 @@ class TORCH_API Context {
bool deterministicAlgorithms() const;
bool deterministicAlgorithmsWarnOnly() const;
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
void setDeterministicAlgorithms(bool, bool);
bool deterministicFillUninitializedMemory() const;
void setDeterministicFillUninitializedMemory(bool /*b*/);
void setDeterministicFillUninitializedMemory(bool);
// Note [Writing Nondeterministic Operations]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -356,11 +356,11 @@ class TORCH_API Context {
Float32Op op,
Float32Precision p);
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
void setAllowTF32CuDNN(bool /*b*/);
void setAllowTF32CuDNN(bool);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool /*b*/);
void setAllowTF32OneDNN(bool);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool /*b*/);
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
@ -372,7 +372,7 @@ class TORCH_API Context {
bool allow_reduced_precision,
bool allow_splitk = true);
bool allowFP16AccumulationCuBLAS() const;
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
void setAllowFP16AccumulationCuBLAS(bool);
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
// block for each SM on the GPU, and each block then iterates over multiple
@ -384,7 +384,7 @@ class TORCH_API Context {
// to make matmuls target only a subset of the SMs, so they can fully schedule
// even next to a comms kernel, and only be a few percent slower.
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
at::QEngine qEngine() const;
void setQEngine(at::QEngine e);
@ -405,7 +405,7 @@ class TORCH_API Context {
void setDefaultMobileCPUAllocator();
void unsetDefaultMobileCPUAllocator();
bool allowFP16ReductionCPU() const;
void setAllowFP16ReductionCPU(bool /*b*/);
void setAllowFP16ReductionCPU(bool);
// Preserved for BC
void lazyInitCUDA() {

@ -389,16 +389,37 @@ void fillVersion<DLManagedTensorVersioned>(
// constructed out of ATen tensor
template <class T>
T* toDLPackImpl(const Tensor& src) {
auto view = src;
// Detect whether there is need to normalize the strides
// Background: gh-83069
//
// However, normalizing strides can come at a high-cost
// to slow down toDLPack conversion 3x, so we
// only normalize if needed.
//
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
atDLMTensor->handle = src;
atDLMTensor->handle = view;
atDLMTensor->tensor.manager_ctx = atDLMTensor;
atDLMTensor->tensor.deleter = &deleter<T>;
atDLMTensor->tensor.dl_tensor.data = src.data_ptr();
atDLMTensor->tensor.dl_tensor.data = view.data_ptr();
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(src.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(src.strides().data());
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);

@ -52,16 +52,16 @@ struct DLPackTraits {};
template <>
struct DLPackTraits<DLManagedTensor> {
inline static constexpr const char* capsule = "dltensor";
inline static constexpr const char* used = "used_dltensor";
inline static const char* capsule = "dltensor";
inline static const char* used = "used_dltensor";
inline static auto toDLPack = at::toDLPack;
inline static auto fromDLPack = at::fromDLPack;
};
template <>
struct DLPackTraits<DLManagedTensorVersioned> {
inline static constexpr const char* capsule = "dltensor_versioned";
inline static constexpr const char* used = "used_dltensor_versioned";
inline static const char* capsule = "dltensor_versioned";
inline static const char* used = "used_dltensor_versioned";
inline static auto toDLPack = at::toDLPackVersioned;
inline static auto fromDLPack = at::fromDLPackVersioned;
};

@ -16,8 +16,8 @@ inline void check_size_nonnegative(ArrayRef<int64_t> size) {
inline void check_size_nonnegative(ArrayRef<c10::SymInt> size) {
for (const auto& x : size) {
TORCH_SYM_CHECK(
x.sym_ge(0),
TORCH_CHECK(
x.expect_size(__FILE__, __LINE__),
"Trying to create tensor with negative dimension ",
x,
": ",

@ -4,7 +4,6 @@
#include <c10/core/ScalarType.h>
#include <c10/core/SymIntArrayRef.h>
#include <c10/util/DimVector.h>
#include <c10/util/Exception.h>
#include <optional>
#include <sstream>
#include <vector>
@ -27,7 +26,9 @@ inline void infer_size_impl(
std::optional<int64_t> infer_dim;
for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) {
TORCH_CHECK(!infer_dim, "only one dimension can be inferred");
if (infer_dim) {
throw std::runtime_error("only one dimension can be inferred");
}
infer_dim = dim;
} else {
// in case of unbacked shape[dim] we assume it's not -1 and add a runtime

@ -58,7 +58,7 @@ namespace at {
namespace{
// PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor.
bool is_allowed_dim_on_scalar_tensor(int64_t dim) {
static bool is_allowed_dim_on_scalar_tensor(int64_t dim) {
return dim == 0 || dim == -1;
}
@ -365,7 +365,7 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) {
return self_physical.getPhysicalToLogicalMap().apply(result);
}
int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) {
static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) {
return maybe_wrap_dim(dim, static_cast<int64_t>(input_sizes.size())) + num_batch_dims;
}
@ -488,7 +488,7 @@ Tensor view_as_complex_batching_rule(const Tensor& self) {
// Checks that the smallest batch stride is greater than the largest example
// stride. This is something we can support but we choose not to because it's
// potentially error prone.
void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) {
static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) {
auto smallest_batch_stride = std::min_element(
physical_strides.begin(), physical_strides.begin() + num_batch_dims);
auto largest_example_stride = std::max_element(
@ -508,7 +508,7 @@ void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_bat
// given (sizes, strides, storage_offset) returns the maximum location that
// can be indexed (or nullopt if such a location doesn't exist, e.g., tensors
// with zero-size dims).
std::optional<int64_t> maximum_indexable_location(
static std::optional<int64_t> maximum_indexable_location(
IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) {
auto result = native::storage_size_for(sizes, strides);
if (result == 0) {
@ -521,7 +521,7 @@ std::optional<int64_t> maximum_indexable_location(
// This checks that the range of possible memory locations accessible by
// x.as_strided(sizes, strides, maybe_storage_offset)
// are within the bounds of possible memory locations accessible by x.
void checkBasicAsStridedValidForSlice(
static void checkBasicAsStridedValidForSlice(
const Tensor& physical_tensor,
int64_t num_batch_dims,
IntArrayRef sizes,

@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified";
#endif
} // namespace (anonymous)
MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size)
MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size)
: filename_(filename.empty() ? unknown_filename : filename)
, size_(0) // to be filled later
#ifdef _WIN32
@ -494,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags,
initializeAlloc();
}
RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size)
RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size)
: RefcountedMapAllocatorArgCheck(flags)
, MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) {
@ -614,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
}
at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size);
if (actual_size_out) *actual_size_out = context->size();
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
@ -626,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags,
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
}
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size);
if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment;
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};

@ -25,7 +25,7 @@ class TORCH_API MapAllocator {
public:
MapAllocator(std::string_view filename, int flags, size_t size);
MapAllocator(
WithFd /*unused*/,
WithFd,
std::string_view filename,
int fd,
int flags,
@ -59,14 +59,14 @@ class TORCH_API MapAllocator {
return flags_;
}
static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
static MapAllocator* fromDataPtr(const at::DataPtr&);
static at::DataPtr makeDataPtr(
std::string_view filename,
int flags,
size_t size,
size_t* actual_size_out);
static at::DataPtr makeDataPtr(
WithFd /*unused*/,
WithFd,
const char* filename,
int fd,
int flags,
@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
public:
RefcountedMapAllocator(const char* filename, int flags, size_t size);
RefcountedMapAllocator(
WithFd /*unused*/,
WithFd,
const char* filename,
int fd,
int flags,
size_t size);
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
size_t size,
size_t* actual_size_out);
static at::DataPtr makeDataPtr(
WithFd /*unused*/,
WithFd,
const char* filename,
int fd,
int flags,

@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const {
return NestedTensorImpl::numel_custom();
}
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
return nested_tensor_impl_is_contiguous(this);
}
IntArrayRef NestedTensorImpl::sizes_custom() const {

@ -115,8 +115,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
// with real implementations
int64_t numel_custom() const override;
c10::SymInt sym_numel_custom() const override;
c10::SymBool sym_is_contiguous_custom(
MemoryFormat /*memory_format*/) const override;
c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override;
int64_t size_custom(int64_t d) const override {
return this->size(d);
}

@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) {
TORCH_API void init_num_threads();
// Sets the number of threads to be used in parallel region
TORCH_API void set_num_threads(int /*nthreads*/);
TORCH_API void set_num_threads(int);
// Returns the maximum number of threads that may be used in a parallel region
TORCH_API int get_num_threads();
@ -37,7 +37,7 @@ inline void lazy_init_num_threads() {
}
}
TORCH_API void set_thread_num(int /*id*/);
TORCH_API void set_thread_num(int);
class TORCH_API ThreadIdGuard {
public:
@ -130,7 +130,7 @@ inline scalar_t parallel_reduce(
TORCH_API std::string get_parallel_info();
// Sets number of threads used for inter-op parallelism
TORCH_API void set_num_interop_threads(int /*nthreads*/);
TORCH_API void set_num_interop_threads(int);
// Returns the number of threads used for inter-op parallelism
TORCH_API size_t get_num_interop_threads();

@ -42,14 +42,8 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() {
}
bool torch_function_mode_enabled() {
// Manually flatten because gcc is refusing to inline here. Note
// that we are still calling __tls_get_addr twice here with GCC,
// presumably because of
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81501 (which says
// the fix ships in GCC 16), but forcing inlining still improves
// performance.
const auto& ptfs = pythonTorchFunctionState;
return ptfs.disabled_state_ != TorchFunctionDisabledState::ALL_DISABLED && !ptfs.stack_.empty();
return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED &&
PythonTorchFunctionTLS::stack_len() > 0;
}
// This is needed to disambiguate the ternary torch function disabled states

@ -27,7 +27,6 @@ struct TORCH_API PythonTorchFunctionTLS {
TorchFunctionDisabledState disabled_state_ =
TorchFunctionDisabledState::ENABLED;
std::vector<std::shared_ptr<c10::SafePyObject>> stack_;
friend TORCH_API bool torch_function_mode_enabled();
};
TORCH_API bool torch_function_mode_enabled();

@ -13,7 +13,7 @@ namespace {
// and left at true for the rest of the execution.
// It's an optimization so that users who never use default hooks don't need to
// read the thread_local variables pack_hook_ and unpack_hook_.
bool is_initialized(false);
static bool is_initialized(false);
}
static void assertSavedTensorHooksNotDisabled() {

@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) {
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset.");
}
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous");
}
} // namespace at

@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
public:
explicit SparseCsrTensorImpl(
at::DispatchKeySet /*key_set*/,
at::DispatchKeySet,
at::Device device,
Layout layout,
const caffe2::TypeMeta /*data_type*/);
const caffe2::TypeMeta);
void resize_(int64_t nnz, IntArrayRef size);
void resize_and_clear_(
@ -86,8 +86,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
protected:
IntArrayRef strides_custom() const override;
SymIntArrayRef sym_strides_custom() const override;
SymBool sym_is_contiguous_custom(
MemoryFormat /*memory_format*/) const override;
SymBool sym_is_contiguous_custom(MemoryFormat) const override;
public:
void set_size(int64_t dim, int64_t new_size) override;

@ -46,9 +46,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
public:
// Public for now...
explicit SparseTensorImpl(
at::DispatchKeySet /*key_set*/,
const caffe2::TypeMeta /*data_type*/);
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
void release_resources() override;
@ -386,8 +384,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
private:
explicit SparseTensorImpl(
at::DispatchKeySet /*key_set*/,
const caffe2::TypeMeta /*data_type*/,
at::DispatchKeySet,
const caffe2::TypeMeta,
at::Tensor indices,
at::Tensor values);

@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
struct TORCH_API TensorIndex final {
// Case 1: `at::indexing::None`
TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {}
TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}
// Case 2: "..." / `at::indexing::Ellipsis`
TensorIndex(at::indexing::EllipsisIndexType /*unused*/)
TensorIndex(at::indexing::EllipsisIndexType)
: type_(TensorIndexType::Ellipsis) {}
TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
TORCH_CHECK_VALUE(

@ -56,7 +56,7 @@ inline void get_strides(int64_t* strides, ArrayRef<OperandInfo> operands, int64_
}
}
OptionalTensorRef make_otr(const TensorBase &tensor) {
static OptionalTensorRef make_otr(const TensorBase &tensor) {
if (tensor.defined()) {
return OptionalTensorRef(tensor);
} else {

@@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
   using PtrVector = SmallVector<char*, 4>;
   using StrideVector = SmallVector<int64_t, 6>;
-  void build(TensorIteratorConfig& /*config*/);
+  void build(TensorIteratorConfig&);
   // The inner-loop function operates on the fastest moving dimension. It
   // implements element-wise operations in terms of 1-d strided tensors.
@@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
 #undef TORCH_DISALLOW_TEMPORARIES
  protected:
   // Mutable reference as it moves tensors out of TensorIteratorConfig
-  void populate_operands(TensorIteratorConfig& /*config*/);
+  void populate_operands(TensorIteratorConfig&);
   void mark_outputs();
-  void mark_resize_outputs(const TensorIteratorConfig& /*config*/);
-  void compute_mem_overlaps(const TensorIteratorConfig& /*config*/);
-  void compute_shape(const TensorIteratorConfig& /*config*/);
-  void compute_strides(const TensorIteratorConfig& /*config*/);
+  void mark_resize_outputs(const TensorIteratorConfig&);
+  void compute_mem_overlaps(const TensorIteratorConfig&);
+  void compute_shape(const TensorIteratorConfig&);
+  void compute_strides(const TensorIteratorConfig&);
   void reorder_dimensions();
   void permute_dimensions(IntArrayRef perm);
-  void compute_types(const TensorIteratorConfig& /*config*/);
+  void compute_types(const TensorIteratorConfig&);
   ScalarType compute_common_dtype();
   void allocate_or_resize_outputs();
-  bool fast_set_up(const TensorIteratorConfig& /*config*/);
-  FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/);
-  void compute_names(const TensorIteratorConfig& /*config*/);
+  bool fast_set_up(const TensorIteratorConfig&);
+  FastSetupType compute_fast_setup_type(const TensorIteratorConfig&);
+  void compute_names(const TensorIteratorConfig&);
   void propagate_names_to_outputs();
   void coalesce_dimensions();
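The comment on build() in the first hunk above summarizes the TensorIterator contract: inner loops only ever see the fastest-moving dimension as raw byte pointers plus per-operand strides. A self-contained sketch of one such 1-d strided element-wise loop (illustrative only, not the real TensorIterator machinery):

#include <cstdint>
#include <iostream>
#include <vector>

// One "inner loop" in the TensorIterator sense: data[0] is the output, the
// rest are inputs; strides are in bytes; n is the extent of the fastest
// moving dimension.
static void add_loop_1d(char** data, const int64_t* strides, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    float* out = reinterpret_cast<float*>(data[0] + i * strides[0]);
    const float* a = reinterpret_cast<const float*>(data[1] + i * strides[1]);
    const float* b = reinterpret_cast<const float*>(data[2] + i * strides[2]);
    *out = *a + *b;
  }
}

int main() {
  std::vector<float> a{1.f, 2.f, 3.f, 4.f}, b{10.f, 20.f, 30.f, 40.f}, out(4, 0.f);
  char* data[3] = {reinterpret_cast<char*>(out.data()),
                   reinterpret_cast<char*>(a.data()),
                   reinterpret_cast<char*>(b.data())};
  // Contiguous float operands: every pointer advances by sizeof(float).
  int64_t strides[3] = {sizeof(float), sizeof(float), sizeof(float)};
  add_loop_1d(data, strides, 4);
  std::cout << out[2] << "\n";  // 33
}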


@@ -20,7 +20,7 @@
 namespace at {
-TORCH_API int _crash_if_asan(int /*arg*/);
+TORCH_API int _crash_if_asan(int);
 // Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
 // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.


@@ -103,7 +103,9 @@ std::string get_cpu_capability() {
 #elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
     case native::CPUCapability::ZVECTOR:
       return "Z VECTOR";
-#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
+#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
+    case native::CPUCapability::SVE128:
+      return "SVE128";
     case native::CPUCapability::SVE256:
       return "SVE256";
 #else


@@ -36,7 +36,7 @@ namespace {
 using weakref_type = c10::weak_intrusive_ptr<TensorImpl, UndefinedTensorImpl>;
 using val_type = std::tuple<weakref_type, Tensor>;
-ska::flat_hash_map<TensorImpl*, val_type>& get_cached_casts() {
+static ska::flat_hash_map<TensorImpl*, val_type>& get_cached_casts() {
   static ska::flat_hash_map<TensorImpl*, val_type> cached_casts;
   return cached_casts;
 }
@@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
 Banned functions
 *******************************/
-static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional<Tensor>& /*unused*/, int64_t /*unused*/) {
+static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
   TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
     "Many models use a sigmoid layer right before the binary cross entropy layer.\n"
     "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"


@@ -44,3 +44,13 @@ using c10::SymIntList;
 using c10::TensorOptions;
 } // namespace at
+// Forward declarations for dummy types used in operator signatures
+namespace dummy_types {
+namespace v2_8 {
+struct Dummy;
+}
+inline namespace v2_9 {
+struct Dummy;
+}
+} // namespace dummy_types
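The forward declarations added above rely on inline namespaces for versioning: an unqualified dummy_types::Dummy resolves to the inline (current) version, while older versions remain reachable by explicit qualification. A self-contained sketch of the mechanism (the struct bodies here are invented for illustration):

#include <iostream>

namespace dummy_types {
namespace v2_8 {
struct Dummy {
  int id;  // hypothetical layout of the older version
};
} // namespace v2_8
inline namespace v2_9 {
struct Dummy {
  int id;
  int extra;  // hypothetical field added in the newer version
};
} // namespace v2_9
} // namespace dummy_types

int main() {
  dummy_types::Dummy d{1, 2};         // picks v2_9 via the inline namespace
  dummy_types::v2_8::Dummy old_d{1};  // the old type is still spellable
  std::cout << sizeof(d) << " vs " << sizeof(old_d) << "\n";
}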


@@ -6,9 +6,9 @@ namespace at {
 namespace {
-std::array<HostAllocator*, at::COMPILE_TIME_MAX_DEVICE_TYPES>
+static std::array<HostAllocator*, at::COMPILE_TIME_MAX_DEVICE_TYPES>
     allocator_array{};
-std::array<uint8_t, at::COMPILE_TIME_MAX_DEVICE_TYPES>
+static std::array<uint8_t, at::COMPILE_TIME_MAX_DEVICE_TYPES>
     allocator_priority{};
 } // anonymous namespace
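Several hunks in this diff (make_otr, get_cached_casts, and the allocator arrays above) add static to symbols that are already file-local. A standalone illustration of the two ways to get internal linkage in C++ (all names invented); inside an unnamed namespace the extra static is redundant for linkage but harmless:

#include <array>

// Internal linkage via `static`: this definition is private to the
// translation unit and cannot collide with a same-named symbol elsewhere.
static std::array<int, 4> registry{};

// Internal linkage via an unnamed namespace: everything declared inside is
// likewise TU-local, with or without an additional `static`.
namespace {
static int counter = 0;

int bump() {  // internal linkage even without `static`
  return ++counter;
}
} // namespace

int use_them() {
  registry[0] = bump();
  return registry[0];
}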


@@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
     HasNonWildcard
   };
-  explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names)
+  explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
       : names_(names.vec()) {
     check_invariants();
   }
-  explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& names)
+  explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
       : names_(std::move(names)) {
     check_invariants();
   }
@@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
         std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
   }
-  void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) {
+  void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
     TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
     std::copy(new_names.begin(), new_names.end(), names_.begin());
     check_invariants();
   }
-  void set_names(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& new_names) {
+  void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
     TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
     names_ = std::move(new_names);
     check_invariants();


@@ -229,10 +229,10 @@ private:
   }
-  static constexpr uint32_t kPhilox10A = 0x9E3779B9;
-  static constexpr uint32_t kPhilox10B = 0xBB67AE85;
-  static constexpr uint32_t kPhiloxSA = 0xD2511F53;
-  static constexpr uint32_t kPhiloxSB = 0xCD9E8D57;
+  static const uint32_t kPhilox10A = 0x9E3779B9;
+  static const uint32_t kPhilox10B = 0xBB67AE85;
+  static const uint32_t kPhiloxSA = 0xD2511F53;
+  static const uint32_t kPhiloxSB = 0xCD9E8D57;
 };
 typedef philox_engine Philox4_32;
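For context on the const/constexpr pair above: both forms give a class-scope integral constant usable in constant expressions, but since C++17 a static constexpr data member is implicitly inline and never needs an out-of-class definition, whereas an odr-used static const member traditionally does. A small illustration with an invented class:

#include <array>
#include <cstdint>

struct Engine {
  // Implicitly inline since C++17: no separate definition required even if
  // the member is odr-used.
  static constexpr uint32_t kA = 0x9E3779B9;

  // Also usable as a compile-time constant when initialized in-class, but
  // odr-use (e.g. binding a const reference) classically requires an
  // out-of-class definition as well.
  static const uint32_t kB = 0xBB67AE85;
};

// Both qualify as constant expressions for an array bound.
std::array<int, Engine::kA % 7> a{};
std::array<int, Engine::kB % 7> b{};

int main() {
  return static_cast<int>(a.size() + b.size());
}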


@@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final {
  public:
   // Returns true if you successfully registered yourself (that means
   // you are in the hot seat for doing the operator registrations!)
-  static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/);
+  static bool registerInterpreter(c10::impl::PyInterpreter*);
   // Returns nullptr if no interpreter has been registered yet.
   static c10::impl::PyInterpreter* getInterpreter();

Some files were not shown because too many files have changed in this diff.