Compare commits

..

191 Commits

Author SHA1 Message Date
19602e64df compile_kernel remove cp from graph test 2025-09-26 13:09:39 -07:00
e21b037756 Add tests for aot_export_joint_with_descriptors annotation (#163893)
As title, test

1) Annotation works with aot_export_joint_with_descriptor API
2) Annotation works with the 2 step "strict export.export + aot_export_joint_with_descriptor"
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163893
Approved by: https://github.com/SherlockNoMad
2025-09-26 19:25:44 +00:00
f8c7505855 [inductor] Fix unbounded number of substitutions when equality checks contain Max expr (#163685)
## Issue

From an internal use case, we found that if we have an equality rule like:

```
Max(15, u0) == s0 * Max(15, u0)
```

This would lead to wrong substitution rule being generated in the substitution table, the result would be the process got stuck in the substitution loop as if it hangs indefinitely, as it's doing the following substitutions:

```
Max(15, u0)
--> s0 * Max(15, u0)
--> s0 ** 2 * Max(15, u0)
--> s0 ** 3 * Max(15, u0)
--> s0 ** 4 * Max(15, u0)
...
```

The root cause is with SymPy expression comparison: as `Max` is [not inside the op class table](https://github.com/sympy/sympy/blob/1.14/sympy/core/basic.py#L50-L86), it'll take the [UNKNOWN](https://github.com/sympy/sympy/blob/1.14/sympy/core/basic.py#L120) order, and considered bigger than any other types of expressions.

## Fix
1. Added a breaking-out from the substitution while-loop to warn about any exccessive substitutions, what threshold should be used here and how to pass it are open to suggestion, using a hard-coded static value to be simple for now
2. Enhanced the sympy expression comparison logic, so that we first check if one expr "has" the other one or not, to help work around the issue with `Max` here

## Testing

- with the unittiest alone --> unittest stuck
- with the unittest and while-loop breakout, we could see tests finished with warning "**Substitution limit reached**":
```
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_unbounded_expr_substitutions_cpu W0923 13:00:37.864000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:70] +============================+
W0923 13:00:37.864000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:71] |     !!!   WARNING   !!!    |
W0923 13:00:37.865000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:72] +============================+
W0923 13:00:37.865000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:73] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [5.6947s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_unbounded_expr_substitutions_cuda W0923 13:00:39.633000 46140 /data/users/q1l1/pytorch/torch/_inductor/sizevars.py:765] [0/0] Substitution limit (30) reached w/ u1**30*Max(15, u0)
W0923 13:00:39.679000 46140 /data/users/q1l1/pytorch/torch/_inductor/sizevars.py:765] [0/0] Substitution limit (30) reached w/ 64*u1**30*Max(15, u0)
stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('async_compile_cache_miss', 1)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [5.6278s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleMps::test_unbounded_expr_substitutions_mps SKIPPED [0.0002s]

============================ 2 passed, 1 skipped, 870 deselected in 19.66s ============================
```

- with the unittest + comparison logic enhanced, we don't see the warning any more:
```
Running 3 items in this shard

test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_unbounded_expr_substitutions_cpu W0923 13:15:39.560000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:70] +============================+
W0923 13:15:39.561000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:71] |     !!!   WARNING   !!!    |
W0923 13:15:39.561000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:72] +============================+
W0923 13:15:39.562000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:73] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [6.6093s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_unbounded_expr_substitutions_cuda stats [('calls_captured', 5), ('unique_graphs', 1)]
inductor [('extern_calls', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('async_compile_cache_miss', 1)]
graph_break []
aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)]
PASSED [6.0502s]
test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleMps::test_unbounded_expr_substitutions_mps SKIPPED [0.0002s]

============================ 2 passed, 1 skipped, 870 deselected in 21.99s ============================
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163685
Approved by: https://github.com/jansel
2025-09-26 18:46:36 +00:00
425ea90f95 [testing] Add test owner labels for some cuda? tests (#163296)
I am trying to give some test files better owner labels than `module: unknown`.  I am not sure them, but they seem pretty reasonable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163296
Approved by: https://github.com/eqy, https://github.com/msaroufim
2025-09-26 18:26:56 +00:00
5b764267f4 [testing] Add test owner labels for some distributed tests (#163174)
I am trying to give some test files better owner labels than `module: unknown`.  I am not sure them, but they seem pretty reasonable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163174
Approved by: https://github.com/ezyang
2025-09-26 18:19:04 +00:00
50c0550f5a Add magic TORCH_MAKE_PYBIND_ENUM_FASTER macro (#163527)
See comment on the macro definition. In short, pybind11 3.x
added `py::native_enum`, and also had to add overhead for that new way
to bind enums on the critical path for calling functions that take
regular old `py::enum_`s as arguments (for example, `__eq__`).

Differential Revision: [D82873169](https://our.internmc.facebook.com/intern/diff/D82873169/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163527
Approved by: https://github.com/ezyang
2025-09-26 17:59:22 +00:00
d7491fb1c1 Fix tensor creation with empty names crash (#163957)
Partially fixes #148324

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163957
Approved by: https://github.com/malfet, https://github.com/janeyx99
2025-09-26 17:41:00 +00:00
9534c59311 [Inductor] address comments from https://github.com/pytorch/pytorch/pull/163803 (#163901)
Summary: address comments from https://github.com/pytorch/pytorch/pull/163803

Differential Revision: D83291637

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163901
Approved by: https://github.com/desertfire
2025-09-26 17:18:44 +00:00
5880996b4c Expose torch.nn.utils.parametrize (#163835)
`torch.nn.utils.parametrize` is not imported from `torch/nn/utils/__init__.py`, thus is not exposed and make it hard for code editors to statically analyze the code and provide auto-completion based on the function signature.

<img width="615" height="292" alt="Screenshot 2025-09-25 at 12 01 52 PM" src="https://github.com/user-attachments/assets/a276f6f0-87f3-4732-943d-2a92ea871974" />

after the fix:

<img width="964" height="393" alt="Screenshot 2025-09-25 at 12 02 16 PM" src="https://github.com/user-attachments/assets/ca47f09e-dc4e-4420-a2d2-11669e07471a" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163835
Approved by: https://github.com/albanD
2025-09-26 16:38:18 +00:00
1d26eb0fcc Move inductor.aot_compile to use new tracer (#163137)
Differential Revision: [D82603768](https://our.internmc.facebook.com/intern/diff/D82603768)

I feel no one probably uses this API now but still useful path for more test cases.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163137
Approved by: https://github.com/avikchaudhuri
ghstack dependencies: #163136
2025-09-26 15:54:24 +00:00
a05f6ecfec Fix bug with renaming submodules in dynamo for new tracer (#163136)
Differential Revision: [D82603767](https://our.internmc.facebook.com/intern/diff/D82603767)

Previously, i forgot to add handle call_module case which now will have export_root prepended to their names. Basically i want to clean up sth like:
```
graph():
      %l_self_export_root_sub_mod = call_module[target=l_self_export_root_sub_mod](%x, %y)
      %l_self_export_root_sub_mod_1 = call_module[target=l_self_export_root_sub_mod](%x, %y)
  ```

Dynamo graph can have call_module nodes that have messed up name due to our wrapper.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163136
Approved by: https://github.com/avikchaudhuri
2025-09-26 15:54:24 +00:00
c106ee8515 [FakeTensor] Supplement the relevant logic for converting conv1d to conv2d in meta_conv (#160408)
## Fixes https://github.com/pytorch/pytorch/issues/159462 also fixes #163569 , #163604

## summary
the issue is caused by the wrong stride of conv1d's result generated by meta_conv:
4d5b3f2d5a/torch/_meta_registrations.py (L2453-L2471)

and the wrong stride will be used to codegen size assert in inductor:
4d5b3f2d5a/torch/_inductor/ir.py (L6152-L6163)

## reason
So why the computed stride is wrong in the meta_conv function? because the corresponding backend will convert conv1d to conv2d and change the input tensor' size and memory_format(channel last). but the meta_conv do not do this transformation, so a mismatch happend.
4d5b3f2d5a/aten/src/ATen/native/Convolution.cpp (L1502-L1510)
 just add corresponding logic in meta_conv.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160408
Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/mlazos
2025-09-26 15:45:02 +00:00
8aba513506 [MPS] test sparse add MPS dtypes so we get proper expected failure (#163951)
Adds dtypeIfMPS so if op is supported we get proper error like unexpected success. Before we would never get unexpected success because tests were run in torch.double dtype which will always fail on MPS due to it not supporting the dtype
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163951
Approved by: https://github.com/malfet
2025-09-26 14:48:58 +00:00
8c194a367e [DeviceMesh][ez] Add a type alias for backend config (#163928)
Create a type alias for `tuple[Optional[str], Optional[C10dBackend.Options]]` since it is too long.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163928
Approved by: https://github.com/fegin
ghstack dependencies: #163212, #163288
2025-09-26 14:46:53 +00:00
33f3413bd3 [WIP][precompile] Set fake_mode of base tensor in fx graph pickler (#163738)
Summary:
When unpickling a fake tensor in fx graph pickler. It only sets the fake mode of the current tensor's metadata to the one that is consistent with pickler's `unpickle_state`. However, it doesn't set the fake mode of a tensor's base tensor when that tensor is a view.

This will cause an issue when dumping and loading the following graph
```
class GraphModule(torch.nn.Module):
    def forward(self, s77: "Sym(s77)", L_x_: "f32[s77, 8]"):
        l_x_ = L_x_
        chunk = l_x_.chunk(2, dim = -1);  l_x_ = None
        y: "f32[s77, 4]" = chunk[0];  chunk = None
        y_repeat: "f32[s77, 8]" = y.repeat_interleave(2, dim = -1);  y = None
        return (y_repeat,)
```
because `repeat_interleave` will create an intermediate fake tensor of size `[s77, 2, 4]` and it will become the base of the node `y_repeat`'s `meta['val']`.

This causes issues during the deserialization phase when applying AOT precompile to DeepSeek in vLLM.

Test Plan:
This has been tested in vLLM with DeepSeek.

As for unittest, ideally it should be `test_aot_compile_repeat_interleave` with mark_dynamic turned on. However, that's leading to some other pickle issues.

```
python test/dynamo/test_aot_compile.py -k test_aot_compile_repeat_interleave
```

I have yet to figure out a more appropriate unittest. But a proof-of-concept demo would be the following:
```
import inspect
import sympy
import torch
from torch.fx._graph_pickler import GraphPickler
from torch.fx.experimental.symbolic_shapes import ShapeEnv
from torch._subclasses import FakeTensorMode
from torch.fx._graph_pickler import GraphPickler, Options
from unittest.mock import patch

class M(torch.nn.Module):
    def forward(self, x):
        chunk = x.chunk(2, dim=-1)
        y = chunk[0]
        y_repeat = y.repeat_interleave(2, dim=-1)
        return y_repeat

def my_custom_backend(gm, example_inputs):
    global gm_global
    gm_global = gm
    return gm.forward

m = M()
m_opt = torch.compile(m, backend=my_custom_backend, fullgraph=True)

sample_inputs = (torch.randn(2, 8),)
torch._dynamo.mark_dynamic(sample_inputs[0], [0])
opt_out = m_opt(*sample_inputs)

graph_reducer_override = GraphPickler.reducer_override

def _graph_reducer_override(self, obj):
    if (inspect.isclass(obj) and issubclass(obj, sympy.Function)
            and hasattr(obj, "_torch_unpickler")):
        return obj._torch_unpickler, (obj._torch_handler_name, )
    if isinstance(obj, FakeTensorMode):
        return type(None), ()
    return graph_reducer_override(self, obj)

with patch.object(GraphPickler, "reducer_override", _graph_reducer_override):
    pickled_gm = GraphPickler.dumps(gm_global, Options(ops_filter=None))

fake_mode = FakeTensorMode(shape_env=ShapeEnv())
loaded_gm = GraphPickler.loads(pickled_gm, fake_mode)
```

Differential Revision: D83112599

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163738
Approved by: https://github.com/zhxchen17
2025-09-26 14:36:37 +00:00
d4e4f70768 Fix overflow in slow_conv3d when kernel size is too large. (#162718)
Also, adding check for padding to avoid segmentation fault caused by overflow.

Fixes #141846

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162718
Approved by: https://github.com/jgong5, https://github.com/Skylion007
2025-09-26 13:39:29 +00:00
bfd21cd3e6 Revert "Add less warps config to inner reductions (#162447)"
This reverts commit 768361e67f0eb36491d7b763ef38d7c928ebefe6.

Reverted https://github.com/pytorch/pytorch/pull/162447 on behalf of https://github.com/PaulZhang12 due to failed to land internally ([comment](https://github.com/pytorch/pytorch/pull/162447#issuecomment-3338680532))
2025-09-26 13:16:04 +00:00
7441a1b9b1 Update ruff to 0.13.1 (#163744)
Update ruff to 0.13.1 so that we can remove `UP038` from `pyproject.toml` because it has been removed from supported rules of ruff.
There are some fixes, the most notable one is [(PYI059)](https://docs.astral.sh/ruff/rules/generic-not-last-base-class/#generic-not-last-base-class-pyi059)
```
Checks for classes inheriting from typing.Generic[] where Generic[] is not the last base class in the bases tuple.

```

A BC-breaking change is introduced to change the typing of `OrderedSet .storage`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163744
Approved by: https://github.com/Skylion007, https://github.com/jingsh
2025-09-26 10:12:21 +00:00
6a2bd1f4ee [inductor] skip bmm when converting channel last (#159459)
Workaround of #159458 by remove some nodes output channel last set

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159459
Approved by: https://github.com/etaf, https://github.com/eellison, https://github.com/shunting314
2025-09-26 09:11:40 +00:00
4783e3ff49 Update torch-xpu-ops commit pin (#163758)
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@229e8b](229e8ba104), includes:

- Revert tracking of Work status for FlightRecorder in ProcessGroupXCCL to fix memory leak
- Enable SYCL warnings on Linux
- Fix accuracy issues with CTC loss
- Enable aten::nonzero_static on XPU backend
- Stop recursive calculations in polynomial kernels if tensor has NaNs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163758
Approved by: https://github.com/EikanWang
2025-09-26 09:05:08 +00:00
c8e5b7dabb Add SDPA patterns for T5 variants when batch size is 1 (#163252)
As mentioned in
https://github.com/pytorch/pytorch/blob/main/torch/_inductor/fx_passes/fuse_attention.py#L838, this PR generates patterns  for the cases batch size == 1.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163252
Approved by: https://github.com/Valentine233, https://github.com/jansel
2025-09-26 08:50:06 +00:00
04b51499f7 [CPU] Support transpose and packing fusion for bit8 (#163233)
To be used by CPU INT8 SDPA in TorchAO https://github.com/pytorch/ao/pull/3025. This change has a kernel improvement of about 9%.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163233
Approved by: https://github.com/mingfeima, https://github.com/jansel
2025-09-26 07:15:04 +00:00
54461a53bd [Inductor] Check if profiling before using record_function in CompiledFxGraph (#163747)
The call to `record_function` adds overhead even if profiling is disabled, which can as much as double the total runtime overhead of a compiled function. #163566 aims to make `record_function` more efficient, but doesn't fully eliminate overhead. This change adds a check if profiling is active before using `record_function`, which avoids this issue all together.

`TestExecutionTrace.test_execution_trace_with_pt2` in https://github.com/pytorch/pytorch/blob/main/test/profiler/test_execution_trace.py#L372 already checks that the `record_function` region is tracked during profiling.

Comparison of the `benchmarks/dynamo/microbenchmarks/overheads.py ` results:

Before Change:
```
requires_grad=False
compiled 56.9us (warmup=10.7s)

requires_grad=True
compiled 99.4us (warmup=0.2s)

inference_mode()
compiled 55.7us (warmup=0.1s)
```

After Change:
```
requires_grad=False
eager    6.9us (warmup=0.0s)
compiled 23.9us (warmup=22.3s)

requires_grad=True
eager    8.7us (warmup=0.0s)
compiled 56.8us (warmup=0.1s)

inference_mode()
eager    6.3us (warmup=0.0s)
compiled 22.2us (warmup=0.1s)
```

Additionally, #163866 introduces an instruction count benchmark. Because that is not merged and activated yet, here is a comparison:

Before Change:
```
runtime_overhead_inductor,instruction_count,222645
runtime_overhead_inductor_inference_mode,instruction_count,234998
runtime_overhead_inductor_requires_grad,instruction_count,293556
runtime_overhead_inductor_requires_grad_backward,instruction_count,78181
runtime_overhead_inductor_dynamic,instruction_count,234870
runtime_overhead_inductor_inference_mode_dynamic,instruction_count,248711
runtime_overhead_inductor_requires_grad_dynamic,instruction_count,309979
runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77599
```

After Change:
```
runtime_overhead_inductor,instruction_count,149997
runtime_overhead_inductor_inference_mode,instruction_count,163397
runtime_overhead_inductor_requires_grad,instruction_count,220722
runtime_overhead_inductor_requires_grad_backward,instruction_count,78276
runtime_overhead_inductor_dynamic,instruction_count,161177
runtime_overhead_inductor_inference_mode_dynamic,instruction_count,175495
runtime_overhead_inductor_requires_grad_dynamic,instruction_count,235674
runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77475
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163747
Approved by: https://github.com/mlazos, https://github.com/anijain2305
2025-09-26 06:49:40 +00:00
d1403250c9 Fix specialize_impl from triton.runtime.jit (#163844)
Summary:
In https://github.com/triton-lang/triton/pull/7771/ , create_specialize_impl is removed. We extend the support using native_specialize_impl.

Otherwise, PyTorch won't work with trunk triton.

Test Plan:
scripts/lufang/llm/launch_qwen3_vl_235b_a22b_thinking_2507_h100.sh

No more error message like
```
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Traceback (most recent call last):
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 924, in identify_mutated_tensors
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]     ttir_module, ordered_tensor_names = generate_ttir(
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 419, in generate_ttir
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]     specialization = _get_specialization(ordered_args.values())
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 390, in _get_specialization
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]     from triton.runtime.jit import specialize_impl as specialize_impl_orig
(Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] ImportError: cannot import name 'specialize_impl' from 'triton.runtime.jit' (/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inf
erence_platform_sp/llm_predictor_gpu/__service__/service#link-tree/triton/runtime/jit.py)
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Traceback (most recent call last):
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 924, in identify_mutated_tensors
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]     ttir_module, ordered_tensor_names = generate_ttir(
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 419, in generate_ttir
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]     specialization = _get_specialization(ordered_args.values())
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 390, in _get_specialization
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]     from triton.runtime.jit import specialize_impl as specialize_impl_orig
(Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] ImportError: cannot import name 'specialize_impl' from 'triton.runtime.jit' (/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inf
erence_platform_sp/llm_predictor_gpu/__service__/service#link-tree/triton/runtime/jit.py)
(Worker_TP5_EP5 pid=190359) [rank5]:W0924 23:24:48.216000 190359 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated
(Worker_TP5_EP5 pid=190359) [rank5]:W0924 23:24:48.216000 190359 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Traceback (most recent call last):
(Worker_TP5_EP5 pid=190359) [rank5]:W0924 23:24:48.216000 190359 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0]   File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to
rch/_higher_order_ops/triton_kernel_wrap.py", line 924, in identify_mutated_tensors
```

Differential Revision: D83229128

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163844
Approved by: https://github.com/henryoier, https://github.com/davidberard98, https://github.com/BoyuanFeng
2025-09-26 06:37:26 +00:00
b42e81def5 Allow unbacked to unbacked replacements if rhs unbacked symbols are all inputs (#163652)
This partially solve the issue https://github.com/pytorch/pytorch/issues/163641. We do not need to ban unbacked to unbacked replacement if all rhs symbols are inputs since we know those symbols are seen by the whole program.

This issue was found as i was tracing some vllm models with unbacked, namely  Qwen/Qwen2-1.5B-Instruct it makes reasoning logic easier to do those replacements.

as for data dependent similar pattern, I am thinking to create a set of replacements that we apply only during static eval
instead of none. to make reasoning better.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163652
Approved by: https://github.com/bobrenjc93
2025-09-26 06:23:22 +00:00
2a45f30ae7 Exporting aten.conv with cuda under fake mode on a cuda-less machine (#163912)
Summary:
Improve op coverage of exporting a CUDA model on a CPU-only machine under fake tensor mode.

For `torch.nn.functional.conv2d`, it will `_select_conv_backend` based on input and weight shapes.

When calling into `supportsDepthwiseConvolutionWithCuDNN()`, it calls `at::cuda::getCurrentDeviceProperties()` and fails on a CPU-only machine.

So we check if CUDA is actually enabled first.

Test Plan: TORCH_SHOW_CPP_STACKTRACES=1 buck2 run fbcode//caffe2/test:test_export -- --r nn_functional_conv2d

Reviewed By: angelayi, henryoier

Differential Revision: D80562984

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163912
Approved by: https://github.com/SherlockNoMad
2025-09-26 06:04:20 +00:00
11b4c0eb9e [aoti] Save compute information (#163792)
Metadata looks like:
```
{
  'AOTI_DEVICE_KEY': 'cpu',
  'AOTI_PLATFORM': 'linux',
  'AOTI_MACHINE': 'x86_64',
  'AOTI_CPU_ISA': 'AVX512',
  'AOTI_COMPUTE_CAPABILITY': '90'
}
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163792
Approved by: https://github.com/yushangdi, https://github.com/desertfire
ghstack dependencies: #163779
2025-09-26 05:40:44 +00:00
fb93491ddc [aoti] Load metadata w/o loading package (#163779)
Add a function to load the metadata stored in aoti without needing to load the .so. This can be used to store what platform we are compiling the .so on which we can check before loading the .so.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163779
Approved by: https://github.com/yushangdi, https://github.com/desertfire
2025-09-26 05:40:44 +00:00
39df24fe04 [Code Clean] Replace std::runtime_error with TORCH_CHECK (#163610)
Including:
- `torch/csrc/instruction_counter`
- `torch/csrc/lazy`
- `torch/csrc/monitor`
- `torch/csrc/profiler`
- `torch/csrc/dynamo`

Fixes part of #148114

Personal mistake about (PR #163317), this PR does the same thing **and PR #163317 has already been approved by @albanD.**

This is a personal mistake on my part, and I'm so sorry about that. Hope you won't mind @albanD. 🥹

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163610
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-09-26 04:52:48 +00:00
bbde16fe98 [vllm hash update] update the pinned vllm hash (#163823)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163823
Approved by: https://github.com/pytorchbot
2025-09-26 04:29:52 +00:00
1b78ca2ef5 [Triton] [Inductor] Prune template selection based on decompose_k (#163781)
Summary:

Triton templates tend to perform very poorly on large K, hence the introduction of decompose_k. As a result, when decompose_k is selected will disable exploring the Triton templates. We may want to consider an override in the future.

Note: Based on the timing results it may be desirable to better refine/prune the decompose k decisions.

Testing:

Tested by looking at the autotune/compilation time using a single shape in TritonBench.
`TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 python run --op gemm --rep 1000 --sleep 1.0 --m 512 --n 512 --k 300000 --only pt2_matmul_maxautotune`
Before this change:
`SingleProcess AUTOTUNE benchmarking takes 13.5368 seconds and 0.1595 seconds precompiling for 38 choices`
With this change:
`SingleProcess AUTOTUNE benchmarking takes 9.9626 seconds and 0.0020 seconds precompiling for 11 choices`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163781
Approved by: https://github.com/eellison, https://github.com/PaulZhang12
2025-09-26 04:09:35 +00:00
082eaf4aae [DeviceMesh] Add extra check in flatten result cache lookup (#163288)
while refactoring DeviceMesh bookkeeping, we found that there is one corner case which we just don't check whether the dims to be flattened into is same as the dims which an existing flattened name maps to. So we need to add extra cases in the unit test and extra check logic in the code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163288
Approved by: https://github.com/wz337, https://github.com/ezyang, https://github.com/fegin
ghstack dependencies: #163212
2025-09-26 03:41:58 +00:00
f1f2e3e4da [DeviceMesh] Introduce CuTe layout into devicemesh code base for internal bookkeeping (#163212)
DeviceMesh essentially is a way to specify how devices interact with each other or device layout. They are all integers but because they can have various shapes and meshes, it make internal bookkeeping internally way more challenging. Currently our internal bookkeeing inside DeviceMesh is not scalable, so in order to support new functions like `_unflatten`, we need to introduce very complicated logics inside DeviceMesh as pointed out per comment (https://github.com/pytorch/pytorch/pull/159482/files#r2256025452). So thanks to @lw 's suggestion and PoC PR (https://github.com/pytorch/pytorch/pull/160429), we realize that by leveraging CuTe layout algebra([ref](https://docs.nvidia.com/cutlass/media/docs/cpp/cute/02_layout_algebra.html)) from Cutlass will greatly simply our internal mechanical bookkeeping for and make the abstraction ops way easier on top of it. So to make things go incrementally, we propose couple steps here https://github.com/pytorch/pytorch/issues/160337#issuecomment-3195106243.

On top of what we have been doing about PyCute we want to continue add methods into the wrapper class so that we can get rank indexes needed for ProcessGroup Creation with a layout object. We also added detailed explanations and comments (thanks to llm) and unit test to show case the code indeed is working as expected.

More PRs are on the way.

This is a continue of https://github.com/pytorch/pytorch/pull/161016 (originally messed with EasyCLA)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163212
Approved by: https://github.com/ezyang, https://github.com/fegin, https://github.com/lw
2025-09-26 03:32:19 +00:00
67cc0e0ac9 Add Static Dispatch Kernels (#163676) (#163870)
Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/1951

X-link: https://github.com/pytorch/FBGEMM/pull/4927

Add a few missing static dispatch kernels for remote_ro.

Test Plan: Tested with scripts in D83028841.

Differential Revision: D83258808

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163870
Approved by: https://github.com/henryoier
2025-09-26 03:00:07 +00:00
bbf8aa43ef [a2av] Separate in/out splits into two tensors (#163837)
Old signature:
`all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name)`
New signature:
`all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor in_splits, Tensor(a!) out_splits_offsets, str group_name)`

i.e. split `in_out_splits` into IN tensor and OUT tensor so that we can define the TORCH_LIBRARY signature better.
Also to be in line with the 2D version.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163837
Approved by: https://github.com/fduwjj
ghstack dependencies: #163886
2025-09-26 01:03:54 +00:00
5daa79fd6e Remove dataclass_slots (#163623)
`dataclass` now has `slots` kwarg.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163623
Approved by: https://github.com/Skylion007
2025-09-26 00:54:42 +00:00
b776e0c71e [ROCm][CI/CD] create ROCm 7.0 magma tarball (#163883)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163883
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-26 00:51:17 +00:00
5c2f09d1f9 [export] _detect_attribute_assignment gives warning instead of raising ValueError (#163809)
Summary:
LSTM was not exportable with non-strict export as it failed at `_detect_attribute_assignment`

This is because the `_flat_weights` attribute in LSTM is a list of registered parameters and will be updated by the `_update_flat_weights` method in `forward`.

However, in `_detect_attribute_assignment`, we manually restore the state of the module by `mod.__dict__.update(snapshot)`. Therefore, it should be fine to turn the `ValueError` into a warning so that RNN models are exportable with non-strict export.

Added test to verify that there is no lifted tensor constant and no fake tensor leakage.

Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_export_rnn_variants_with_warning

Differential Revision: D83196971

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163809
Approved by: https://github.com/tugsbayasgalan
2025-09-26 00:43:29 +00:00
b4be380480 [ROCm] Implement float32 copy kernel (#163869)
* Add `float32_copy_kernel` for vectorizing float16/bfloat16 to float32 conversion

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163869
Approved by: https://github.com/jeffdaily
2025-09-26 00:39:30 +00:00
5b8fef3f17 Extend triton_mm auto-tune options for HIM shapes (#163273)
Summary:
Add an option to auto-tune for shape:
```
M=1024 N=171712 K=1024
```

Test Plan:
```
TRITON_PRINT_AUTOTUNING=1 buck2 run mode/opt-amd-gpu -c fbcode.enable_gpu_sections=true //pytorch/tritonbench:run -- --op fp8_gemm_rowwise --no_use_tma --no_use_persistent --m 1024 --n 171712 --k 1024 --bias
```
Before:
 {F1982074581}
After, saw 10%~ boost:
{F1982074585}

Differential Revision: D82687336

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163273
Approved by: https://github.com/jananisriram, https://github.com/Camyll
2025-09-26 00:05:57 +00:00
ff2f319e6e [MPS] Fix conv layout handling (#162776)
What started as simple fix for `mps_convolution_backward_input` resulted in a pretty significant refactor/fixes:
- Updated `mps_conv_use_channels_last` to return channels last output if either input or weights are channels last
- Use the same primitive throughout `Convolution.mm` to determine wether output should be allocated in channels last format or not

But doing only those two, resulted in crash in `test_memory_format_nn_Conv2d_mps_float32`, when weights were backward, and bias is present:
```
% python -c "import torch;print(torch.nn.functional.conv2d(torch.rand(2, 4, 3, 4,device='mps'), torch.rand(5, 4, 3, 3,device='mps').to(memory_format=torch.channels_last), torch.rand(5,device='mps')))"
/AppleInternal/Library/BuildRoots/4~B5E4ugDCh2RsPWAjMEoPu8LC5w1yXEwd7XweDhg/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:3619: failed assertion `Error: MLIR pass manager failed'
zsh: abort      python -c
```

Which requires a more thorough redesign/cleanup, namely:
- Do not alter the layout based on MacOS version, but rather do additional copies on MacOS-14 if inputs/output or weight are in channels last format ( done by defining `std::optional<Tensor> output_c;` that contains a contiguous copy of the output tensor
- Introduced `input_suggested_layout` which is set to ChannelsLast if and only if input is channels last and is running on MacOS-15+
- Delete unused `memory_layout` and `group` arguments from `fill_depthwise_conv_desc`
- Fix bias broadcasting logic for channels last

As result, in addition to adding one more regression test this change removes `expectedFailures` from:
- `TestModule.test_memory_format` for `Conv2d`, `ConvTranspose2d`, `LazyConv1d`, `LazyConvTranspose1d`
- `test_require_stride_expanded_dynamic_shapes`
-  `test_mutable_custom_op_fixed_layout2` for MacOS-14

Fixes https://github.com/pytorch/pytorch/issues/161905

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162776
Approved by: https://github.com/Skylion007
2025-09-25 23:41:34 +00:00
94195a37ae [BE] Remove HermeticPyObjectTLS and Simplify PythonOpRegistrationTrampoline (#163464)
Removes HermeticPyObjectTLS as we no longer need since torch deploy is no longer supported. PythonOpRegistrationTrampoline is also drastically simplified as and being prepped for removal in a future PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163464
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-09-25 23:30:50 +00:00
suo
c58e096cd0 [DTensor] implement logsumexp (#163879)
as title, mostly copypasta from internal. I am a dtensor noob, so please scrutinize my added test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163879
Approved by: https://github.com/XilunWu
2025-09-25 23:08:30 +00:00
2a6e6a9e3b [FSDP][Replicate] tests replicate parity for shared parameters (#162836)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162836
Approved by: https://github.com/mori360
ghstack dependencies: #162830
2025-09-25 23:08:22 +00:00
6e6c899347 [Reland][163423] Promote @requires_nvshmem instead of enable_triton (#163549)
#163423 was approved but reverted due to a revert of base.
Relanding without base.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163549
Approved by: https://github.com/wdvr

Co-authored-by: Wouter Devriendt <wouterdevriendt@meta.com>
2025-09-25 23:02:00 +00:00
366961df78 [FSDP][Replicate] tests replicate parity with activation checkpointing (#162830)
**Summary:**  In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. This tests that replicate function works correctly when combined with activation checkpointing

**Test Case**
1. pytest test/distributed/_composable/test_replicate_training.py -k test_train_parity_with_activation_checkpointing

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162830
Approved by: https://github.com/mori360
2025-09-25 22:57:00 +00:00
520fca82c8 Refactor Provenance Tracking (#163378)
Summary:
- Move the `provenance_level` flag check to inside the `set_kernel_post_grad_provenance_tracing` call to simply the code

- Move the `set_kernel_post_grad_provenance_tracing` call and `write_provenance_debug_handle` call to `codegen_comment`.

- If some `call_kernel` call sites don't have a proceeding `codegen_comment` call, add one. Now all `call_kernel` call sites are accompanied with a  `codegen_comment` call.

- Add a `codegen_comment` method to BaseScheduling and remove the noop `codegen_comment` method in Scheduling

- Remove `debug_handle` from `call_kernel`.

Test Plan:
CI

```
buck run @//mode/opt-split-dwarf fbcode//caffe2/test/inductor:provenance_tracing
```

Differential Revision: D82839271

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163378
Approved by: https://github.com/angelayi
2025-09-25 22:55:59 +00:00
908bcfd403 [AOTInductor] Add input information for Triton Kernels in AOTI (#160380)
Summary:
We use record_function to pass in input information to let Kineto show
input information.

Test Plan:
Before:
<img width="459" height="582" alt="Screenshot 2025-09-19 at 10 45 10 AM" src="https://github.com/user-attachments/assets/baa0c251-86e9-49ca-8c6c-fcd2619f7f48" />

After:
<img width="473" height="1130" alt="Screenshot 2025-09-19 at 10 44 53 AM" src="https://github.com/user-attachments/assets/b7942d84-0362-4b9e-9232-14de92bbdd00" />

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160380
Approved by: https://github.com/desertfire
ghstack dependencies: #163593
2025-09-25 22:41:04 +00:00
96275dbf88 [CI] Fix test_triton_wait_until hang (#163886)
I don't know why `nvshmem_barrier_all_kernel`  leads the test to hang. Will investigate.
But since it is an unnecessary call here, I am removing it to unblock other PRs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163886
Approved by: https://github.com/fegin
2025-09-25 22:22:16 +00:00
b14a14a662 [torchfuzz] make generated code much more concise and cleaner (#163812)
```
import torch

torch._dynamo.config.capture_scalar_outputs = True
torch.manual_seed(42)

def fuzzed_program(arg_0, arg_1, arg_2):
    var_node_3 = arg_0 # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_4 = torch.full((1,), (-0.29262632146522655-0.7687848816195035j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_2 = torch.ops.aten.add(var_node_3, var_node_4) # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_6 = arg_1 # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_7 = arg_2 # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_5 = torch.ops.aten.add(var_node_6, var_node_7) # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_1 = torch.ops.aten.add(var_node_2, var_node_5) # size=(1,), stride=(1,), dtype=complex128, device=cuda
    var_node_0 = var_node_1.item() # dtype=complex128
    return var_node_0

arg_0 = torch.as_strided(torch.randn(1).to(torch.complex128), (1,), (1,))
arg_1 = torch.as_strided(torch.randn(1).to(torch.complex128), (1,), (1,))
arg_2 = torch.as_strided(torch.randn(1).to(torch.complex128), (1,), (1,))

args = (arg_0, arg_1, arg_2)
result_original = fuzzed_program(*args)
print(' eager success')
compiled_program = torch.compile(fuzzed_program, fullgraph=False, dynamic=True)
result_compiled = compiled_program(*args)
print(' compile success')
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163812
Approved by: https://github.com/pianpwk
ghstack dependencies: #163743
2025-09-25 22:12:33 +00:00
92f7361e27 [DTensor] fix uneven _StridedShard (#163843)
Previous uneven `_StridedShard` in https://github.com/pytorch/pytorch/pull/150490 seems failing cases like sharding `tensor = torch.arange(6)` with FSDP 2, TP 2.

This PR attempts to reinvent `_StridedShard`.

I didn't test nested `_StridedShard`, because there shouldn't be any use cases. I think it will become quite messy when it comes to **nested uneven** `_StridedShard`. We are probably going to deprecate it anyway after @zpcore 's work https://github.com/pytorch/pytorch/pull/160266 on ordered sharding, so IMO not worth it to make it too general.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163843
Approved by: https://github.com/ezyang
2025-09-25 22:12:29 +00:00
6a6d838832 Add H100 runner to be recognized in actionlint (#163795)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163795
Approved by: https://github.com/huydhn, https://github.com/seemethere
2025-09-25 22:09:11 +00:00
183dca423f [Inductor] add a new config fallback_embedding_bag_byte_unpack (#163803)
Differential Revision: D82988783

introduce an inductor config fallback_embedding_bag_byte_unpack so we can have options to not let inductor decompose the op

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163803
Approved by: https://github.com/henryoier
2025-09-25 22:07:04 +00:00
b8efa336d2 [torchfuzz] simplify codegen and runner (#163743)
much less code. a followup PR will make these repro files even smaller.
small is important since it reduces the time for users to understand
what the repro is doing. here's a sample:

```
(/home/bobren/local/a/pytorch-env) [21:34] devgpu009:/home/bobren/local/a/pytorch/tools/experimental/dynamic_shapes/torchfuzz [130] python fuzzer.py --seed 42
Running single fuzz_and_execute...
Using seed: 42, max_depth: 10
Running generated program...
Selected CUDA_VISIBLE_DEVICES=2
=== Program Output ===
 eager success
 compile success

===============================
=== Program Source ===
import torch
import sys
import os
fuzzer_dir = r'/home/bobren/local/a/pytorch/tools/experimental/dynamic_shapes/torchfuzz'
if fuzzer_dir not in sys.path:
    sys.path.insert(0, fuzzer_dir)
from tensor_fuzzer import fuzz_scalar, fuzz_tensor_simple, ScalarSpec, TensorSpec

def fuzzed_program(arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10, arg_11, arg_12, arg_13, arg_14, arg_15, arg_16, arg_17, arg_18, arg_19, arg_20, arg_21, arg_22, arg_23, arg_24, arg_25, arg_26):
    # Node node_4: arg (depth 6)
    var_node_4 = arg_0 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_7: constant (depth 4)
    var_node_7 = torch.full((1,), (-0.8353595860703585-0.8384634248041143j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_8: arg (depth 4)
    var_node_8 = arg_1 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_6: tensor_pointwise (depth 5)
    var_node_6 = torch.ops.aten.mul(var_node_7, var_node_8) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_9: constant (depth 5)
    var_node_9 = torch.full((1,), (-0.32478860712861235+0.033909682598544454j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_5: tensor_pointwise (depth 6)
    var_node_5 = torch.ops.aten.mul(var_node_6, var_node_9) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_3: tensor_pointwise (depth 7)
    var_node_3 = torch.ops.aten.sub(var_node_4, var_node_5) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_11: arg (depth 6)
    var_node_11 = arg_2 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_18: constant (depth 0)
    var_node_18 = torch.full((1,), (0.12855308616305575+1.5268033634325642j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_19: arg (depth 0)
    var_node_19 = arg_3 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_17: tensor_pointwise (depth 1)
    var_node_17 = torch.ops.aten.mul(var_node_18, var_node_19) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_21: arg (depth 0)
    var_node_21 = arg_4 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_22: arg (depth 0)
    var_node_22 = arg_5 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_20: tensor_pointwise (depth 1)
    var_node_20 = torch.ops.aten.sub(var_node_21, var_node_22) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_16: tensor_pointwise (depth 2)
    var_node_16 = torch.ops.aten.add(var_node_17, var_node_20) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_25: arg (depth 0)
    var_node_25 = arg_6 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_26: arg (depth 0)
    var_node_26 = arg_7 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_24: tensor_pointwise (depth 1)
    var_node_24 = torch.ops.aten.add(var_node_25, var_node_26) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_27: constant (depth 1)
    var_node_27 = torch.full((1,), (-0.6315711191260084+1.342004076501214j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_23: tensor_pointwise (depth 2)
    var_node_23 = torch.ops.aten.mul(var_node_24, var_node_27) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_15: tensor_pointwise (depth 3)
    var_node_15 = torch.ops.aten.mul(var_node_16, var_node_23) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_28: constant (depth 3)
    var_node_28 = torch.full((1,), (1.064498531874825-0.37289464356501284j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_14: tensor_pointwise (depth 4)
    var_node_14 = torch.ops.aten.mul(var_node_15, var_node_28) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_30: arg (depth 3)
    var_node_30 = arg_8 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_32: arg (depth 2)
    var_node_32 = arg_9 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_33: constant (depth 2)
    var_node_33 = torch.full((1,), (1.5815627438573372+0.5124667911691704j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_31: tensor_pointwise (depth 3)
    var_node_31 = torch.ops.aten.div(var_node_32, var_node_33) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_29: tensor_pointwise (depth 4)
    var_node_29 = torch.ops.aten.div(var_node_30, var_node_31) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_13: tensor_pointwise (depth 5)
    var_node_13 = torch.ops.aten.div(var_node_14, var_node_29) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_39: arg (depth 0)
    var_node_39 = arg_10 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_40: constant (depth 0)
    var_node_40 = torch.full((1,), (-0.5987350493494642-0.5711360569376475j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_38: tensor_pointwise (depth 1)
    var_node_38 = torch.ops.aten.mul(var_node_39, var_node_40) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_41: arg (depth 1)
    var_node_41 = arg_11 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_37: tensor_pointwise (depth 2)
    var_node_37 = torch.ops.aten.add(var_node_38, var_node_41) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_42: constant (depth 2)
    var_node_42 = torch.full((1,), (0.7246044564672116-0.5930730980273312j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_36: tensor_pointwise (depth 3)
    var_node_36 = torch.ops.aten.mul(var_node_37, var_node_42) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_43: constant (depth 3)
    var_node_43 = torch.full((1,), (-0.7582976293117148+1.1880929376258396j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_35: tensor_pointwise (depth 4)
    var_node_35 = torch.ops.aten.mul(var_node_36, var_node_43) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_45: constant (depth 3)
    var_node_45 = torch.full((1,), (1.0896212896322774+0.3124038130417098j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_46: arg (depth 3)
    var_node_46 = arg_12 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_44: tensor_pointwise (depth 4)
    var_node_44 = torch.ops.aten.add(var_node_45, var_node_46) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_34: tensor_pointwise (depth 5)
    var_node_34 = torch.ops.aten.div(var_node_35, var_node_44) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_12: tensor_pointwise (depth 6)
    var_node_12 = torch.ops.aten.div(var_node_13, var_node_34) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_10: tensor_pointwise (depth 7)
    var_node_10 = torch.ops.aten.mul(var_node_11, var_node_12) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_2: tensor_pointwise (depth 8)
    var_node_2 = torch.ops.aten.div(var_node_3, var_node_10) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_48: constant (depth 7)
    var_node_48 = torch.full((1,), (-1.047745491289218+0.279447315087422j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_54: arg (depth 2)
    var_node_54 = arg_13 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_55: arg (depth 2)
    var_node_55 = arg_14 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_53: tensor_pointwise (depth 3)
    var_node_53 = torch.ops.aten.div(var_node_54, var_node_55) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_56: arg (depth 3)
    var_node_56 = arg_15 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_52: tensor_pointwise (depth 4)
    var_node_52 = torch.ops.aten.div(var_node_53, var_node_56) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_59: arg (depth 2)
    var_node_59 = arg_16 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_60: arg (depth 2)
    var_node_60 = arg_17 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_58: tensor_pointwise (depth 3)
    var_node_58 = torch.ops.aten.div(var_node_59, var_node_60) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_61: constant (depth 3)
    var_node_61 = torch.full((1,), (-0.7386327586576402-0.027025998767172658j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_57: tensor_pointwise (depth 4)
    var_node_57 = torch.ops.aten.add(var_node_58, var_node_61) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_51: tensor_pointwise (depth 5)
    var_node_51 = torch.ops.aten.sub(var_node_52, var_node_57) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_64: arg (depth 3)
    var_node_64 = arg_18 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_67: arg (depth 1)
    var_node_67 = arg_19 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_68: constant (depth 1)
    var_node_68 = torch.full((1,), (-0.6840241429755998+1.327637020136433j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_66: tensor_pointwise (depth 2)
    var_node_66 = torch.ops.aten.mul(var_node_67, var_node_68) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_69: arg (depth 2)
    var_node_69 = arg_20 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_65: tensor_pointwise (depth 3)
    var_node_65 = torch.ops.aten.sub(var_node_66, var_node_69) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_63: tensor_pointwise (depth 4)
    var_node_63 = torch.ops.aten.sub(var_node_64, var_node_65) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_70: arg (depth 4)
    var_node_70 = arg_21 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_62: tensor_pointwise (depth 5)
    var_node_62 = torch.ops.aten.sub(var_node_63, var_node_70) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_50: tensor_pointwise (depth 6)
    var_node_50 = torch.ops.aten.mul(var_node_51, var_node_62) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_76: constant (depth 1)
    var_node_76 = torch.full((1,), (1.864651314238342+0.27066487315113186j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_77: arg (depth 1)
    var_node_77 = arg_22 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_75: tensor_pointwise (depth 2)
    var_node_75 = torch.ops.aten.mul(var_node_76, var_node_77) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_78: arg (depth 2)
    var_node_78 = arg_23 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_74: tensor_pointwise (depth 3)
    var_node_74 = torch.ops.aten.add(var_node_75, var_node_78) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_79: arg (depth 3)
    var_node_79 = arg_24 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_73: tensor_pointwise (depth 4)
    var_node_73 = torch.ops.aten.mul(var_node_74, var_node_79) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_80: arg (depth 4)
    var_node_80 = arg_25 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_72: tensor_pointwise (depth 5)
    var_node_72 = torch.ops.aten.mul(var_node_73, var_node_80) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_82: constant (depth 4)
    var_node_82 = torch.full((1,), (1.6341547018841247+0.3096989611326181j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_84: constant (depth 3)
    var_node_84 = torch.full((1,), (0.9609065596935821+0.2920229825681946j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_85: arg (depth 3)
    var_node_85 = arg_26 # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_83: tensor_pointwise (depth 4)
    var_node_83 = torch.ops.aten.add(var_node_84, var_node_85) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_81: tensor_pointwise (depth 5)
    var_node_81 = torch.ops.aten.sub(var_node_82, var_node_83) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_71: tensor_pointwise (depth 6)
    var_node_71 = torch.ops.aten.sub(var_node_72, var_node_81) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_49: tensor_pointwise (depth 7)
    var_node_49 = torch.ops.aten.mul(var_node_50, var_node_71) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_47: tensor_pointwise (depth 8)
    var_node_47 = torch.ops.aten.add(var_node_48, var_node_49) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_1: tensor_pointwise (depth 9)
    var_node_1 = torch.ops.aten.add(var_node_2, var_node_47) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_0: torch.ops.aten.item (depth 10)
    var_node_0 = var_node_1.item() # dtype=complex128

    # Final result from root node
    return var_node_0

arg_0 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10042)
arg_1 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10043)
arg_2 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10044)
arg_3 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10045)
arg_4 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10046)
arg_5 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10047)
arg_6 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10048)
arg_7 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10049)
arg_8 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10050)
arg_9 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10051)
arg_10 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10052)
arg_11 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10053)
arg_12 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10054)
arg_13 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10055)
arg_14 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10056)
arg_15 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10057)
arg_16 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10058)
arg_17 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10059)
arg_18 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10060)
arg_19 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10061)
arg_20 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10062)
arg_21 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10063)
arg_22 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10064)
arg_23 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10065)
arg_24 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10066)
arg_25 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10067)
arg_26 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10068)
import torch
import sys
torch._dynamo.config.capture_scalar_outputs = True

args = (arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10, arg_11, arg_12, arg_13, arg_14, arg_15, arg_16, arg_17, arg_18, arg_19, arg_20, arg_21, arg_22, arg_23, arg_24, arg_25, arg_26)
result_original = fuzzed_program(*args)
print(' eager success')
sys.exit(1)
compiled_program = torch.compile(fuzzed_program, fullgraph=False, dynamic=True)
result_compiled = compiled_program(*args)
print(' compile success')

======================
Program exited with code: 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163743
Approved by: https://github.com/pianpwk
2025-09-25 21:42:22 +00:00
1cffa42d4d PyTorch histc fix for values with large magnitudes (#163506)
Summary:
The current implementation of the `histc` function on CPU doesn't take into account the nature of the floating point precision represenation when two numbers have very different magnitudes.

In the code of `histc` there is a following logic, which tries to fix an issue when automatically calculated `min` and `max` are identical:
```
if (leftmost_edge == rightmost_edge) {
        leftmost_edge -= 1;
        rightmost_edge += 1;
    }

...

TORCH_CHECK(leftmost_edge < rightmost_edge, "torch.histc: max must be larger than min");
```

But, not for all floating point values expanding the range exactly by 1 will give the representable result that is different from the original value.

The test code:

```
info = th.finfo(th.float32)
f_min = info.min

test_tensor = th.ones((224, 224), dtype=th.float64) * f_min
res = th.histc(test_tensor, bins=10)
```

Actual result:
```
RuntimeError: torch.histc: max must be larger than min
```

Expected result:
Everything should work fine.

NOTICE: If we set `f_min` just to small enough number, code works, which demonstrates the correct purpose of the possible range correction.

In short, `f_min + 1 == f_min` executes to true, since we reach the precision of the floating point prepresentation.
Please notice, this is not limitation of the float32 data type, since all computations happen in float64 (C++ data type `double`). The magnitudes are just different enough, that we reach the precision representation with simple approach of `+/-1`.

Interesting is that `histogram` function doesn't throw an exception, because edges range selection is implemented differently.

The fix we propose is to use `std::nextafter` which returns next representable floating point value starting from the current one in the direction of the lowest or max numbers. In theory, mathecmatically correct is to use this function without constrains, but to maintain backward compatibility in case if there is a code which relies on the current logic of `+/-1` offset we call `std::min` and `std::max` to pick the right representable value (i.e. for small floating point values the next representable value has step smaller than 1 for large values it's larger than 1).
We could stick to `histogram` implementation, but again, to avoid possible backward compatibility breaks, we decided to use the fix presented in this change.

*The real use case scenario:*
In our project we use the well-known transformer version from HuggingFace which fills up the buffer with float32 min (please note this is not a minimal value closer to 0, it's minimal absolute value which is often like `-max`).
The code where it sits is here:
https://github.com/huggingface/transformers/blob/v4.51.1/src/transformers/models/mimi/modeling_mimi.py#L1159

Switching to other version of the transformer will lead to other issues in our project and the bug which we fix here may appear in other projects and scenarios.

The real world problem appears when for such tensor the CPU version of the `histc` is called. In our usecase, it happens because this tensor is an input to the softmax activaiton function and as part of the quantisation the input parameter should go trough the observer as well. In our case the default Histogram observer is selected, which calls the `histc`.

Test Plan:
The simple test code snippet doesn't produce failure:
```
f_min = th.finfo(th.float32).min
test_tensor = th.ones((224, 224), dtype=th.float32) * f_min
th.histc(test_tensor, bins=10)
```

**Testing update:**
The `test_histc` has been updated accordingly.
Now when we have +INF as all values of the tensor, the previous representation of the floating number should be <max_float>, hence the assert message is changed from `[inf, inf]` to `[<max_float>|inf, inf]`.
The test also extended to check the assert message when tensor is filled with values -INF and with combination of (-INF, +INF).
The new regexp assert includes possible output as `inf` and any floating point number in scientific representation for one of the bin edges. We left `inf` as possible value due to possible difference in implementation between CPU and CUDA.

Differential Revision: D82955597

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163506
Approved by: https://github.com/jermenkoo, https://github.com/malfet
2025-09-25 20:55:25 +00:00
ebfc87e303 Always produce kernel_info.json (#163715)
Summary: Always produce kernel_info.json so zoomer can use this json to populate GPU traces

Test Plan: CI

Differential Revision: D82762435

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163715
Approved by: https://github.com/angelayi
2025-09-25 19:38:49 +00:00
21a41edd4f Add fake_impl for _native_multi_head_attention (#163700)
Test Plan: See added test in test_export.py

Differential Revision: D83099187

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163700
Approved by: https://github.com/angelayi
2025-09-25 19:01:27 +00:00
7bad9c5a64 Revert "Update ruff to 0.13.1 (#163744)"
This reverts commit 3dd89a079f2b0c1d39351f98ff5d5ca882523152.

Reverted https://github.com/pytorch/pytorch/pull/163744 on behalf of https://github.com/malfet due to Broke lint, see https://github.com/pytorch/pytorch/actions/runs/18016220484/job/51261729375 looks like a landrace with PR that updated min-version to 3.10 ([comment](https://github.com/pytorch/pytorch/pull/163744#issuecomment-3335534084))
2025-09-25 18:54:03 +00:00
151e66e50d Update documentation for torch.index_select (#163616)
Description said "entries in index which is a LongTensor" but index_select can accept an IntTensor as the parameter
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163616
Approved by: https://github.com/jbschlosser

Co-authored-by: Joel Schlosser <75754324+jbschlosser@users.noreply.github.com>
2025-09-25 18:29:17 +00:00
b61bdc7cc4 Fix cpp build (#162774)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162774
Approved by: https://github.com/malfet, https://github.com/atalman
2025-09-25 18:21:45 +00:00
3dd89a079f Update ruff to 0.13.1 (#163744)
Update ruff to 0.13.1 so that we can remove `UP038` from `pyproject.toml` because it has been removed from supported rules of ruff.
There are some fixes, the most notable one is [(PYI059)](https://docs.astral.sh/ruff/rules/generic-not-last-base-class/#generic-not-last-base-class-pyi059)
```
Checks for classes inheriting from typing.Generic[] where Generic[] is not the last base class in the bases tuple.

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163744
Approved by: https://github.com/Skylion007, https://github.com/jingsh
2025-09-25 17:52:35 +00:00
6539537a59 [ROCm][CD] create ROCm 7.0 images for binary builds (#163860)
Adds gfx950.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163860
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-25 17:26:40 +00:00
3cbfbbd691 [ROCm] Transformer/SDPA unit test parity (#163745)
## Major Changes

* Efficient Attention on ROCM requires last dimensions of input tensors align with 16 bytes.
  - Unlike FA, ME does not pad input tensors in `scaled_dot_product_attention` and hence this is required.
* Fix `atomic_counter` handling in varlen FA API
* Unskips a few unit tests.

Fixes #157120
Fixes #157121
Fixes #157122
Fixes #157167
Fixes #155217
Fixes #157043
Fixes #157060

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163745
Approved by: https://github.com/jeffdaily
2025-09-25 17:14:19 +00:00
112e204797 Revert "[CUDA] Compare major version of the runtime device arch against the built version of the pytorch binary (#161299)"
This reverts commit 7163dce1e091cb5564c723110314bb372b5e81a8.

Reverted https://github.com/pytorch/pytorch/pull/161299 on behalf of https://github.com/nWEIdia due to Incorrectly suppressing useful warnings when running sm89 binary on sm86 ([comment](https://github.com/pytorch/pytorch/pull/161299#issuecomment-3335127621))
2025-09-25 17:13:32 +00:00
f9821b1be7 DebugMode supports_higher_order_operators=True (#163824)
Make DebugMode supports HOP

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163824
Approved by: https://github.com/ydwu4
2025-09-25 17:11:43 +00:00
c4312b443f [Tools] Adapting the Hypothesis library (version 5.x) for use with the PyTorch framework (#163748)
Starting from version 5.x, the Hypothesis library removed the timeout setting and only retained the deadline.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163748
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-09-25 16:41:50 +00:00
7194d77550 Revert "enable test_sampled_addmm_zero_sized_cuda for rocm (#121940)" (#163848)
This reverts commit 5494b2a8d38c3ddbeb2d96a5ac990e20ec4c48fd.

Need to skip `test_sparse_csr.py::TestSparseCSRCUDA::test_sampled_addmm_zero_sized_cuda_*` again. Tests are failing now with "core dumped" error
```
python test_sparse_csr.py -v -k test_sampled_addmm_zero_sized_cuda_float64

  test_sampled_addmm_zero_sized_cuda_float64 (__main__.TestSparseCSRCUDA) ... /tmp/pytorch/test/test_sparse_csr.py:2503:   c = torch.empty(m, n, dtype=dtype, device=device, layout=torch.sparse_csr)
GPU core dump created: gpucore.186789
:0:rocdevice.cpp            :2992: 4701819131755 us:  Callback: Queue 0x760cdcd00000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016
Aborted (core dumped)
```
These failures are linked to `test_sparse_csr.py::TestSparseCSRCUDA::test_select_SparseBSC_int32_cuda_*` due to incorrect test log parsing. We will be able to close these issues also:

- Fixes https://github.com/pytorch/pytorch/issues/163663
- Fixes https://github.com/pytorch/pytorch/issues/160786
- Fixes https://github.com/pytorch/pytorch/issues/160785
- Fixes https://github.com/pytorch/pytorch/issues/160784

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163848
Approved by: https://github.com/jeffdaily
2025-09-25 16:38:00 +00:00
22d5f5ff94 [OpenReg][BE] Replacing explicit prefix/suffix with CMake variables (#163850)
As the title states, suffixes like`.dylib` and `lib` can be replaced by `CMAKE_SHARED_LIBRARY_SUFFIX`, and prefixes like `lib` can be replaced by `CMAKE_SHARED_LIBRARY_PREFIX` on Unix or `CMAKE_IMPORT_LIBRARY_PREFIX` on Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163850
Approved by: https://github.com/albanD
2025-09-25 16:33:16 +00:00
c8e75c48b9 [fr] Skip the dtype check for some one to all or all to one collective (#163839)
As title, in practice we found that sometimes, the dtype of gather does not match when it comes to output among all ranks, which is a undefined behavior. Same with broadcast and scatter. And they are all completed, so we should not think they are errors, we can skip it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163839
Approved by: https://github.com/VieEeEw
2025-09-25 16:02:06 +00:00
e8f5f1b1a2 [NFC] fixed mistake in comment (#163697)
I used "floor" instead of "ceil", so fix it. Also fixed other typo.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163697
Approved by: https://github.com/jcaip
2025-09-25 15:53:51 +00:00
10e69a6e17 Preserve user annotation in graph (#163673)
```
import torch
import torch.fx.traceback as fx_traceback
import torch.export

class M(torch.nn.Module):
    def forward(self, x):
        with fx_traceback.annotate({"pp_stage": 0}):
            with fx_traceback.annotate({"fdsp_bucket": 0}):
                x = x + 1
            x = x - 2
            with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}):
                x = x * 2
        x = x / 3
        return x

m = M()

with fx_traceback.preserve_node_meta():
    ep = torch.export.export(m, (torch.randn(10),))

for node in ep.graph.nodes:
    if node.op == "call_function":
        print(f"{node.target}, {node.meta.get("custom", {})}")

```

prints

```
aten.add.Tensor, {'pp_stage': 0, 'fdsp_bucket': 0}
aten.sub.Tensor, {'pp_stage': 0}
aten.mul.Tensor, {'pp_stage': 0, 'cuda_stream': 2, 'fsdp_bucket': 1}
aten.div.Tensor, {}
```

TODOs:
- run_decomposition is failing
- Need to test with the new full graph capture + aot_export_joint apis
- Need to make the annotation propagate through autograd engine to reach the bw nodes. Sample impl here: https://github.com/pytorch/pytorch/pull/83558
- Edward want to restrict the key in custom field to be top-level singleton objects only
- also need to take care of metadata merging when passes are fusing nodes

Thanks @angelayi  for contributing the dynamo fixes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163673
Approved by: https://github.com/albanD, https://github.com/angelayi
2025-09-25 15:50:15 +00:00
5fcde74aed Fix pipeline parallelism not correctly initializing backwards stages when evaluating before training. (#162823)
Previously, an eval() call before a training step() would not correctly initialize the backward pass of the pipeline stages, leading to errors during the subsequent training step. This PR ensures that the backward stages can still be initialized after an eval() call.

Fixes #162822

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162823
Approved by: https://github.com/dcci, https://github.com/H-Huang
2025-09-25 15:13:19 +00:00
6fa3715c12 Expose Kineto event metadata in PyTorch Profiler events (#161624)
## Overview
This PR allows the profiler users to access `Kineto` and `TorchOp` metadata in JSON string format through a new `metadata_json` attribute in `FunctionEvent` objects, which is triggered through a new `expose_kineto_event_metadata` flag in `ExperimentalConfig`.

## Testing
A unit test was added to validate functionality.

## Documentation
Added/updated function doc strings where appropriate.

## Example output
```python
import torch
from torch.profiler import profile

with profile(experimental_config=torch._C._profiler._ExperimentalConfig(expose_kineto_event_metadata=True)) as prof:
    res = torch.mm(torch.rand(1024, 1024), torch.rand(1024, 1024))

for event in prof.events():
    print(f'name: {event.key}, metadata: {event.metadata_json}')
```

```
name: aten::rand, metadata: "Ev Idx": 0
name: aten::empty, metadata: "Ev Idx": 1
name: aten::uniform_, metadata: "Ev Idx": 2
name: aten::rand, metadata: "Ev Idx": 3
name: aten::empty, metadata: "Ev Idx": 4
name: aten::uniform_, metadata: "Ev Idx": 5
name: aten::mm, metadata: "Ev Idx": 6
name: aten::resolve_conj, metadata: "Ev Idx": 7
name: aten::resolve_conj, metadata: "Ev Idx": 8
name: aten::resolve_conj, metadata: "Ev Idx": 9
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161624
Approved by: https://github.com/sraikund16
2025-09-25 14:58:30 +00:00
98c4e35f14 [CD] Add statically linked windows libraries to exclude list (#163768)
Fixes: https://github.com/pytorch/pytorch/issues/159514

Seeing following in the Wheel build logs:
```
Linking CXX static library lib\kineto.lib
Linking CXX static library lib\dnnl.lib
....
```

These files are around 800MB uncompressed and 109MB compressed, hence provide ~50% size reduction for Windows CPU builds.

Test Plan: Build Pytorch Windows binary. Build vision, audio and torchcodec with this binary. Smoke test.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163768
Approved by: https://github.com/albanD, https://github.com/malfet
2025-09-25 14:03:14 +00:00
00059db034 Revert "[RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed modules importable even when backend not built (#159889) (#162594)"
This reverts commit 09cb34c1dce8fe1b880bbf3115d8ddad3401d871.

Reverted https://github.com/pytorch/pytorch/pull/162594 on behalf of https://github.com/malfet due to reverted internally and now can be safely reverted in OSS ([comment](https://github.com/pytorch/pytorch/pull/162594#issuecomment-3334176367))
2025-09-25 13:47:46 +00:00
22fcc8b76b [async_tp] Support mm+rs with scatter_dim matmul K by sharding B (#162794)
Current state: Shape mismatch failure when mm+rs on the last mm scatter dim.

Adding separate path to handle lastdim for aten.mm, scaled_mm should be handled similarly, but needs additional PR.
So disabling scaled_mm case with filter matmul function.

Adding inductor.config for this change that is True by default for fast debuggability of new path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162794
Approved by: https://github.com/fegin
2025-09-25 12:18:39 +00:00
ab2ce3c50e [Code Clean] Replace std::runtime_error with TORCH_CHECK (#163264)
Related ISSUE: https://github.com/pytorch/pytorch/issues/148114
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163264
Approved by: https://github.com/albanD, https://github.com/cyyever
2025-09-25 11:28:51 +00:00
7d710403b0 Reapply "Make functionalization ViewMeta serializable with pickle. (#143712)" (#163769)
### Summary:
NOTE: This is a re-export of https://github.com/pytorch/pytorch/pull/161994 ; the changes between these two PRs is exclusively to the buck/build files

(Summary from #161994 )
Attempted rebase of https://github.com/pytorch/pytorch/pull/143712.

This reverts commit 6c713ccb5e0df227dd5b630057cbccd373cbe7d6.

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

imported-using-ghimport

Test Plan: Imported from OSS

Differential Revision: D81524507

Pulled By: Lucaskabela

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163769
Approved by: https://github.com/dolpm

Co-authored-by: Brian Hirsh <hirsheybar@fb.com>
2025-09-25 10:27:37 +00:00
29cbcbac42 [BE] Make PyObjectSlot use a global PyInterpreter (#162659)
This pr gets rid of the pyobj_interpreter_ variable from PyObjectSlot and saves a word in the process

Gonna ask for review from @huydhn as there are some changes to CI.

Testing: imported internally and the failed android build seems to work now!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162659
Approved by: https://github.com/albanD, https://github.com/huydhn
2025-09-25 08:53:19 +00:00
5f90e8c7ae [PGO] ignore extra PGO key if warm/cold cache present (#163810)
Summary: avoids PGO profile merges

Test Plan: test_pgo

Differential Revision: D83200714

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163810
Approved by: https://github.com/bobrenjc93
2025-09-25 07:16:05 +00:00
eb7f4e0004 Add PEP 517 compliant Python source distribution to release process (#157815)
This adds the actual creation of a standards compliant sdist along with its upload to s3 to the create release workflow.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157815
Approved by: https://github.com/malfet, https://github.com/atalman
ghstack dependencies: #157814, #160315
2025-09-25 07:15:52 +00:00
42928876eb Add sdist handling to version finding (#160315)
The version finding logic triggered from `setup.py` generally tries to take the git information into account.
This is fine for most situations where we are building from a checkout, but it creates a problem in the case of sdists, as here the version is determined at the time of sdist creation, taking the git information into account, but then later recalculated when building wheels or installing from the sdist, now with the git information missing.

The solution is to take the version information directly from the sdist, which this PR adds by means of parsing the `PKG-INFO` which marks an unpacked sdist.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160315
Approved by: https://github.com/atalman
ghstack dependencies: #157814
2025-09-25 07:15:51 +00:00
c44ec9f4c2 Improve MANIFEST.in for source distribution (#157814)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157814
Approved by: https://github.com/XuehaiPan, https://github.com/atalman
2025-09-25 07:15:42 +00:00
353991dd92 [PGO] distinguish sticky PGO put (#163799)
Summary: put_remote_code_state vs. put_extra_remote_code_state

Test Plan: test_pgo

Differential Revision: D83195687

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163799
Approved by: https://github.com/bobrenjc93
2025-09-25 06:59:25 +00:00
2b6a74abf1 [optim] prevent unintended aliasing in lr_scheduler; update type annotations/docs (#163120)
1. Prevents unintended aliasing of `self._last_lr`/`get_last_lr(...)` with `group["lr"]` when `group["lr"]` is a tensor.
2. Prevents unintended aliasing of `LRScheduler.base_lrs` with the `group["initial_lr"]`s.
3. Updates `test/optim/test_lrscheduler.py` to test tensor LRs.
4. Changes type annotations for `_last_lr`, `get_last_lr()`, `base_lrs`, `get_lr()`, and `_get_closed_form_lr()` from `list[float]` to `list[float | Tensor]`; adds documentation.

Fixes #163103

LR schedulers can behave in unexpected ways when using a tensor LR due to patterns like this:
```python
self._last_lr: list[float] = [group["lr"] for group in self.optimizer.param_groups]
```

This PR adds a helper to address this:
```python
def _param_groups_val_list(optimizer: Optimizer, key: str) -> list[Any]:
    """Create a list containing group[key] for each optimizer param_group.
    Prevents aliasing when group[key] could be a Tensor.
    Raises a KeyError when group[key] does not exist.
    """
    return [
        group[key].clone() if isinstance(group[key], Tensor) else group[key]
        for group in optimizer.param_groups
    ]
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163120
Approved by: https://github.com/janeyx99
2025-09-25 06:58:58 +00:00
ad869c58f5 remove allow-untyped-defs from ./torch/utils/benchmark/op_fuzzers/sparse_unary.py (#163476)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163476
Approved by: https://github.com/ezyang, https://github.com/Skylion007
ghstack dependencies: #163478, #163475, #163471
2025-09-25 06:48:44 +00:00
d5afb9e31a remove allow-untyped-defs from ./torch/ao/quantization/quantizer/utils.py (#163471)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163471
Approved by: https://github.com/Skylion007
ghstack dependencies: #163478, #163475
2025-09-25 06:48:44 +00:00
e7d6ea65ca remove allow-untyped-defs from ./torch/nn/utils/_expanded_weights/embedding_expanded_weights.py (#163475)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163475
Approved by: https://github.com/ezyang, https://github.com/Skylion007
ghstack dependencies: #163478
2025-09-25 06:48:44 +00:00
a6974195da remove allow-untyped-defs from ./torch/fx/experimental/unification/multipledispatch/core.py (#163478)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163478
Approved by: https://github.com/ezyang
2025-09-25 06:48:44 +00:00
a213848703 [Code Clean] Remove deadcodes about Python3.9 [8/N] (#163728)
As the title stated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163728
Approved by: https://github.com/albanD, https://github.com/cyyever
ghstack dependencies: #163626, #163627, #163629, #163643, #163644, #163645, #163646
2025-09-25 05:12:46 +00:00
cde5c9aebd fix pickling for BitwiseFn (#163571)
Summary:
ran into AttributeError: Can't get local object 'make_opaque_bitwise_fn.<locals>.BitwiseFn'

looks like it was fixed for UnaryFn but not BitwiseFn in https://github.com/pytorch/pytorch/pull/138395

Fixes #147841

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163571
Approved by: https://github.com/jamesjwu
2025-09-25 04:52:11 +00:00
783a9dcb6d [6/n] Quantization with min & max bounds support - using fbgemm changes in ATen (#162924)
Summary:
This diff uses the FBGEMM changes made in D78181177 & D81858256 to support using the provided per row min/max values while quantizaing float/half to 8-bit, 4-bit & 2-bit in ATen library.

Please find more context on this here: https://fburl.com/gdoc/yutf32a0

Test Plan:
```
buck test mode/opt caffe2/torch/fb/model_transform/splitting/tests:split_dispatcher_test
```
https://www.internalfb.com/intern/testinfra/testrun/7881299640979446

Please refer to D80905814's test plan for integration testing.

Rollback Plan:

Differential Revision: D81327342

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162924
Approved by: https://github.com/jerryzh168
2025-09-25 02:52:04 +00:00
ad2f7315ca [torchfuzz] print out tensor descriptor as comments in codegen (#163739)
eg.

```
    # Node node_12: tensor_pointwise (depth 6)
    var_node_12 = torch.ops.aten.mul(var_node_13, var_node_34) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_10: tensor_pointwise (depth 7)
    var_node_10 = torch.ops.aten.div(var_node_11, var_node_12) # size=(1,), stride=(1,), dtype=complex128, device=cuda

    # Node node_2: tensor_pointwise (depth 8)
    var_node_2 = torch.ops.aten.div(var_node_3, var_node_10) # size=(1,), stride=(1,), dtype=complex128, device=cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163739
Approved by: https://github.com/pianpwk
ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557, #163558, #163560, #163698
2025-09-25 01:29:29 +00:00
cc660d38ac [CI] Install libuv for Win testing (#163797)
Current working theory why f0078941cf caused a regression, are because Windows CI no longer could be build with distributed, as it could not find libuv
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163797
Approved by: https://github.com/wdvr
2025-09-25 01:10:14 +00:00
00f96dd84d [CI] Run CUDA-13 binary builds on trunk (#163787)
There are numerous other workflows that could be used to catch CUDA-12
build regression (our CI builds are almost identical to CD ones), but not many CUDA-13 builds around, so https://github.com/pytorch/pytorch/issues/163342 are really hard to detect in CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163787
Approved by: https://github.com/atalman, https://github.com/huydhn
2025-09-25 00:58:17 +00:00
77b9aac6c2 Add rule for typechecking maintainers (#161307)
Allow the following people merge rights on type checking configs:
  - @lolpack
  - @maggiemoss
  - @ndmitchell
  - @kinto0

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161307
Approved by: https://github.com/albanD, https://github.com/ezyang
2025-09-25 00:14:31 +00:00
7163dce1e0 [CUDA] Compare major version of the runtime device arch against the built version of the pytorch binary (#161299)
Fixes misleading warning messages when running on sm12x devices using binaries built with sm120.
PyTorch binary built with sm120 is compatible with e.g. sm121, so no need for the warning of incompatibility.

Also allow the 'matched_cuda_warn' message to show when e.g. the user is running a binary built with only sm90 on sm12x, so that the user would be prompted to get a build which supports e.g. sm120.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161299
Approved by: https://github.com/eqy, https://github.com/atalman
2025-09-24 23:59:19 +00:00
4ac4a7351e Shortcut redistribution when num_shards == 1 (#163742)
Redistribution doesn't need collectives when num_shards == 1 on a mesh dimension.
Only placement update is needed, local_tensor remains unchanged.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163742
Approved by: https://github.com/tianyu-l

Co-authored-by: tianyu-l <150487191+tianyu-l@users.noreply.github.com>
2025-09-24 23:49:08 +00:00
65ddd91421 Fix redundant H2D/D2H memcpy in cpp_wrapper by creating scalar tensors on CPU (#160584)
Fixes #160520

Summary:
When running Inductor with cpp_wrapper under a DeviceContext, non-tensor arguments were being wrapped with torch.tensor(arg) without specifying the device.

creating the tensor on the current active device (like CUDA), and later fetching it back to CPU via .item(), causing unnecessary host-device-host memory transfers.

PR fixes issue by explicitly creating scalar tensors on the CPU:

```
input_tensors = [
    arg if isinstance(arg, torch.Tensor) else torch.tensor(arg, device='cpu')
    for arg in args
]
```

impact: inductor, codegen

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160584
Approved by: https://github.com/benjaminglass1, https://github.com/desertfire, https://github.com/mlazos, https://github.com/jeffdaily
2025-09-24 23:40:37 +00:00
8c98aee436 [Inductor] Update DeviceAssert op to behave like store (#163696)
Updated the DeviceAssert operation to match the behavior of Store, it will fixes the issue mentioned in [this PR](https://github.com/pytorch/pytorch/pull/163023) and updated testcases as Elias [suggested](https://github.com/pytorch/pytorch/pull/160677#discussion_r2353834646).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163696
Approved by: https://github.com/mlazos
2025-09-24 23:35:56 +00:00
d927e55498 [torchfuzz] refactor multi_process_fuzzer to be more readable (#163698)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163698
Approved by: https://github.com/pianpwk
ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557, #163558, #163560
2025-09-24 23:32:34 +00:00
754c7e2e88 Update pyrefly configuration file (#163775)
Related to: https://github.com/pytorch/pytorch/issues/163283

This simply updates the existing pyrefly configuration and opts out additional directories. Running `pyrefly check` with this setup will result in ~100 errors reported.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163775
Approved by: https://github.com/ezyang, https://github.com/Skylion007
2025-09-24 23:14:39 +00:00
0ec946a052 [ROCm] Increase binary build timeout to 5 hours (300 minutes) (#163776)
Despite narrowing down the [FBGEMM_GENAI build to gfx942](https://github.com/pytorch/pytorch/pull/162648), the nightly builds still timed out because they [didn't get enough time to finish the post-PyTorch-build steps](https://github.com/pytorch/pytorch/actions/runs/17969771026/job/51109432897).

This PR increases timeout for ROCm builds for both [libtorch ](https://github.com/pytorch/pytorch/actions/runs/17969771026)and [manywheel](https://github.com/pytorch/pytorch/actions/runs/17969771041), because both of those are close to the 4hr mark currently.

This PR is a more ROCm-targeted version of https://github.com/pytorch/pytorch/pull/162880 (which is for release/2.9 branch).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163776
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-24 23:02:08 +00:00
2b1236de61 [dynamo] Fix handling of kwargs in exception constructor (#163390)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163390
Approved by: https://github.com/guilhermeleobas
2025-09-24 22:44:14 +00:00
bc8680c298 Avoid at::alias in the repeat op implementation (#163455)
Avoid `at::alias` in the `repeat` op implementation

## Summary

This PR removed the usage of `at::alias` in the implementation and just `permute`+`reshape` the tensor to fit the specs of the result.
This is a less hacky and a more readable way of implementing the op.
All the new ops we are using are view-only ops, which does not introduce overhead of changing the storage.

## Who want this

We are using `PrivateUse1` and accelerator, but this request to avoid `at::alias` in any op should be general enough for any backend who is using XLA, or who do not have explicit control over the memory allocation on the devices.

## Why we/they need this

As we support TPU, we are overriding some ATen ops by binding them to PrivateUse1.
However, it is not recommended to override the `repeat` op directly as we saw the following in `RegistrationDeclaration.h`.

```
at::Tensor repeat(const at::Tensor & self, c10::SymIntArrayRef repeats); // {"schema": "aten::repeat(Tensor self, SymInt[] repeats) -> Tensor", "dispatch": "True", "default": "True"}
```

We had to reuse the existing implementation of `repeat` to decomposite to other ops.
However, we are unable to support the current implementation, which uses `at::alias`.
It have two tensors share the same storage and modify one of them and return the other assuming it is changed, too.

As, we do not have explicit control over the memory allocation of the tensors using XLA/PJRT.

## Alternatives

We are open to alternative solutions that work for us if this PR is not in favor of the PyTorch community.
For example, we may just bind our version of `repeat` op implementation to both `PrivateUse` and `AutogradPrivateUse1`.
However, to my understanding, this would not work well with torch dynamo and `torch.compile`.

Would you mind guiding us on how to solve this?

Thanks!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163455
Approved by: https://github.com/Skylion007
2025-09-24 22:28:24 +00:00
1495b35d29 Remove Python 3.9 for Triton builds (#163778)
Related to https://github.com/pytorch/pytorch/issues/161167

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163778
Approved by: https://github.com/malfet
2025-09-24 20:19:43 +00:00
90a282504e Add inference_mode hint message to use eval with inference. (#163619)
Fixes #162923

## Test Result

### Before

<img width="985" height="889" alt="image" src="https://github.com/user-attachments/assets/41de5cfa-7b25-4ba4-ade8-a6df745dcb30" />

### After

<img width="913" height="977" alt="image" src="https://github.com/user-attachments/assets/b6c06860-8db3-4b5d-9d46-31ece01fb04d" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163619
Approved by: https://github.com/jbschlosser
2025-09-24 20:07:14 +00:00
0dce2afd44 [ROCm][CI] adjust tf32 tolerance for test_compile_kernel_advanced (#163783)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163783
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-24 19:39:15 +00:00
71eec6a0bf [dist] handle discontiguous allgather/reducescatter inputs (#163712)
Fixes #163483

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163712
Approved by: https://github.com/ezyang, https://github.com/kwen2501
2025-09-24 19:38:44 +00:00
0456b23b77 [AOTI] Add verbose error information for extract file (#163718)
This PR optimize `extract_file` functions:
1. `normalize_path_separator` the dest path for Windows.
2. Add verbose error message:
a. On Linux, add mz_zip error string.
b. On Windows, add mz_zip error string and Windows error code.

For the UT `test_package_user_managed_weight`:
<img width="1910" height="442" alt="image" src="https://github.com/user-attachments/assets/6a63eda1-70ce-40fb-9681-adc955463884" />

It still have issue with error code `32`, checked https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499- and find the verbose is `ERROR_SHARING_VIOLATION`.

It is a little complex to debug, I will continue to working on it in further PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163718
Approved by: https://github.com/desertfire
2025-09-24 19:27:30 +00:00
c414f75c8b [WOQ][Inductor] Enable CUDA coverage for _weight_int8pack_mm (#163461)
Summary:
What: Unskip the CUDA path for test_int8_weight_only_quant in test_torchinductor.py as the kernel was added by #159325.

Why: Confirm CUDA backend for _weight_int8pack_mm is registered.

Test Plan:
```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:test_inductor_cuda
```
https://www.internalfb.com/intern/testinfra/testrun/2533275104869494

Differential Revision: D82926440

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163461
Approved by: https://github.com/jerryzh168
2025-09-24 19:20:38 +00:00
768361e67f Add less warps config to inner reductions (#162447)
Add less warps to ensure proper vectorization + memory coalescing for inner reductions, prefer more work per thread

<img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162447
Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314
2025-09-24 19:09:02 +00:00
9341ede617 Revert to old behaviour of not padding strides if shape or stride is dynamic (#163639)
Differential Revision: D83053287

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163639
Approved by: https://github.com/blaine-rister
2025-09-24 18:31:01 +00:00
4c2c401ccf Record redistribute_local_tensor in DebugMode (#163704)
Explicit redistribute_local_tensor API call could also results in communication, record it!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163704
Approved by: https://github.com/ezyang
2025-09-24 16:11:26 +00:00
5d0f639234 Make Tensor.__dlpack__(stream=None) capture-safe during CUDA Graph capture (#163242)
Many extensions (including pybind helpers) call `Tensor.__dlpack__()` without a stream argument. Before #150217, `stream=None` behaved like “no cross-stream sync” and was safe inside CUDA Graph capture. After #150217, `stream=None` maps to the legacy default stream, adding a cross-stream wait that invalidates capture when running on a non-default stream.

See this example

```
import torch
s = torch.cuda.Stream()
x = torch.randn(8, device="cuda")
g = torch.cuda.CUDAGraph()

with torch.cuda.stream(s):
    with torch.cuda.graph(g):
        _ = x + 1
        cap = x.__dlpack__()
        _ = torch.utils.dlpack.from_dlpack(cap)
```

This PR partially reverts #150217 that stream=None defaults to no sync.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163242
Approved by: https://github.com/ngimel
2025-09-24 16:04:19 +00:00
9d0d98acfe Use cuda nvrtc so file based on cuda version used by torch (#163642)
Fixes https://github.com/pytorch/pytorch/issues/162367

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163642
Approved by: https://github.com/msaroufim
2025-09-24 14:23:39 +00:00
3b73841f43 update test_quantization tests to run weekly (#163077)
Fixes #162854

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163077
Approved by: https://github.com/huydhn
2025-09-24 11:31:11 +00:00
141fc7276e [CD] CUDA 13.0 fix preload logic to include nvidia/cu13/lib/ (#163661)
Preload logic no longer works with CUDA 13.0
See the installation path:
```
ls /home/ubuntu/.venv/lib/python3.10/site-packages/nvidia/cu13/lib/
libcheckpoint.so   libcudadevrt.a      libcufft.so.12   libcufile_rdma.so.1  libcusolver.so.12    libnvJitLink.so.13  libnvperf_target.so            libnvrtc.alt.so.13    libpcsamplingutil.so
libcublas.so.13    libcudart.so.13     libcufftw.so.12  libcupti.so.13       libcusolverMg.so.12  libnvblas.so.13     libnvrtc-builtins.alt.so.13.0  libnvrtc.so.13
libcublasLt.so.13  libcudart_static.a  libcufile.so.0   libcurand.so.10      libcusparse.so.12    libnvperf_host.so   libnvrtc-builtins.so.13.0      libnvtx3interop.so.1

ls /home/ubuntu/.venv/lib/python3.10/site-packages/nvidia/
cu13  cudnn  cusparselt  nccl  nvshmem
```

Test using script from : https://github.com/pytorch/pytorch/issues/162367
```
Kernel test passed!
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163661
Approved by: https://github.com/nWEIdia, https://github.com/tinglvv, https://github.com/Camyll
2025-09-24 11:27:05 +00:00
b66aa1ade1 [ARM] Add test_memory_profiler to aarch64 tests (#145260)
TestMemoryProfilerE2E.test_memory_timeline is failing on AArch64, this fixes it and enables it in the opt-in list of tests for AArch64.

Fixes #142371

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145260
Approved by: https://github.com/fadara01, https://github.com/sraikund16
2025-09-24 09:29:13 +00:00
207f104594 [Triton] [Inductor] Set default configs for Blackwell Matmul Template (#163740)
Summary: Sets the default configs for the Blackwell Matmul Templates.

Test Plan: NFC

Differential Revision: D83116342

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163740
Approved by: https://github.com/jananisriram
2025-09-24 08:17:35 +00:00
3e1b1a30f2 Revert "[inductor] Fix issue with scalar arg handling" (#163737)
This reverts commit a8cd437183142e17ba6fc8d7b5e9dcee462d7904.

See https://github.com/pytorch/pytorch/pull/163481#issuecomment-3326310774

This PR might also cause issues with cudagraphs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163737
Approved by: https://github.com/ezyang
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481, #163520, #163482
2025-09-24 07:33:12 +00:00
2390d34c9b [Code Clean] Remove deadcodes about Python3.9 [7/N] (#163646)
As the title stated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163646
Approved by: https://github.com/jansel
ghstack dependencies: #163626, #163627, #163629, #163643, #163644, #163645
2025-09-24 07:30:50 +00:00
a635505a99 [Code Clean] Remove deadcodes about Python3.9 [6/N] (#163645)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163645
Approved by: https://github.com/albanD
ghstack dependencies: #163626, #163627, #163629, #163643, #163644
2025-09-24 07:30:50 +00:00
6f34cc040f [Code Clean] Remove deadcodes about Python3.9 [5/N] (#163644)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163644
Approved by: https://github.com/jansel
ghstack dependencies: #163626, #163627, #163629, #163643
2025-09-24 07:30:50 +00:00
ec0cd81c38 [Code Clean] Remove deadcodes about Python3.9 [4/N] (#163643)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163643
Approved by: https://github.com/albanD
ghstack dependencies: #163626, #163627, #163629
2025-09-24 07:30:50 +00:00
33aabdd8ac [Code Clean] Remove deadcodes about Python3.9 [3/N] (#163629)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163629
Approved by: https://github.com/albanD
ghstack dependencies: #163626, #163627
2025-09-24 07:30:50 +00:00
0bca77951d [Code Clean] Remove deadcodes about Python3.9 [2/N] (#163627)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163627
Approved by: https://github.com/jansel
ghstack dependencies: #163626
2025-09-24 07:30:50 +00:00
bf0747c6c6 [Code Clean] Remove deadcodes about Python3.9 [1/N] (#163626)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163626
Approved by: https://github.com/Skylion007, https://github.com/albanD
2025-09-24 07:30:50 +00:00
11a231ef52 [c10d] P2P tensors must be dense (#163719)
Fixes #161324
by adding `is_non_overlapping_and_dense` check.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163719
Approved by: https://github.com/ngimel
2025-09-24 06:58:03 +00:00
dad54ca7c0 Add mistral/gpt-oss to benchmarks (#163565)
Potential issues
* gpt-oss-20b is probably too big (I can't run on my devserver)
* Mistral requires HF authentication
* Mistral also takes a while to run the performance checks (need to wait for CI)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163565
Approved by: https://github.com/huydhn
2025-09-24 06:12:36 +00:00
2c5a3d7e60 Delete functorch C extension entirely. (#163340)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163340
Approved by: https://github.com/aorenste, https://github.com/wdvr, https://github.com/albanD, https://github.com/malfet
2025-09-24 06:08:58 +00:00
f68de58c9d [Inductor-FX] Support symbol and dynamic scalar graph inputs and outputs (#163596)
# Problems
This PR fixes a few edge cases that the FX converter missed related to dynamic shapes.

1. Inductor graphs can sometimes take `sympy.Symbol` inputs. We have logic to convert these to FX placeholder nodes. However, this logic did not update the `self.expr_to_proxy` table mapping symbols to proxy nodes. (There was existing logic to do this for `ir.TensorBox` inputs, but not `sympy.Symbol`.) This caused sympy tracing to fail when these symbol inputs were used in other expressions.

2. We lacked codegen for `ShapeAsConstantBuffer`. This IR node is seen when the graph input or output is a scalar computed from dynamic shapes.

# Fixes
a. Update `self.expr_to_proxy` when generating placeholders for `sympy.Symbol` inputs. Change `SymbolBuffer.get_example` to convert the symbol to a `torch.SymInt`, so we can populate `meta["val"]` correctly and use the value in other computations.
b. Support `ShapeAsConstantBuffer` by tracing the sympy expression.
c. Move output generation inside the metadata hook, allowing us to populate `meta["val"]` for the nodes computing `ShapeAsConstantBuffer`.

# Test plan
Added several new CI tests:
 1. `torch.cond` with dynamic shapes. This exposes both issues, as the predicate is a `ShapeAsConstantBuffer` and one of the subgraphs uses a symbol input, due to the closure. Also tests when the parent and subgraphs have different input shapes.
 2. Output dynamic shape scalar. This tests `ShapeAsConstantBuffer` as an output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163596
Approved by: https://github.com/angelayi, https://github.com/jansel
2025-09-24 06:08:14 +00:00
a8e9ed2407 [inductor] turn on loaf (for oss) by default (#162030)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162030
Approved by: https://github.com/eellison, https://github.com/jansel
2025-09-24 06:02:02 +00:00
0390798dad [Triton] [Inductor] Enable Epilogue Subtiling in the blackwell ws template (#163145)
Summary: Enables support for epilogue subtiling in the blackwell ws template. This requires the ability to call `store_output` twice in the same kernel and reuse the same tensor descriptor across allocations.

Test Plan:
Tested with test_max_autotune.py on a Blackwell server.

Rollback Plan:

Differential Revision: D82610077

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163145
Approved by: https://github.com/eellison
2025-09-24 05:38:02 +00:00
124dd364e9 [hop] support local_map + SAC (#163322)
Some ops like local_map hop's deferred mode are not desugared by make_fx, this means that when we apply SAC tags, we will need to define dispatch rules for the SAC torch dispatch modes as pointed out here: https://github.com/pytorch/pytorch/issues/162246#issuecomment-3259176721. This PR adds those rules.

Additionally it fixes a pre-existing issue where we weren't coercing tangent layout (that AOTAutograd typically does) when partitioning the HOP joint.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163322
Approved by: https://github.com/ezyang
2025-09-24 04:57:40 +00:00
20eeb54814 Add api info for torch._C._nn.pyi (#162936)
Fix part of #148404

APis involved are as followed:

- silu
- silu_
- smooth_l1_loss
- soft_margin_loss
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162936
Approved by: https://github.com/FFFrog, https://github.com/ezyang
2025-09-24 04:55:57 +00:00
6f1d962d5b [vllm hash update] update the pinned vllm hash (#163711)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163711
Approved by: https://github.com/pytorchbot
2025-09-24 04:31:37 +00:00
42e9902a0f cd: Move arm64 to linux.arm64.r7g.12xlarge.memory (#163681)
This should reduce the amount of build time we have by a lot by just
throwing more hardware at the problem.

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163681
Approved by: https://github.com/huydhn, https://github.com/atalman, https://github.com/malfet
2025-09-24 04:06:09 +00:00
d746b987d8 [inductor] Fix divmod error in decomp (#163482)
Fixes #163457

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163482
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481, #163520
2025-09-24 02:52:36 +00:00
6fa972796e [inductor] Fix bugs in emulate_precision_casts (#163520)
Fixes #163449
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163520
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481
2025-09-24 02:52:36 +00:00
ca512af3e7 [inductor] Fix issue with scalar arg handling (#163481)
Fixes #163420

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163481
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422
2025-09-24 02:52:36 +00:00
c261c71f3e Simplify _compute_local_shape_and_global_offset and make it SPMD. (#163344)
There is only one substantive change: the branch on
`global_offset[shard_dim] <= local_offset[shard_dim]`
is removed because it is unnecessary: you can always treat the
first shard uniformly with the rest of the shards, because your
global offset is guaranteed to be zero in this case anyway.

I also switch the shard_size case to sym_ite, to make it possible
for LocalTensor to deal with the MPMD-ness here, but it's equivalent
to the old if-then-else.

I tried to rewrite the comments to be more clear what is going on
algorithmically here.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163344
Approved by: https://github.com/albanD, https://github.com/zpcore, https://github.com/tianyu-l
2025-09-24 02:24:09 +00:00
e2ce79e4cc [Flex] Fix silent correctness w/ backpropping grads (#163677)
Fixes #https://github.com/pytorch/pytorch/issues/162228

# Summary

Majority of our tests are only compiling flex-attention in isolation. This means that for fake tensor propagation the input primals and all captured buffers dont do any intermediate computation below autograd.  As a result result the by happen chance match the `require_grad`ness of the eager implementation and this check  will pass. However if score_mod is a the result of some other intermediate fake tensor prop then it is not guaranteed to have accurate req_gradness, which was happening here.

TLDR is that this was a boot and suspenders that was actually harmful and we should just let the joint graph handle creating the correct joint graph

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163677
Approved by: https://github.com/ydwu4
2025-09-24 02:12:19 +00:00
be6c127927 [AOTI] Pass comments from metadata to the autotune block (#163600)
Summary: When generating Triton kernels in the compile-time autotune blocks, it will be useful to generate source information as code comments. Previously we ignore these comments for autotune code blocks because the generated main output code will contain the same information, but it won't work if the generated autotune code crashes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163600
Approved by: https://github.com/yushangdi
2025-09-24 02:01:59 +00:00
1e754d5a80 docs and optional kwargs for full graph capture (#163550)
Test Plan: existing tests

Differential Revision: D82995546

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163550
Approved by: https://github.com/tugsbayasgalan
2025-09-24 01:20:27 +00:00
dc9352938b [Triton] [Inductor] Restrict subprocess autotuning to just Triton (#162688)
Summary: Restricts subprocess benchmarking to only `TritonTemplateCaller`, which is expected by the underlying `target` method. THhis triggered a bug with large K shapes because the decompose k is `SubgraphChoiceCaller`.

Test Plan:
mm autotuning with a large k and `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1`

Rollback Plan:

Differential Revision: D82181924

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162688
Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/mlazos
2025-09-24 01:03:40 +00:00
4535254c28 [3/N] Use std::filesystem in inductor (#163632)
Continued work to use std::fs in inductor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163632
Approved by: https://github.com/Skylion007
2025-09-24 00:23:34 +00:00
eb3fbf5b08 [inductor] in emulate_precision_casts, disable fma fusion in triton (#163073)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163073
Approved by: https://github.com/eellison, https://github.com/jansel
2025-09-23 23:59:17 +00:00
ee75c3d91f Support for amin, amax, and aminmax (#163669)
Support for amin, amax, and aminmax

Test Plan: E2E tests in the stack with benchmark suite passes.

Differential Revision: D83016894

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163669
Approved by: https://github.com/albanD, https://github.com/malfet
2025-09-23 23:45:43 +00:00
f9fa138a39 [BE] Delete all pre py-3.10 checks (#163653)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163653
Approved by: https://github.com/jansel
ghstack dependencies: #163648, #163649
2025-09-23 23:22:53 +00:00
f3f67ff43a Fix warn message (#163578)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163578
Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/atalman, https://github.com/v0i0
2025-09-23 22:46:51 +00:00
6b5ad5f211 [Kineto] Add list of string parsing for profiler (#163593)
Summary:
We add the parsing for list of string. This is needed for AOTInductor
profiling for input information of Triton kernels.

Test Plan:
Included in commit.
test_profiler_op_event_kwargs_list_of_strings

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163593
Approved by: https://github.com/sraikund16
2025-09-23 22:45:49 +00:00
20149080f2 [MPS] Compute offset2bag/bag_size/max_indices in _embedding_bag (#163281)
Part of #162270

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163281
Approved by: https://github.com/malfet
2025-09-23 22:30:48 +00:00
b879ef7c0d [ROCm][CI] skip TestCudaPrimaryCtx.test_set_device_0 (#163693)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163693
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 22:15:10 +00:00
c63e417c79 use reduction hint for aggressive rblock (#163371)
I had been using tiling scores to essentially check if this is an inner reduction. since that is not fully rolled out for dynamic shapes, use reduction hint when they are not available.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163371
Approved by: https://github.com/PaulZhang12
2025-09-23 22:04:22 +00:00
c3d9f089d9 [torchfuzz] introduce multi process fuzzer (#163560)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163560
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557, #163558
2025-09-23 22:00:51 +00:00
29af25844b Less aggressive persistent reduction when it could induce large masking with dynamic shapes (#163365)
As per comment in source code:
```
            # If we are are coalescing on xblock (not ReductionHint.INNER) and this is not a tiny kernel
            # (not ReductionHint.OUTER_TINY), do not use persistent reduction if it induces tile
            # quantization. Peristent reduction forces rblock == rnumel, if the bounds between lower
            # and upper are large, for the lower values we will be masking off large % of read/writes,
            # when we could expand the coalescing xblock instead.
```

For the test case in question, this pr improves perf from 0.8573521325143717 -> 0.043151492193814305 because we were egregiously masking out rblock values (58/64 values).

Differential Revision: [D82853279](https://our.internmc.facebook.com/intern/diff/D82853279)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163365
Approved by: https://github.com/shunting314, https://github.com/PaulZhang12, https://github.com/jansel, https://github.com/v0i0
2025-09-23 21:58:57 +00:00
8c8416b021 Update pytorch.org links in docs/conf.py (#163682)
Update links in conf.py to docs.pytorch.org

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163682
Approved by: https://github.com/sekyondaMeta, https://github.com/albanD
2025-09-23 21:40:11 +00:00
b182365660 [ez] use list initializer syntax in fill_diagonal_ (#163607)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163607
Approved by: https://github.com/Skylion007
ghstack dependencies: #163485
2025-09-23 21:27:12 +00:00
5ca563ea09 symintify fill_diagonol_ (#163485)
Fixes #162271

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163485
Approved by: https://github.com/Skylion007
2025-09-23 21:27:12 +00:00
e671dcc969 Update tests to check for more robust pattern (#163107)
Landing this instead of https://github.com/pytorch/pytorch/pull/162994.

Here is how i think the whole dynamo + frame construction logic work:
1) There is no way to create a frame object in python land as this is created in runtime from cpython. So that's why aot_compile creates FrameInfo this way. (kind of like simulating the runtime) i guess you could write your own very simple eval_frame.c where you can interject the frame construction but we probably don't want that.
2) When there is no wrapper (the old export or aot_compile), we first assign sources by iterating over f_locals which contain both local args and closure variables (this is implementation details of cpython frame construction). So thats why closure variables end up getting LocalSource names as can be shown in this test case (f6ea41ead2/test/export/test_export.py (L1369)). Note that L["self"] here means we are referring to local object self. Important thing to keep in mind here is this self is not actually model self, but the outer self.
3) When we switch to wrapper case, we end up trying to inline the original inner module. When doing so, we need to track all local and closures for this inner module as can be seen here (f6ea41ead2/torch/_dynamo/variables/functions.py (L463)) Here we are not looking into inner frame's f_locals but just directly look at closures. I guess this is because we are one more frame up so there is no access to frame f_locals at this point. And it is probably not good idea to change dynamo's logic here. As a result, i get following error message that is different from old export:
"While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['self']._export_root.forward.__func__.__closure__[1].cell_contents.bank", "L['self']._export_root.forward.__func__.__closure__[1].cell_contents.bank_dict", "L['self']._export_root.forward.__func__.__closure__[0].cell_contents"]"

My initial attempt of solving this was taking inner closures and put them to f_locals for the frame i am constructing which turned out too compilcated because we needed to muck around bytecode instructions as well. So i am thinking we should just update the test to reflect new names and follow up with better post-processing step to have better names.

Differential Revision: [D82582029](https://our.internmc.facebook.com/intern/diff/D82582029)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163107
Approved by: https://github.com/avikchaudhuri
2025-09-23 21:11:48 +00:00
fc84743707 Implement CUDA stream protocol (#163614)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163614
Approved by: https://github.com/eqy
2025-09-23 21:02:08 +00:00
2a9745de3c [multi-kernel] shape-similarity kernel selection (#163090)
Introduces a variant of size-hint multi-kernel, where for novel runtime shapes, instead of performing full benchmarking to determine the optimal kernel, selects one of many kernels pre-generated from multi-kernel hints, based off similarity b/w hint / runtime input & output shapes (L1 distance in log2 space).

Some caveats/changes:
- Size-hint multi-kernel now only kicks in if the kernel has dynamic shapes
- Pre-generation still only does 1-d search over specified hints, e.g. `matmul([s0, s1], [s1, s2])` with size-hints `[64, 256]` only generates 2 kernels - based on tuning shapes ([64, 64], [64, 64]) and ([256, 256], [256, 256]). Extending this to reasonable n-d search (via user API?) is an extension

Benchmarking results, compared to multi-kernel w/ full benchmarking (hints 64, 4096), and compiling with the ground truth hint:
<img width="1902" height="1222" alt="550541081_1088709150049684_6528797079439730237_n" src="https://github.com/user-attachments/assets/056cca48-c16a-4451-9b4a-fa13a7a058a9" />

Full benchmarking doing worse is extremely weird, but we did see similar spikes in #156628

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163090
Approved by: https://github.com/bobrenjc93
2025-09-23 21:00:47 +00:00
22c5e8c17c Add num_store to inductor_meta and use it to scale persistent reduction x block (#162446)
Scale up XBLOCK for contiguous persistent reductions based on rnumel and number of loads + stores

<img width="928" height="656" alt="Screenshot 2025-09-18 at 5 02 57 PM" src="https://github.com/user-attachments/assets/ec3c561f-2a3f-4459-9e14-653715898da3" />

Differential Revision: [](https://our.internmc.facebook.com/intern/diff/)

Differential Revision: [](https://our.internmc.facebook.com/intern/diff/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162446
Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314
ghstack dependencies: #162296
2025-09-23 20:36:39 +00:00
bcb893acb0 [ROCm] Build FBGEMM_GENAI for gfx942 only (#162648)
Fixes build timeouts >4h on libtorch build jobs: 75e7f49f9c/1

Brings back code to narrow down CK compilation targets from 69a25f6888 (diff-ce80f3115ab2f6be5142f0678a1fc92c6b2d7727766ce44f48726c99e720f777)

gfx942 supports fp8

Don't enable gfx950 for now, until more optimizations are in place as per https://github.com/pytorch/pytorch/pull/162648/files#r2369588738

Validation:
[rocm6.4](https://github.com/pytorch/pytorch/actions/runs/17944766350/job/51028483128) and [rocm6.3](https://github.com/pytorch/pytorch/actions/runs/17944766350/job/51028483093) libtorch builds finished within 3.9h.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162648
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 18:55:35 +00:00
8e6b0c71fb [Inductor] Remove no_type_check annotation on properties (#163570)
Some properties with `cache_on_self` were prevously annotated with `no_type_check`, to get around mypy limitations. This PR replaces both annotations with `cache_property_on_self`, to enable type checking.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163570
Approved by: https://github.com/mlazos, https://github.com/PaulZhang12, https://github.com/Skylion007
2025-09-23 18:20:04 +00:00
0696a4b0b8 [EZ] Perma-ignore UP038 (#163649)
As it has been removed, see https://docs.astral.sh/ruff/rules/non-pep604-isinstance/
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163649
Approved by: https://github.com/Skylion007
ghstack dependencies: #163648
2025-09-23 17:58:18 +00:00
ca35dc2fdd [EZ] Fix UP041 violations (#163648)
I.e. use `TimeoutError` instead of `socket.timeout`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163648
Approved by: https://github.com/cyyever, https://github.com/Skylion007
2025-09-23 17:58:18 +00:00
649ceda8a5 [export] handling NamedTuple inputs (#162959)
Fixes #160547
### Summary:
bug
```
    def test_namedtuple(self):
        from collections import namedtuple
        Point = namedtuple('Point', 'x y')

        class M(torch.nn.Module):
            def forward(self, x, y):
                return x + y

        inp = Point(torch.ones(3), torch.ones(3))
        print(M()(*inp))

        # errors
        ep = torch.export.export(M(), inp, strict=False)
        print(ep)

        # succeeds
        ep = torch.export.export(M(), inp, strict=True)
        print(ep)

        # workaround could be to convert namedtuple to a kwarg
        inp_kwargs =  {field: getattr(inp, field) for field in inp._fields}
        ep = torch.export.export(M(), (), inp_kwargs)
        print(ep)
```
FIx :
namedtuple is subclass of tuple
but namedtuple is not expected
So, this change handles named tuple case

I have added 🧪 test case for this as well
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162959
Approved by: https://github.com/angelayi

Co-authored-by: Angela Yi <angelayi@meta.com>
2025-09-23 17:43:50 +00:00
2aadcea05c [ROCm] Improve perf for elementwise broadcast with mixed dtype (#163562)
* Unroll loops manually to hide memory access latency

Co-author: @amd-hhashemi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163562
Approved by: https://github.com/jeffdaily
2025-09-23 17:42:48 +00:00
fde929c8a8 [AOTI] Fix model_package_loader get_cpp_compile_command (#163561)
It should fix AOTI UTs of `test_aot_inductor_package.py`, these cases are failed at `compile_so`.

reproducer:
```cmd
pytest test\inductor\test_aot_inductor_package.py -v -k test_multiple_methods
```
<img width="1262" height="95" alt="image" src="https://github.com/user-attachments/assets/49458536-1cfe-498e-a12a-2bfd8da67a9e" />

Major fix at `get_cpp_compile_command`. The code is aligned to cpp_builder frontend code:  3ef1bef36c/torch/_inductor/cpp_builder.py (L1780-L1790)
3ef1bef36c/torch/_inductor/cpp_builder.py (L1959-L1976)

Fixed on Windows:
<img width="1261" height="89" alt="Image" src="https://github.com/user-attachments/assets/9bf43b11-aac1-4161-a625-e602e313a299" />

Also validated on Linux:
<img width="1039" height="81" alt="Image" src="https://github.com/user-attachments/assets/46063e16-6cf1-4a28-8466-0496871b8619" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163561
Approved by: https://github.com/jansel
2025-09-23 17:38:18 +00:00
134dfbeaef [DCP] DTensor slice dequantization with proper block alignment (#163532)
Summary:
When loading quantized tensors with DTensor slicing, the dequantization process was producing numerically incorrect results due to improper block-to-slice coordinate mapping. The previous implementation calculated block boundaries relative to the sliced tensor dimensions instead of the original full tensor dimensions, causing scale factors to be applied to wrong tensor regions.

This fix addresses the issue by:

1. **Proper coordinate mapping**: Added `_get_slice_to_block_mapping()` to correctly map tensor slices to quantization blocks using global coordinates from the full tensor shape.

3. **Block-aligned dequantization**: Updated `_dequantize_tensor()` to use proper block intersection logic, ensuring scale factors are applied to the correct portions of sliced tensors.

The fix ensures that when DTensor requests a slice of a quantized tensor, the dequantization correctly identifies which quantization blocks intersect with the requested slice and applies the appropriate scale factors to the right tensor regions.

Test Plan:
Tested with DTensor configurations where quantized tensors are sliced across different dimensions. Verified that:
1. Dequantized tensor values are numerically correct
2. Block boundaries are properly calculated relative to full tensor shape
3. Scale factors are applied to correct tensor regions
4. Tensor shapes map is built efficiently using only metadata

Correctness validation using https://github.com/wwwjn/torchtitan/blob/dsv3-sd-test/tests/fsdp_dequantized_load.py
```
{
  "model.layers.0.mlp.gate_proj.weight": {
    "mse": 4.30626645453458e-11,
    "mae": 9.98388827611052e-07,
    "max_abs_diff": 0.0009703934192657471,
    "cosine_similarity": 1.010810375213623,
    "relative_error": 0.001330620958469808,
    "kl_divergence_1_to_2": "6.563401e-08",
    "kl_divergence_2_to_1": "-6.522914e-08",
    "js_divergence": 1.3711876079014476e-10,
    "shape": [
      18432,
      7168
    ],
    "t1_stats": {
      "min": -0.4453125,
      "max": 0.30859375,
      "mean": -1.2592146958922967e-05
    },
    "t2_stats": {
      "min": -0.44529813528060913,
      "max": 0.3085886240005493,
      "mean": -1.2624391274584923e-05
    }
  },
  "model.layers.0.mlp.up_proj.weight": {
    "mse": 2.5534721906361746e-11,
    "mae": 3.118609583907528e-06,
    "max_abs_diff": 0.00047551095485687256,
    "cosine_similarity": 1.038962483406067,
    "relative_error": 0.0013681650161743164,
    "kl_divergence_1_to_2": "-5.8253768e-08",
    "kl_divergence_2_to_1": "5.8747577e-08",
    "js_divergence": NaN,
    "shape": [
      18432,
      7168
    ],
    "t1_stats": {
      "min": -0.228515625,
      "max": 0.2333984375,
      "mean": 8.862222955485777e-08
    },
    "t2_stats": {
      "min": -0.2285017967224121,
      "max": 0.23338991403579712,
      "mean": 8.824501662729745e-08
    }
  },
  "model.layers.0.mlp.down_proj.weight": {
    "mse": 2.2803769289536646e-11,
    "mae": 2.8916260816913564e-06,
    "max_abs_diff": 0.0008973777294158936,
    "cosine_similarity": 1.0376262664794922,
    "relative_error": 0.001346255769021809,
    "kl_divergence_1_to_2": "1.2744896e-07",
    "kl_divergence_2_to_1": "-1.2736885e-07",
    "js_divergence": 5.992362162032805e-11,
    "shape": [
      7168,
      18432
    ],
    "t1_stats": {
      "min": -0.54296875,
      "max": 0.546875,
      "mean": -2.9487239316949854e-07
    },
    "t2_stats": {
      "min": -0.5429964661598206,
      "max": 0.5469087362289429,
      "mean": -2.9507478416235244e-07
    }
  }
}
```

https://www.internalfb.com/intern/testinfra/testrun/3940649985202645

Differential Revision: D82975005

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163532
Approved by: https://github.com/wwwjn
2025-09-23 16:48:16 +00:00
221ac81043 Revert "[precompile] Add option to disable guard check on aot-compiled function. (#163432)"
This reverts commit 539e84e289fa7563032410706ede50a4eaa7a15d.

Reverted https://github.com/pytorch/pytorch/pull/163432 on behalf of https://github.com/Camyll due to breaking internal tests ([comment](https://github.com/pytorch/pytorch/pull/163432#issuecomment-3324757069))
2025-09-23 16:31:30 +00:00
6e5dddba64 Use accelerator API in common_dtensor (#163498)
Fixes #ISSUE_NUMBER

Try to unify the device checking in common_dtensor (testing module) by accelerator API

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163498
Approved by: https://github.com/albanD, https://github.com/H-Huang
2025-09-23 16:30:20 +00:00
ebddbe787a [ROCm][CI] skip test_sparse_triangular_solve (#163651)
need more time to debug, but also need clean CI signal test was unskipped by #163495, but had been skipp on rocm prior

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163651
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 15:55:51 +00:00
5f0c7cb4aa Add B200 smoke test (#159494)
Okay running test_max_autotune locally on B200is horrible read, for now to get something landed I am focusing on test_matmul_cuda.py and test_fp8

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159494
Approved by: https://github.com/nWEIdia, https://github.com/huydhn
ghstack dependencies: #163460, #163537, #163552
2025-09-23 15:45:05 +00:00
b3cf5c79dd Skip on sm100 later since Tests are non determinisitic (#163552)
This is tracked https://github.com/pytorch/pytorch/issues/163462

skipping since we are seeing sporadic errors locally and on CI,
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163552
Approved by: https://github.com/eqy, https://github.com/Skylion007
ghstack dependencies: #163460, #163537
2025-09-23 15:45:05 +00:00
0f674077f4 Large tests failing on bfloat16 (#163537)
# Summary

I ran these tests locally, each 10k Tests takes over 5 mins for an extremely beefy cpu to run. I think that this is overkill feel free to disagree. Also the 1 test I ran that failed earlier up in the stack failed with 1 ulp difference so I think that this is kind of an edgecase on how we do testing (will right up issue for my thoughts later)

``` Shell
==================================================================================================== FAILURES =====================================================================================================
_________________________________________________________ TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublas_cuda_bfloat16 __________________________________________________________
Traceback (most recent call last):
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 58, in testPartExecutor
    yield
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 634, in run
    self._callTestMethod(testMethod)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 589, in _callTestMethod
    if method() is not None:
       ^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
    result = test(self, **param_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1408, in only_fn
    return fn(slf, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 2024, in wrap_fn
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 190, in test_cublas_addmm_reduced_precision
    self.cublas_addmm(size, dtype, True)
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 162, in cublas_addmm
    assert_close_with_ulp(res_cpu, res_cuda, atol=tolerance.atol, rtol=tolerance.rtol)
  File "/home/dev/meta/transformer_nuggets/transformer_nuggets/numerics/__init__.py", line 222, in assert_close_with_ulp
    raise AssertionError("\n".join(error_parts))
AssertionError: Tensor-likes are not close!

Mismatched elements: 425 / 100030002 (0.0%)
Greatest absolute difference: 16 at index (2176, 9325) (up to 10 allowed)
Greatest relative difference: 3984 at index (376, 3754) (up to 0.2 allowed)

============================================================
ULP Analysis of Failures:
============================================================

Total failures: 425
ULP distances: min=-32761, max=32763, mean=-11513.7

Top 10 failures by absolute difference:
  #  | Index                      | Abs Diff    | Rel Diff    | ULP  | Expected     | Actual
----------------------------------------------------------------------------------------------------
   1 | (6923, 1580)               | 1.600000e+01 | 5.390625e-01 |  146 |    29.750000 |    13.750000
   2 | (4677, 420)                | 1.600000e+01 | 6.601562e-01 |   95 |    24.250000 |    40.250000
   3 | (2176, 9325)               | 1.600000e+01 | 6.875000e-01 |  210 |    23.250000 |     7.250000
   4 | (5119, 7865)               | 1.600000e+01 | 1.164062e+00 |  146 |   -13.750000 |   -29.750000
   5 | (3218, 8334)               | 1.600000e+01 | 2.593750e+00 |  236 |     6.156250 |    22.125000
   6 | (5245, 241)                | 1.600000e+01 | 5.468750e-01 |   75 |    29.250000 |    45.250000
   7 | (7666, 6549)               | 1.600000e+01 | 1.640000e+03 | 1376 |    -0.009766 |   -16.000000
   8 | (1663, 1115)               | 1.593750e+01 | 8.375000e+00 | -32427 |     1.898438 |   -14.062500
   9 | (3967, 7708)               | 1.593750e+01 | 1.368750e+01 | -32510 |     1.164062 |   -14.750000
  10 | (2874, 2038)               | 1.593750e+01 | 1.710938e+00 |  181 |     9.312500 |    25.250000

Note: Maximum absolute and relative errors occur at different locations
  Max abs diff location (2176, 9325): 210 ULP
  Max rel diff location (376, 3754): 31868 ULP

To execute this test, run the following from the base repo dir:
    python test/test_matmul_cuda.py TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublas_cuda_bfloat16

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
________________________________________________________ TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublaslt_cuda_bfloat16 _________________________________________________________
Traceback (most recent call last):
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 58, in testPartExecutor
    yield
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 634, in run
    self._callTestMethod(testMethod)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 589, in _callTestMethod
    if method() is not None:
       ^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
    result = test(self, **param_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1408, in only_fn
    return fn(slf, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 2024, in wrap_fn
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 190, in test_cublas_addmm_reduced_precision
    self.cublas_addmm(size, dtype, True)
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 162, in cublas_addmm
    assert_close_with_ulp(res_cpu, res_cuda, atol=tolerance.atol, rtol=tolerance.rtol)
  File "/home/dev/meta/transformer_nuggets/transformer_nuggets/numerics/__init__.py", line 222, in assert_close_with_ulp
    raise AssertionError("\n".join(error_parts))
AssertionError: Tensor-likes are not close!

Mismatched elements: 425 / 100030002 (0.0%)
Greatest absolute difference: 16 at index (2176, 9325) (up to 10 allowed)
Greatest relative difference: 3984 at index (376, 3754) (up to 0.2 allowed)

============================================================
ULP Analysis of Failures:
============================================================

Total failures: 425
ULP distances: min=-32761, max=32763, mean=-11513.7

Top 10 failures by absolute difference:
  #  | Index                      | Abs Diff    | Rel Diff    | ULP  | Expected     | Actual
----------------------------------------------------------------------------------------------------
   1 | (6923, 1580)               | 1.600000e+01 | 5.390625e-01 |  146 |    29.750000 |    13.750000
   2 | (4677, 420)                | 1.600000e+01 | 6.601562e-01 |   95 |    24.250000 |    40.250000
   3 | (2176, 9325)               | 1.600000e+01 | 6.875000e-01 |  210 |    23.250000 |     7.250000
   4 | (5119, 7865)               | 1.600000e+01 | 1.164062e+00 |  146 |   -13.750000 |   -29.750000
   5 | (3218, 8334)               | 1.600000e+01 | 2.593750e+00 |  236 |     6.156250 |    22.125000
   6 | (5245, 241)                | 1.600000e+01 | 5.468750e-01 |   75 |    29.250000 |    45.250000
   7 | (7666, 6549)               | 1.600000e+01 | 1.640000e+03 | 1376 |    -0.009766 |   -16.000000
   8 | (1663, 1115)               | 1.593750e+01 | 8.375000e+00 | -32427 |     1.898438 |   -14.062500
   9 | (3967, 7708)               | 1.593750e+01 | 1.368750e+01 | -32510 |     1.164062 |   -14.750000
  10 | (2874, 2038)               | 1.593750e+01 | 1.710938e+00 |  181 |     9.312500 |    25.250000

Note: Maximum absolute and relative errors occur at different locations
  Max abs diff location (2176, 9325): 210 ULP
  Max rel diff location (376, 3754): 31868 ULP

To execute this test, run the following from the base repo dir:
    python test/test_matmul_cuda.py TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublaslt_cuda_bfloat16

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
```
Okay the bfloat16 are forsure  real cc @eqy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163537
Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/eqy
ghstack dependencies: #163460
2025-09-23 15:45:05 +00:00
720a7b2887 [export] Remove .contiguous() when saving weights to raw bytes (#163587)
Summary: `.contiguous()` will discard the original storage size of the tensor, and could lead to issues during loading.

Test Plan:
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_1D_tensor_slicing
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_2D_tensor_slicing

Differential Revision: D83016250

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163587
Approved by: https://github.com/angelayi
2025-09-23 15:44:56 +00:00
49e7b2f69d [inductor] Fix error from custom CUDA allocators (#163422)
Fixes #163257

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163422
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412
2025-09-23 15:37:45 +00:00
6ef74879f6 [dynamo] Fix TorchFunctionMode handling with get_rng_state (#163412)
Fixes #162624
Fixes #162586

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163412
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393
2025-09-23 15:37:45 +00:00
9c4d9f940b [inductor] Support out_dtype arg to matmul (#163393)
Fixes #163275

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163393
Approved by: https://github.com/eellison, https://github.com/coconutruben
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434
2025-09-23 15:37:38 +00:00
ed84e808f0 [inductor] Freeze layouts in FlexAttention (#163434)
Fixes #163300

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163434
Approved by: https://github.com/drisspg
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419
2025-09-23 15:37:29 +00:00
518c320676 [inductor] libdevice.sqrt => tl.sqrt_rn (#163419)
Fixes #163082

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163419
Approved by: https://github.com/Skylion007, https://github.com/mlazos
ghstack dependencies: #163386, #163398, #163387, #163414, #163415
2025-09-23 15:37:21 +00:00
4264fd34ec Add basic tests for torch.distributed.tensor._utils.compute_global_tensor_info (#162968)
Next PR writes a C++ implementation. Seems good to have tests first.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162968
Approved by: https://github.com/ezyang
ghstack dependencies: #161695, #162508
2025-09-23 14:56:32 +00:00
e05c9c0c84 [ROCm][CI] cudagraph trees ut fixes (#163592)
Fixes #162125.
Fixes #160719.
Fixes #157901.
Fixes #157871.
Fixes #157761.
Fixes #157723.
Fixes #157643.
Fixes #157616.
Fixes #157556.
Fixes #157533.
Fixes #157449.
Fixes #157428.
Fixes #157413.
Fixes #157367.
Fixes #157350.
Fixes #157339.
Fixes #157312.
Fixes #157280.
Fixes #157258.
Fixes #157173.
Fixes #157143.
Fixes #157112.
Fixes #157086.
Fixes #157058.
Fixes #157035.
Fixes #156984.
Fixes #156957.
Fixes #156954.
Fixes #156922.
Fixes #156886.
Fixes #156838.
Fixes #156808.
Fixes #156801.
Fixes #156778.
Fixes #156755.
Fixes #156735.
Fixes #156693.
Fixes #152561.
Fixes #130749.
Fixes #100074.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163592
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 14:45:00 +00:00
aff76c046d Revert "Add fake_impl for _native_multi_head_attention (#163167)"
This reverts commit 27164b6788cab6e6d8095012839e51c958a819d6.

Reverted https://github.com/pytorch/pytorch/pull/163167 on behalf of https://github.com/malfet due to This broke in inductor-cpu-test, see 1a42656d6c/1 ([comment](https://github.com/pytorch/pytorch/pull/163167#issuecomment-3324302026))
2025-09-23 14:36:45 +00:00
1a42656d6c [Flex attention] Fix flex attention head broadcast (#163426)
Fixes part of #163314

In particular bug: **Bug 1: H=None Broadcasting Produces Incorrect Results**

This fixes a shape bug when slicing BlockMask on the Q-tile axis with an int (**mask[:, :, i]**). That form of indexing collapses the Q dimension, so kv_num_blocks/kv_indices lose their expected [B, H, Q_tiles, …] shape. Due to them losing shape, even though the mask_mod remains "interpretable", the kernel’s stride math then reads wrong offsets. Due to this we get silent numerical mismatches compared to regular SDPA, especially when single position decoding/H broadcasting.

The B=None, H=None works case is accidental: with singleton batch/head the kernel maps to index 0 via `sparse_idx_z = off_zq % 1` and `sparse_idx_hq = off_hq % 1` and with a single Q tile `q_start // SPARSE_Q_MULTIPLE = 0`. The missing Q-tiles stride is multiplied by 0, so the bad offset from the collapsed Q axis doesn’t move the pointer and it happens to read the first tile correctly. Once H > 1 or there are multiple Q tiles, those terms become nonzero and the kernel indexes with wrong strides which causes silent error

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163426
Approved by: https://github.com/drisspg
2025-09-23 13:01:51 +00:00
bda9ab291d [inductor] fix as_strided lowering with .view(dtype) inputs (#163319)
FIXES https://github.com/pytorch/pytorch/issues/163286

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163319
Approved by: https://github.com/eellison
2025-09-23 12:50:57 +00:00
3c64b2abab CUDA 13.0 Warning update for supported architectures (#163585)
Please see build script: 8da008678f/.ci/manywheel/build_cuda.sh (L69-L71)

This should display correct warning:
``
Please install PyTorch with a following CUDA
configurations: 12.6 12.8 13.0 following instructions at
https://pytorch.org/get-started/locally/
``
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163585
Approved by: https://github.com/malfet
2025-09-23 11:27:11 +00:00
5d749ceb92 Remove test conditions for CUDA<12 (#163495)
Because it required that CUDA >=12.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163495
Approved by: https://github.com/janeyx99
2025-09-23 07:52:00 +00:00
475 changed files with 10896 additions and 10310 deletions

View File

@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
@ -1004,7 +1004,7 @@ if __name__ == "__main__":
install_condaforge_python(host, args.python_version)
sys.exit(0)
python_version = args.python_version if args.python_version is not None else "3.9"
python_version = args.python_version if args.python_version is not None else "3.10"
if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)

View File

@ -69,7 +69,8 @@ RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
ENV MKLROOT /opt/intel

View File

@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
BASE_TARGET=rocm
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"

View File

@ -40,12 +40,16 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;
*)

View File

@ -82,7 +82,7 @@ case ${image} in
;;
manylinux2_28-builder:rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
TARGET=rocm_final
@ -90,6 +90,10 @@ case ${image} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
manylinux2_28-builder:xpu)

View File

@ -112,8 +112,6 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
@ -134,7 +132,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.22.4; python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
@ -326,8 +324,6 @@ pywavelets==1.7.0 ; python_version >= "3.12"
lxml==5.3.0
#Description: This is a requirement of unittest-xml-reporting
# Python-3.9 binaries
PyGithub==2.3.0
sympy==1.13.3

View File

@ -1,11 +1,11 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
DESIRED_ROCM ?= 6.4
DESIRED_ROCM ?= 7.0
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma-rocm/build_magma.sh
.PHONY: all
all: magma-rocm70
all: magma-rocm64
all: magma-rocm63
@ -24,6 +25,11 @@ clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-rocm70
magma-rocm70: DESIRED_ROCM := 7.0
magma-rocm70:
$(DOCKER_RUN)
.PHONY: magma-rocm64
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:

View File

@ -6,8 +6,8 @@ set -eou pipefail
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Version 2.7.2 + ROCm related updates
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
# Fetch magma sources and verify checksum
pushd ${PACKAGE_DIR}
git clone https://bitbucket.org/icl/magma.git
git clone https://github.com/jeffdaily/magma
pushd magma
git checkout ${MAGMA_VERSION}
popd

View File

@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \
# Build the docs
pushd docs/cpp
time make VERBOSE=1 html -j
time make VERBOSE=1 html
popd
popd

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -59,7 +55,7 @@ test_python_shard() {
setup_test_python
time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
assert_git_not_dirty
}

View File

@ -322,23 +322,29 @@ test_python_shard() {
# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python() {
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
}
test_python_smoke() {
# Smoke tests for H100
# Smoke tests for H100/B200
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -384,6 +390,7 @@ test_dynamo_wrapped_shard() {
--exclude-distributed-tests \
--exclude-torch-export-tests \
--exclude-aot-dispatch-tests \
--exclude-quantization-tests \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose \
--upload-artifacts-while-running
@ -1156,6 +1163,12 @@ test_distributed() {
fi
}
test_quantization() {
echo "Testing quantization"
python test/test_quantization.py
}
test_rpc() {
echo "Testing RPC C++ tests"
# NB: the ending test_rpc must match the current function name for the current
@ -1573,7 +1586,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
@ -1649,6 +1662,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
test_python_legacy_jit
elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
test_quantization
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
# TODO: run some C++ tests
echo "no-op at the moment"
@ -1773,6 +1788,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then

View File

@ -25,7 +25,7 @@ echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
echo Run nn tests
python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
if ERRORLEVEL 1 goto fail
popd

View File

@ -177,8 +177,7 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -22,6 +22,9 @@ self-hosted-runner:
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.aws.h100
- linux.aws.h100.4
- linux.aws.h100.8
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@ -59,7 +59,7 @@ runs:
set -x
# Create new py_tmp env with python-version
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
EXIT_CODE=$?

View File

@ -1 +1 @@
090197034faf3b193c4467cedeb9281e3078892d
9fe4c2bdb9859c14ad7f7479e1db7e01083bada3

View File

@ -525,6 +525,21 @@
- Lint
- pull
- name: typechecking
patterns:
- 'pyrefly.toml'
- 'mypy.ini'
- 'mypy-strict.ini'
approved_by:
- lolpack
- maggiemoss
- ndmitchell
- kinto0
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: superuser
patterns:
- '*'

View File

@ -19,6 +19,7 @@ ciflow_push_tags:
- ciflow/nightly
- ciflow/periodic
- ciflow/periodic-rocm-mi300
- ciflow/quantization-periodic
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/s390
@ -36,6 +37,7 @@ ciflow_push_tags:
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
- ciflow/b200
retryable_workflows:
- pull
- trunk

View File

@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["12.8"],
arches=["13.0"],
python_versions=["3.12"],
),
branches="main",

View File

@ -71,12 +71,15 @@ jobs:
with:!{{ upload.binary_env_as_input(config) }}
{%- if "aarch64" in build_environment %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "s390x" in build_environment %}
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
{%- elif config["gpu_arch_type"] == "rocm" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.24xlarge.ephemeral

View File

@ -67,7 +67,7 @@ jobs:
# an OOM issue when running the job, so this upgrades the runner from 4xlarge
# to the next available tier of 12xlarge. So much memory just to generate cpp
# doc
runner: ${{ inputs.runner_prefix }}linux.12xlarge
runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
# TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
# Let's try to figure out how this can be improved
timeout-minutes: 360

View File

@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@ -54,6 +54,7 @@ jobs:
{ tag: "cuda12.6" },
{ tag: "rocm6.3" },
{ tag: "rocm6.4" },
{ tag: "rocm7.0" },
{ tag: "cpu" },
]
steps:

View File

@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
rocm_version: ["64", "63"]
rocm_version: ["70", "64", "63"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -54,6 +54,7 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },

View File

@ -50,7 +50,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
@ -108,9 +108,6 @@ jobs:
# Determine python executable for given version
case $PY_VERS in
3.9)
PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
;;
3.10)
PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
;;
@ -194,7 +191,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["xpu"]
timeout-minutes: 40
env:

View File

@ -35,6 +35,7 @@ jobs:
contents: write
outputs:
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
@ -53,8 +54,12 @@ jobs:
tag_or_branch="${tag_or_branch#refs/heads/}"
# replace directory separators with _ in branch name
tag_or_branch="${tag_or_branch//\//_}"
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
{
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
} >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Copy docs requirements for inclusion
@ -64,30 +69,47 @@ jobs:
cp .ci/docker/requirements-docs.txt docs/requirements.txt
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
- name: Create PEP 517 compatible source distribution
run: |
pip install build==1.2.2.post1 || exit 1
python -m build --sdist || exit 1
cd dist || exit 1
- name: Upload source distribution for release
if: ${{ github.event_name == 'release' }}
uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
with:
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
files: |
${{ env.PT_RELEASE_FILE }}
${{ env.PT_PEP517_RELEASE_FILE }}
- name: Upload source distribution to GHA artifacts # for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
- name: Upload PEP 517 source distribution to GHA artifacts # for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_PEP517_RELEASE_FILE }}
path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
- name: Set output
id: release_name
run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
run: |
{
echo "pt_release_name=${{ env.PT_RELEASE_FILE }}";
echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}";
} >> "${GITHUB_OUTPUT}"
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
@ -103,6 +125,9 @@ jobs:
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_release_name }}
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_pep517_release_name }}
- name: Configure AWS credentials(PyTorch account)
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
@ -113,7 +138,9 @@ jobs:
s3-bucket: pytorch
s3-prefix: source_code/test
if-no-files-found: warn
path: ${{ needs.release.outputs.pt_release_name }}
path: |
${{ needs.release.outputs.pt_release_name }}
${{ needs.release.outputs.pt_pep517_release_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}

View File

@ -62,7 +62,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -128,7 +128,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -174,7 +174,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -220,7 +220,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
@ -265,7 +265,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -331,7 +331,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -377,7 +377,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -423,7 +423,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
@ -468,7 +468,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -534,7 +534,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -580,7 +580,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -626,7 +626,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
@ -671,7 +671,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -737,7 +737,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -783,7 +783,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -829,7 +829,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
@ -874,7 +874,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -940,7 +940,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -986,7 +986,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -1032,7 +1032,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
@ -1077,7 +1077,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -1143,7 +1143,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -1189,7 +1189,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -1235,7 +1235,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
@ -1280,7 +1280,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -1346,7 +1346,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
@ -1392,7 +1392,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
@ -1438,7 +1438,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
runs_on: linux.arm64.r7g.12xlarge.memory
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel

View File

@ -333,6 +333,7 @@ jobs:
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_3-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
@ -447,6 +448,7 @@ jobs:
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:

View File

@ -42,7 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_12-cuda12_8-build:
manywheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -51,22 +51,22 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
manywheel-py3_12-cuda13_0-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_8-build
- manywheel-py3_12-cuda13_0-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -74,13 +74,13 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner

View File

@ -323,6 +323,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -434,6 +435,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_4
build_environment: linux-binary-manywheel
secrets:
@ -915,6 +917,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_11-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -1026,6 +1029,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_11-rocm6_4
build_environment: linux-binary-manywheel
secrets:
@ -1507,6 +1511,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_12-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -1618,6 +1623,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_12-rocm6_4
build_environment: linux-binary-manywheel
secrets:
@ -2099,6 +2105,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_13-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -2210,6 +2217,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_13-rocm6_4
build_environment: linux-binary-manywheel
secrets:
@ -2691,6 +2699,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_13t-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -2802,6 +2811,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_13t-rocm6_4
build_environment: linux-binary-manywheel
secrets:
@ -3283,6 +3293,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_14-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -3394,6 +3405,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_14-rocm6_4
build_environment: linux-binary-manywheel
secrets:
@ -3875,6 +3887,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_14t-rocm6_3
build_environment: linux-binary-manywheel
secrets:
@ -3986,6 +3999,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_14t-rocm6_4
build_environment: linux-binary-manywheel
secrets:

View File

@ -60,6 +60,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:

View File

@ -127,6 +127,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -0,0 +1,54 @@
name: quantization-periodic
on:
push:
tags:
- ciflow/quantization-periodic/*
workflow_dispatch:
schedule:
# run weekly
- cron: "45 0 * * 0"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
name: get-default-label-prefix
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-quantization-build:
name: periodic-quantization-build
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.9'
test-matrix: |
{ include: [
{ config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
periodic-test-quantization:
name: periodic-test-quantization
uses: ./.github/workflows/_linux-test.yml
needs: periodic-quantization-build
with:
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
secrets: inherit

View File

@ -140,6 +140,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

76
.github/workflows/test-b200.yml vendored Normal file
View File

@ -0,0 +1,76 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

1
.gitignore vendored
View File

@ -82,6 +82,7 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated

View File

@ -49,7 +49,7 @@ init_command = [
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
]
@ -153,7 +153,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'mypy==1.16.0',
@ -1453,7 +1453,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.9', # sync with RUFF
'ruff==0.13.1', # sync with RUFF
]
is_formatter = true
@ -1587,7 +1587,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.12.9', # sync with PYFMT
'ruff==0.13.1', # sync with PYFMT
]
is_formatter = true

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -90,6 +91,8 @@ generated_cpu_cpp = [
"aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/VmapGeneratedPlumbing.h",
"aten/src/ATen/ViewMetaClasses.h",
"aten/src/ATen/ViewMetaClasses.cpp",
"aten/src/ATen/core/aten_interned_strings.h",
"aten/src/ATen/core/enum_tag.h",
"aten/src/ATen/core/TensorBody.h",
@ -810,7 +813,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ libtorch_python_distributed_sources
+ if_cuda(libtorch_python_distributed_sources)
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",
@ -832,36 +835,6 @@ pybind_extension(
],
)
cc_library(
name = "functorch",
hdrs = glob([
"functorch/csrc/dim/*.h",
]),
srcs = glob([
"functorch/csrc/dim/*.cpp",
]),
deps = [
":aten_nvrtc",
":torch_python",
"@pybind11",
],
)
pybind_extension(
name = "functorch/_C",
copts=[
"-DTORCH_EXTENSION_NAME=_C"
],
srcs = [
"functorch/csrc/init_dim_only.cpp",
],
deps = [
":functorch",
":torch_python",
":aten_nvrtc",
],
)
cc_binary(
name = "torch/bin/torch_shm_manager",
srcs = [
@ -902,7 +875,6 @@ py_library(
],
data = [
":torch/_C.so",
":functorch/_C.so",
":torch/bin/torch_shm_manager",
],
)
@ -1105,6 +1077,7 @@ test_suite(
"aten/src/ATen/templates/LazyNonNativeIr.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
"aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
"aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/tags.yaml",
"aten/src/ATen/native/ts_native_functions.yaml",

View File

@ -180,9 +180,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -438,10 +437,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@ -1390,10 +1390,6 @@ endif()
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()
if(BUILD_FUNCTORCH)
add_subdirectory(functorch)
endif()
# Parse custom debug info
if(DEFINED USE_CUSTOM_DEBINFO)
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")

View File

@ -1,20 +1,61 @@
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
# Include source files in SDist
include CMakeLists.txt
include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include BUCK BUCK.*
include requirements*.txt
include version.txt
include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
# Include individual top-level files
include CITATION.cff
include CODEOWNERS
include Dockerfile
include LICENSE
include MANIFEST.in
include Makefile
include NOTICE
include .bc-linter.yml
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .editorconfig
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include codex_setup.sh
include docker.Makefile
include pyrefly.toml
include ubsan.supp
# Include bazel and BUCK related files
include BUILD.bazel BUCK.oss
include WORKSPACE
include *.bzl
include .bazelignore .bazelrc .bazelversion
# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files at the moment, comprises
# version.txt, CMakeLists.txt, requirements.txt
include *.txt
# Include ctags configuration
include .ctags.d/*.ctags
# Include subfolders completely
graft .devcontainer
graft .vscode
graft android
graft aten
graft benchmarks
graft binaries
graft c10
graft caffe2
graft cmake
graft docs
graft functorch
graft ios
graft mypy_plugins
graft scripts
graft test
graft third_party
graft tools
graft torch
@ -22,29 +63,37 @@ graft torchgen
# FIXME: torch-xla build during codegen will fail if include this file in wheel
exclude torchgen/BUILD.bazel
# Misc files and directories in SDist
include *.md
include CITATION.cff
include LICENSE NOTICE
include mypy*.ini
graft benchmarks
graft docs
graft mypy_plugins
graft scripts
# The following exclusions omit parts from third-party dependencies that
# contain invalid symlinks[1] and that are not needed for pytorch, such as
# bindings for unused languages
prune third_party/flatbuffers/java
prune third_party/flatbuffers/kotlin
prune third_party/ittapi/rust
prune third_party/nccl/pkg/debian
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-*
# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
# Omit autogenerated code
prune torchgen/packaged
# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib
global-exclude *.py[cod] *.swp *~
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml
# Misc files needed for custom setuptools command
include .gitignore
include .gitmodules
# Include test suites in SDist
graft test
include pytest.ini
include .coveragerc
# [1] Invalid symlinks for the purposes of Python source distributions are,
# according to the source distribution format[2] links pointing outside the
# destination directory or links with a `..` component, which is those of
# concern here.
# Prune generated/compiled files
prune torchgen/packaged
prune */__pycache__
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
prune */.git
global-exclude .git *~ *.swp
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features

View File

@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
- Python 3.9 or later
- Python 3.10 or later
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool (Windows only)

View File

@ -317,10 +317,20 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@ -468,7 +468,7 @@ inline Tensor _sum_to(
// if we assume no reduction due to unbacked we ensure that at runtime.
TORCH_MAYBE_SYM_CHECK(
sym_eq(shape[i - leading_dims], sizes[i]),
"non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:",
"non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:",
shape[i - leading_dims],
", ",
sizes[i])

View File

@ -9,11 +9,6 @@
namespace at::functionalization {
ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
if (out_idx == this->out_index) return *this;
return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
}
// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
if (update.view_metas.empty()) return t;
if (update.view_metas.empty()) { return t; }
std::vector<at::Tensor> tmp_values({base});
tmp_values.reserve(update.view_metas.size());
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor.
// If need to, we could probably apply this optimization and only bother computing tmp_values
@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
tmp_values.push_back(std::move(next_view));
}
for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp.
t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
t = update.view_metas[i]->reverse(tmp_values[i], t);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t;
@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");
if (metas.size() > 1) {
for (size_t i = 1; i < metas.size(); ++i) {
// Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "

View File

@ -8,44 +8,89 @@ namespace at::functionalization {
// See Note [Functionalization Pass In Core]
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying)
/// scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to
/// handle.
ViewOrScatterInverse,
};
#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
static const char* name() { \
return #TYPE; \
}
#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
using SerializableTuple = std::tuple<__VA_ARGS__>
// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta on b that looks
// like:
// the functionalization pass will generate and store a ViewMeta specialization
// for `view1` operation on b that looks like:
//
// ViewMeta(
// [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// return base.view1(...);
// },
// [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
// int64_t mutated_view_idx) -> at::Tensor {
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// struct TORCH_API view1_ViewMeta : public ViewMeta {
// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
// bool /* reapply_views */,
// const std::vector<int64_t>&);
//
// view1_ViewMeta(const SerializableTuple& tpl)
// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
//
// view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
// : ViewMeta(/*has_symbolic_inputs=*/false),
// reapply_views(reapply_views),
// size(size) {}
//
// Tensor forward(const Tensor& base) override {
// return base.view1(...);
// }
//
// The forward_fn lambda describes how to replay view1 on a tensor.
// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
//
// The reverse_fn lambda describes how, given a tensor that is already a view,
// SerializableTuple to_serializable_tuple() {
// return std::make_tuple(reapply_views, size);
// }
//
// bool reapply_views;
// std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta {
ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool has_symbolic_inputs,
bool is_multi_output = false,
bool is_as_strided = false,
int64_t out_idx = 0)
: forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx),
: out_index(out_idx),
is_multi_output(is_multi_output),
is_as_strided(is_as_strided),
has_symbolic_inputs(has_symbolic_inputs) {}
std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
virtual ~ViewMeta() = default;
virtual Tensor forward(const Tensor& base) = 0;
virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
// See Note [out_idx in ViewMeta]
int64_t out_index;
@ -57,10 +102,17 @@ struct ViewMeta {
// Tells us if this view operation has any symbolic inputs
bool has_symbolic_inputs;
// Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// Returns a new ViewMeta with the same forward/reverse
// functions, but a new out index.
ViewMeta to_out_idx(int64_t out_idx);
//
// This method should be implemented by those `ViewMeta` that have more than
// one output.
virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"ViewMeta::to_out_index not implemented. ",
"Likely because there's only one output.");
}
};
// FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const at::Tensor new_val;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const std::vector<ViewMeta> view_metas;
const std::vector<std::shared_ptr<ViewMeta>> view_metas;
};
explicit FunctionalStorageImpl(const Tensor& value);
void add_update(
const Tensor& updated_val,
const std::vector<ViewMeta>& view_metas);
const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
bool apply_updates();
const Tensor& base() {
return base_;

View File

@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const {
// - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_)
{
FunctionalTensorWrapper::FunctionalTensorWrapper(
const Tensor& view_value,
const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()),
value_(view_value),
is_multi_output_view_(
base->is_multi_output_view_ || meta->is_multi_output),
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
set_constructor_metadata();
@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
view_metas_ = base->view_metas_; // copy
}
view_metas_.push_back(meta);
maybe_mark_symbolic(meta);
maybe_mark_symbolic(meta.get());
storage_ = base->storage_; // alias this tensor's storage with the base tensor's
}
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
}
@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
}
// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
view_metas_.push_back(meta);
// Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta);
maybe_mark_symbolic(meta.get());
// Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
at::AutoDispatchSkipFunctionalize guard;
value_ = meta.forward_fn(value_, meta.out_index);
value_ = meta->forward(value_);
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
}
@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() {
regenerate_from_base();
}
Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
auto t = base;
// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}
return t;
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
return view_metas_;
}
void FunctionalTensorWrapper::regenerate_from_base() {
@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
auto t = storage_impl->base();
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
t = apply_view_metas(t);
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
replace_(t, /*from_lazy_regenerate=*/true);
@ -727,11 +721,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
}
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) return false;
if (t_list.empty()) { return false; }
auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) {
auto const & e= t_list[i];
if (!e.has_value() || !e->defined()) continue;
if (!e.has_value() || !e->defined()) { continue; }
if (isFunctionalTensor(e)) {
++functional_count;
}
@ -741,10 +735,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
template <typename T>
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
if (list.size() == 0) return false;
if (list.size() == 0) { return false; }
auto functional_count = 0;
for (const auto& tensor : list) {
if (!tensor.defined()) continue;
if (!tensor.defined()) { continue; }
if (isFunctionalTensor(tensor)) {
++functional_count;
}
@ -762,20 +756,28 @@ void freeze_functional_tensor(const Tensor& tensor) {
functional_base_impl->freeze_storage();
}
Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
Tensor create_functional_tensor_with_view_meta(
const at::Tensor& view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
auto meta_ = meta;
if (out_idx != 0) {
// Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta = meta.to_out_idx(out_idx);
meta_ = meta->to_out_index(out_idx);
}
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
}
std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta) {
std::vector<Tensor> outputs(view_to_wrap.size());
int64_t i = 0;
for (const auto& tensor : view_to_wrap) {
@ -785,12 +787,22 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_
return outputs;
}
void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta);
}
Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
Tensor r = base;
for (auto& vm : sequence) {
r = vm->forward(r);
}
return r;
}
// Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementations with meta tensors.
@ -884,7 +896,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
const auto& ivalue = returns[idx];
if (ivalue.isTensor()) {
const auto& t = ivalue.toTensor();
if (!t.defined()) continue;
if (!t.defined()) { continue; }
at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;

View File

@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper(
const Tensor& view_value,
const FunctionalTensorWrapper* base,
const functionalization::ViewMeta& meta);
const std::shared_ptr<functionalization::ViewMeta>& meta);
// Get the underlying, actual tensor, that doesn't know anything about
// functionalization.
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
->are_all_mutations_under_no_grad_or_inference_mode();
}
void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
}
bool is_symbolic() const {
return is_symbolic_;
}
// Runs the forward_fn of every ViewMeta collected in the current instance
// to some other base.
Tensor apply_view_metas(const Tensor& base);
// Retrieves the ViewMeta sequence of this tensor.
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
const;
// Sync's the underlying tensor with its alias, if it's out of date. This
// involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// from the base tensor. This method is used by inplace-view ops like
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the
// tensor by replaying the views off of the alias.
void mutate_view_meta(const at::functionalization::ViewMeta& meta);
void mutate_view_meta(
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
// Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other);
@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
bool is_symbolic_ = false;
size_t generation_ = 0;
std::vector<at::functionalization::ViewMeta> view_metas_;
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;
protected:
static void copy_tensor_metadata(
@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct(
Tensor create_functional_tensor_with_view_meta(
const Tensor& view_to_wrap,
const Tensor& base,
functionalization::ViewMeta meta,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap,
const Tensor& base,
const functionalization::ViewMeta& meta);
const std::shared_ptr<functionalization::ViewMeta>& meta);
void mutate_view_meta(
const Tensor& self,
const functionalization::ViewMeta& meta);
const std::shared_ptr<functionalization::ViewMeta>& meta);
TORCH_API Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset(

View File

@ -1,3 +1,5 @@
#include <ATen/FunctionalizeFallbackKernel.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
@ -7,7 +9,6 @@
#include <torch/library.h>
#include <c10/util/irange.h>
#include <c10/util/strides.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/ATen.h>
@ -28,6 +29,31 @@
#include <utility>
#endif
namespace at::functionalization {
Tensor resize__ViewMeta::forward(const Tensor& base) {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
}
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return base.as_strided_scatter(
mutated_view, size, c10::contiguous_strides(size));
}
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
return at::_unsafe_view_symint(base, size);
}
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
}
} // namespace at::functionalization
namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
const auto& schema = op.schema();
@ -106,7 +132,9 @@ namespace {
const auto& ivalue = returns[idx];
if (ivalue.isTensor() && should_wrap_outputs) {
const auto& t = ivalue.toTensor();
if (!t.defined()) continue;
if (!t.defined()) {
continue;
}
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList() && should_wrap_outputs) {
@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
// The output of resizing is equivalent to taking a slice of a larger tensor.
// We have to emulate this "slicing" with an as_strided call.
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
},
/*has_symbolic_inputs=*/false
);
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
reapply_views, size.vec());
at::functionalization::impl::mutate_view_meta(self, view_meta);
return self;
}
@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
tmp_output = at::_unsafe_view_symint(self_, size);
}
bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(base, size);
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
},
/*has_symbolic_inputs=*/has_symbolic_inputs
);
bool has_symbolic_inputs = std::any_of(
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
auto view_meta =
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
has_symbolic_inputs, size.vec());
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
// See Note [Propagating strides in the functionalization pass]

View File

@ -0,0 +1,58 @@
#pragma once
#include <ATen/FunctionalStorageImpl.h>
namespace at::functionalization {
// `ViewMeta` implementation for `resize_` operation.
struct TORCH_API resize__ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* reapply_views */,
const std::vector<int64_t>&);
resize__ViewMeta(const SerializableTuple& tpl)
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
: ViewMeta(/*has_symbolic_inputs=*/false),
reapply_views(reapply_views),
size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(reapply_views, size);
}
bool reapply_views;
std::vector<int64_t> size;
};
// `ViewMeta` implementation for `_unsafe_view` operation.
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* has_symbolic_inputs */,
const std::vector<c10::SymInt>&);
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
_unsafe_view_ViewMeta(
bool has_symbolic_inputs,
const std::vector<c10::SymInt>& size)
: ViewMeta(has_symbolic_inputs), size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(has_symbolic_inputs, size);
}
std::vector<c10::SymInt> size;
};
} // namespace at::functionalization

View File

@ -1,32 +1,22 @@
#include <ATen/core/PythonOpRegistrationTrampoline.h>
#include <c10/core/impl/PyInterpreterHooks.h>
// TODO: delete this
namespace at::impl {
// The strategy is that all python interpreters attempt to register themselves
// as the main interpreter, but only one wins. Only that interpreter is
// allowed to interact with the C++ dispatcher. Furthermore, when we execute
// logic on that interpreter, we do so hermetically, never setting pyobj field
// on Tensor.
std::atomic<c10::impl::PyInterpreter*>
PythonOpRegistrationTrampoline::interpreter_{nullptr};
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr;
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
return PythonOpRegistrationTrampoline::interpreter_.load();
return c10::impl::getGlobalPyInterpreter();
}
bool PythonOpRegistrationTrampoline::registerInterpreter(
c10::impl::PyInterpreter* interp) {
c10::impl::PyInterpreter* expected = nullptr;
interpreter_.compare_exchange_strong(expected, interp);
if (expected != nullptr) {
// This is the second (or later) Python interpreter, which means we need
// non-trivial hermetic PyObject TLS
c10::impl::HermeticPyObjectTLS::init_state();
if (interpreter_ != nullptr) {
return false;
} else {
return true;
}
interpreter_ = interp;
return true;
}
} // namespace at::impl

View File

@ -2,19 +2,21 @@
#include <ATen/core/dispatch/Dispatcher.h>
// TODO: this can probably live in c10
// TODO: We can get rid of this
namespace at::impl {
// Manages the single Python interpreter instance for PyTorch.
class TORCH_API PythonOpRegistrationTrampoline final {
static std::atomic<c10::impl::PyInterpreter*> interpreter_;
static c10::impl::PyInterpreter* interpreter_;
public:
// Returns true if you successfully registered yourself (that means
// you are in the hot seat for doing the operator registrations!)
// Register the Python interpreter. Returns true on first registration,
// false if an interpreter was already registered.
static bool registerInterpreter(c10::impl::PyInterpreter*);
// Returns the registered interpreter via the global PyInterpreter hooks.
// Returns nullptr if no interpreter has been registered yet.
static c10::impl::PyInterpreter* getInterpreter();
};

View File

@ -149,5 +149,105 @@ static inline void pack_vnni4(
#endif
}
// This is a helper function for transpose_pack_vnni4
// Transform a [4, 16] block (with incontiguous output)
// Src:
// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16
// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16
// Dst:
// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4
// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8
// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12
// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_vnni4_pad_4x16_block(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t ld_dst,
int krem = 4) {
#if defined(CPU_CAPABILITY_AVX512)
__m128i r[4];
for (int i = 0; i < krem; ++i) {
r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * ld_src));
}
for (int i = krem; i < 4; ++i) {
r[i] = _mm_setzero_si128();
}
// Transpose 4x16 bytes using unpack and shuffle
__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);
__m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);
__m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);
__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);
__m128i r0 = _mm_unpacklo_epi64(t0, t2);
__m128i r1 = _mm_unpackhi_epi64(t0, t2);
__m128i r2 = _mm_unpacklo_epi64(t1, t3);
__m128i r3 = _mm_unpackhi_epi64(t1, t3);
// Store output
if (krem == 4) {
// normal case
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3);
} else {
// masked case
__mmask16 mask = (1ULL << (krem * 4)) - 1;
_mm_mask_storeu_epi8(dst, mask, r0);
_mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1);
_mm_mask_storeu_epi8(
reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2);
_mm_mask_storeu_epi8(
reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3);
}
#else
TORCH_CHECK(
false,
"transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported")
#endif
}
// Do the transpose packing fusion with VNNI4
// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8)
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_pack_vnni4(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t K,
int64_t N) {
#if defined(CPU_CAPABILITY_AVX512)
TORCH_CHECK(
N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4");
int64_t bk = 0;
int64_t _K = K / 4 * 4;
for (; bk < _K; bk += 4) {
int64_t bn = 0;
for (; bn < N; bn += 16) {
transpose_vnni4_pad_4x16_block(
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4);
}
}
// Handle leftover K rows (< 4)
if (K % 4 != 0) {
int krem = K - bk;
int64_t bn = 0;
for (; bn < N; bn += 16) {
transpose_vnni4_pad_4x16_block(
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem);
}
}
#else
TORCH_CHECK(
false, "transpose_pack_vnni4 is only supported when AVX-512 is supported")
#endif
}
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -281,6 +281,9 @@ bool CUDAHooks::compiledWithMIOpen() const {
bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
// NOTE: extra parenthesis around numbers disable clang warnings about
// dead code
return true;
@ -291,6 +294,9 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores
if (prop->major >= 7) {
@ -305,6 +311,9 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores
if (prop->major >= 8) {

View File

@ -465,8 +465,11 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor
return false;
}
auto fmt = input.suggest_memory_format();
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
auto is_channel_last = [](const at::Tensor& t) {
auto fmt = t.suggest_memory_format();
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
};
return is_channel_last(input) || is_channel_last(weight);
}
} // namespace at::native

View File

@ -32,10 +32,6 @@
#include <ATen/native/mkldnn/Utils.h>
#endif
#ifdef USE_MPS
#include <ATen/mps/MPSDevice.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
@ -1429,12 +1425,8 @@ static inline at::MemoryFormat determine_backend_memory_format(
}
break;
case ConvBackend::Mps:
case ConvBackend::MpsTranspose:
if (mps_conv_use_channels_last(input, weight)) {
#ifdef USE_MPS
if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) {
break;
}
#endif
backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
}
break;

View File

@ -9,6 +9,7 @@
#include <ATen/native/TransposeType.h>
#include <ATen/native/Unfold3d.h>
#include <c10/util/irange.h>
#include <c10/util/safe_numerics.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -174,6 +175,23 @@ static inline void slow_conv3d_shape_check(
const int64_t input_height = input.size(dim_height);
const int64_t input_width = input.size(dim_width);
constexpr int64_t MAX_SAFE_PAD = (1LL << 61);
TORCH_CHECK_VALUE(
pad_height <= MAX_SAFE_PAD,
"Padding height too large: pad_height=",
pad_height);
TORCH_CHECK_VALUE(
pad_width <= MAX_SAFE_PAD,
"Padding width too large: pad_width=",
pad_width);
TORCH_CHECK_VALUE(
pad_depth <= MAX_SAFE_PAD,
"Padding depth too large: pad_depth=",
pad_depth);
const int64_t exact_input_depth = input_depth + 2 * pad_depth;
const int64_t exact_input_height = input_height + 2 * pad_height;
const int64_t exact_input_width = input_width + 2 * pad_width;
@ -221,6 +239,14 @@ static inline void slow_conv3d_shape_check(
output_width,
"). Output size is too small");
uint64_t kernel_product;
TORCH_CHECK(
!c10::mul_overflows(kernel_height, kernel_width, &kernel_product),
"Kernel height x width product is too large: kernel_height=",
kernel_height,
", kernel_width=",
kernel_width);
if (weight.defined()) {
int64_t n_input_plane = weight.size(1);
if (weight.dim() == 2) {

View File

@ -97,43 +97,38 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
int64_t nDims = self.dim();
TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");
int64_t height = self.size(0);
int64_t width = self.size(1);
auto height = self.sym_size(0);
auto width = self.sym_size(1);
if (nDims > 2) {
int64_t dim1 = height;
for (const auto i : c10::irange(1, nDims)) {
if (self.size(i) != dim1) {
if (self.sym_size(i) != height) {
TORCH_CHECK(false, "all dimensions of input must be of equal length");
}
}
}
int64_t storage_offset = self.storage_offset();
std::vector<int64_t> sizes;
std::vector<int64_t> strides;
int64_t size = std::min(height, width);
auto storage_offset = self.sym_storage_offset();
auto size = std::min(height, width);
int64_t stride = 0;
for (const auto i : c10::irange(nDims)) {
stride += self.stride(i);
}
strides.push_back(stride);
sizes.push_back(size);
std::vector<SymInt> strides{stride};
std::vector<SymInt> sizes{size};
auto main_diag = self.as_strided(sizes, strides, storage_offset);
auto main_diag = self.as_strided_symint(sizes, strides, storage_offset);
main_diag.fill_(fill_value);
if (wrap && nDims == 2 && height > width + 1) {
std::vector<int64_t> wrap_sizes;
auto step = width + 1;
auto wrap_size = ((self.numel() + step - 1) / step) - size;
std::vector<SymInt> wrap_sizes{wrap_size};
int64_t step = width + 1;
int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
wrap_sizes.push_back(wrap_size);
auto offset = self.stride(0) * (width + 1);
int64_t offset = self.stride(0) * (width + 1);
auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset);
wrap_diag.fill_(fill_value);
}

View File

@ -23,6 +23,7 @@
#include <ATen/ops/linspace.h>
#endif
#include <cmath>
#include <numeric>
#include <tuple>
#include <vector>
@ -202,6 +203,46 @@ select_outer_bin_edges(const Tensor& input, std::optional<c10::ArrayRef<double>>
return std::make_pair(leftmost_edges, rightmost_edges);
}
/* Bin edges correction based on the precision representation.
* To maintain the backward compatibility we take max(std::nextafter<>, +1)
* and min(std::nextafter<>, -1) for scalar types. For other types +/- 1 as usual.
*/
void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge)
{
#define UPDATE_WITH_LIMIT(real_type, scalartype) \
case ScalarType::scalartype: \
leftmost_edge = std::min( \
static_cast<double>( \
std::nexttoward( \
static_cast<real_type>(leftmost_edge), \
std::numeric_limits<real_type>::lowest() \
) \
), \
leftmost_edge - 1. \
); \
rightmost_edge = std::max( \
static_cast<double>( \
std::nexttoward( \
static_cast<real_type>(rightmost_edge), \
std::numeric_limits<real_type>::max() \
) \
), \
rightmost_edge + 1. \
); \
break;
switch (t) {
UPDATE_WITH_LIMIT(double, Double)
UPDATE_WITH_LIMIT(float, Float)
default:
// Fallback to the default behavior for other types
leftmost_edge -= 1;
rightmost_edge += 1;
}
#undef UPDATE_WITH_LIMIT
}
/* histc's version of the logic for outermost bin edges.
*/
std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
@ -216,8 +257,7 @@ std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
}
if (leftmost_edge == rightmost_edge) {
leftmost_edge -= 1;
rightmost_edge += 1;
bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge);
}
TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) ||

View File

@ -1,3 +1,5 @@
#include <ATen/core/ATen_fwd.h>
#include <c10/core/ScalarType.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
@ -1878,19 +1880,18 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
Tensor xtensor = self.expand(padded_size);
Tensor result;
Tensor urtensor;
if (self.is_quantized()) {
result = at::empty_quantized(target_size, self);
urtensor = at::empty_quantized(target_size, self);
} else {
result = at::empty(target_size, self.options());
urtensor = at::empty(target_size, self.options());
}
// return an empty tensor if one of the repeat dimensions is zero
if (zero_tensor) {
return result;
return urtensor;
}
Tensor urtensor = at::alias(result);
for (const auto i : c10::irange(xtensor.dim())) {
// can't unfold with step 0, so make sure step is at least 1
// (it doesn't matter what it is in that case, because the size is 0).
@ -1900,7 +1901,22 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
urtensor.copy_(xtensor.expand_as(urtensor));
return result;
// Combine the dimensions to produce the target_size.
// xtensor dims: [a0, ..., ad-1]
// urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
// b dims are produced by unfold.
// Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
const int64_t n_dims = xtensor.dim();
auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
auto range_b = range_a + n_dims;
auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
// Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
urtensor = urtensor.permute(permutation);
// Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
urtensor = urtensor.reshape(target_size);
return urtensor;
}
Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {

View File

@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
dtypes[i] = iter.dtype(i);
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}
});
#else
launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
});
#endif
}
}

View File

@ -42,6 +42,19 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) {
});
}
#ifdef USE_ROCM
void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) {
return static_cast<float>(value);
});
}
void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) {
return static_cast<float>(value);
});
}
#endif
void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
ScalarType dtype = iter.dtype(0);
ScalarType other_dtype = iter.dtype(1);
@ -187,7 +200,17 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
} else {
float16_copy_kernel_cuda(iter);
}
} else if (isBitsType(dtype)) {
}
#ifdef USE_ROCM
else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) {
if (iter.dtype(1) == kBFloat16) {
bfloat16tofloat32_copy_kernel_cuda(iter);
} else {
float16tofloat32_copy_kernel_cuda(iter);
}
}
#endif
else if (isBitsType(dtype)) {
TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting "
"bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype);
AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] {

View File

@ -14,7 +14,7 @@ struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_strides;
idx_type_t per_sample_weights_stride;
idx_type_t num_indices;
idx_type_t num_bags;

View File

@ -23,54 +23,72 @@ struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
template <EmbeddingBagMode M, typename T>
struct ReductionOp {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides);
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides) {
if (per_sample_weights_strides) {
T per_sample_weight = per_sample_weights
[per_sample_weights_strides * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) *
static_cast<opmath_t<T>>(weight_val) +
out_val;
} else {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return static_cast<opmath_t<T>>(weight_val) + out_val;
bool is_first) {
return weight_val + out_val;
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return max(static_cast<opmath_t<T>>(weight_val), out_val);
bool is_first) {
return (is_first || weight_val > out_val) ? weight_val : out_val;
}
};
template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
return weight_val;
}
};
template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (per_sample_weights_stride) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
} else {
return weight_val;
}
}
};
template <EmbeddingBagMode M, typename T, typename I>
struct MaybeCalcMaxIndex {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {}
};
template <typename T, typename I>
struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {
max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx;
}
};
@ -96,6 +114,30 @@ struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
}
};
template <EmbeddingBagMode M, typename I>
struct MaybeWriteMaxIndex {
inline void operator()(
device I*,
const constant ::c10::metal::array<uint32_t, 2>&,
uint32_t,
uint32_t,
I) {}
};
template <typename I>
struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
inline void operator()(
device I* max_indices,
const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
uint32_t bag_idx,
uint32_t feature_idx,
I max_idx) {
max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]] = max_idx;
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl(
constant T* weight,
@ -112,7 +154,7 @@ void embedding_bag_impl(
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto per_sample_weights_strides = params.per_sample_weights_strides;
auto per_sample_weights_stride = params.per_sample_weights_stride;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides;
@ -120,8 +162,6 @@ void embedding_bag_impl(
auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size;
output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
@ -131,28 +171,37 @@ void embedding_bag_impl(
auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0;
I max_idx = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) {
I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx);
T weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
auto weight_val = static_cast<opmath_t<T>>(
weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
MaybeCalcMaxIndex<M, T, I>()(
weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
out_val = pad ? out_val : new_out_val;
offset2bag[indices_idx] = bag_idx;
bag_size_ += static_cast<uint32_t>(!pad);
auto tmp_val = ReductionOp<M, T>()(
weight_val,
out_val,
indices_idx,
per_sample_weights,
per_sample_weights_strides);
out_val = pad ? out_val : tmp_val;
}
*output = ReductionOpFinal<M, T>()(out_val, bag_size_);
output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] =
ReductionOpFinal<M, T>()(out_val, bag_size_);
bag_size[bag_idx] = bag_size_;
MaybeWriteMaxIndex<M, I>()(
max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
}
#define DISPATCH_IMPL(MODE) \

View File

@ -52,9 +52,7 @@ static void fill_depthwise_conv_desc(MPSGraphDepthwiseConvolution3DOpDescriptor*
NSUInteger dilationRateInX,
NSUInteger dilationRateInY,
NSUInteger paddingHorizontal,
NSUInteger paddingVertical,
c10::MemoryFormat memory_format,
NSUInteger groups) {
NSUInteger paddingVertical) {
descriptor_.strides =
@[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ];
descriptor_.dilationRates =
@ -103,7 +101,7 @@ static void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
descriptor_.groups = groups;
}
static Tensor _mps_convolution_impl(const Tensor& input_t_,
static Tensor _mps_convolution_impl(const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_opt,
IntArrayRef padding,
@ -111,12 +109,15 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
IntArrayRef dilation,
int64_t groups,
std::optional<IntArrayRef> input_shape) {
const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
Tensor input_t = input_t_;
bool is3DConv = input_t.dim() == 5;
if (!is_macOS_15_0_or_newer || is3DConv) {
input_t = input_t.contiguous();
}
constexpr auto kChannelsLast = MemoryFormat::ChannelsLast;
constexpr auto kContiguous = MemoryFormat::Contiguous;
const bool is_macos_15_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
const bool is3DConv = input_t.dim() == 5;
const auto memory_format = input_t.suggest_memory_format();
const auto input_suggested_layout = memory_format == kChannelsLast && is_macos_15_plus ? kChannelsLast : kContiguous;
const bool is_channels_last = mps_conv_use_channels_last(input_t, weight_t) && !is3DConv;
const bool bias_defined = bias_opt ? bias_opt->defined() : false;
TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types");
@ -126,15 +127,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
checkAllSameType(c, {input, weight});
checkAllSameGPU(c, {input, weight});
bool bias_defined;
if (bias_opt == std::nullopt)
bias_defined = false;
else
bias_defined = bias_opt->defined();
auto memory_format = input_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
auto output_t =
at::empty(input_shape.has_value() ? input_shape.value()
: conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation),
@ -142,12 +134,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
std::nullopt,
kMPS,
std::nullopt,
is_macOS_15_0_or_newer ? memory_format : MemoryFormat::Contiguous);
is_channels_last ? kChannelsLast : kContiguous);
if (output_t.numel() == 0) {
return output_t;
}
TensorArg output{output_t, "result", 0};
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> output_c;
if (!is_macos_15_plus && is_channels_last) {
output_c = at::empty_like(output_t, output_t.options().memory_format(kContiguous));
}
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) {
// On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16
for (auto elem : output_t.sizes()) {
@ -186,32 +184,22 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
input_suggested_layout == kChannelsLast,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
MPSNDArray* inputNDArray = nil;
MPSNDArray* outputNDArray = nil;
if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
inputNDArray = getMPSNDArray(input_t, inputShape);
outputNDArray = getMPSNDArray(output_t, outputShape);
}
auto inputShape = mps::getMPSShape(input_t, input_suggested_layout);
auto outputShape = mps::getMPSShape(output_t, input_suggested_layout);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* weightShape = mps::getMPSShape(weight_t);
bool isDepthwiseConv = ((groups > 1 && (weightShape[1].intValue == 1)) && inputShape.count >= 4 &&
weightShape.count >= 4 && !is_channels_last);
bool isDepthwiseConv =
(groups > 1 && weight_t.size(1) == 1) && input_t.dim() >= 4 && weight_t.dim() >= 4 && !is_channels_last;
MPSGraphTensor* inputTensor =
mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t.scalar_type()), inputShape);
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
MPSGraphTensor* outputTensor;
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t), inputShape);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
MPSGraphTensor* outputTensor = nil;
if (is3DConv) {
MPSGraphConvolution3DOpDescriptor* conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease];
auto conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease];
fill_conv3d_desc(conv3dDescriptor_,
stride[2],
stride[1],
@ -229,17 +217,9 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
descriptor:conv3dDescriptor_
name:nil];
} else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
memory_format,
groups);
auto depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc(
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]);
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
dimension:-3
@ -258,7 +238,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
dilation[0],
padding[1],
padding[0],
memory_format,
input_suggested_layout,
groups);
outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor
@ -270,13 +250,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
MPSGraphTensor* biasTensor = nil;
if (bias_defined) {
biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value()));
}
if (is_channels_last && !is_macOS_15_0_or_newer) {
outputTensor = mps::convertNHWCtoNCHW(mpsGraph, outputTensor);
}
if (bias_defined) {
outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil];
}
newCachedGraph->inputTensor_ = inputTensor;
@ -285,27 +258,26 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
newCachedGraph->outputTensor_ = outputTensor;
});
auto inputPlaceholder = inputNDArray ? Placeholder(cachedGraph->inputTensor_, inputNDArray)
: Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto inputPlaceholder = input_suggested_layout == kContiguous
? Placeholder(cachedGraph->inputTensor_, output_c || is3DConv ? input_t.contiguous() : input_t)
: Placeholder(cachedGraph->inputTensor_, getMPSNDArray(input_t, inputShape));
auto outputPlaceholder = input_suggested_layout == kContiguous
? Placeholder(cachedGraph->outputTensor_, output_c ? *output_c : output_t)
: Placeholder(cachedGraph->outputTensor_, getMPSNDArray(output_t, outputShape));
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, output_c ? weight_t.contiguous() : weight_t);
auto biasPlaceholder = Placeholder();
// Reshape the bias to be broadcastable with output of conv2d or conv3d
if (bias_defined) {
if (is3DConv) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1, 1}));
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1, 1}));
} else if (input_suggested_layout == kChannelsLast) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, 1, 1, bias_shape[0]}));
} else {
if (is_channels_last && is_macOS_15_0_or_newer) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, 1, 1, bias_shape[0]}));
} else {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1}));
}
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1}));
}
}
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, output_t);
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
auto feeds = [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData();
if (bias_defined) {
@ -315,6 +287,10 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
if (output_c) {
output_t.copy_(*output_c);
}
return output_t;
}
@ -351,14 +327,21 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2};
checkAllSameType(c, {grad_output, weight});
checkAllSameGPU(c, {grad_output, weight});
auto memory_format = grad_output_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
auto grad_input_t = at::empty(input_size, grad_output_t.options(), std::nullopt);
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast;
bool is_channels_last = mps_conv_use_channels_last(grad_output_t, weight_t) && !is3DConv;
auto grad_input_t =
at::empty(input_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt);
// Avoid "grad_input" when this is being used as transposed convolution
TensorArg grad_input{grad_input_t, "result", 0};
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> grad_input_c;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
grad_input_c = at::empty_like(grad_input_t, grad_input_t.options().memory_format(MemoryFormat::Contiguous));
}
// Derive from MPSCachedGraph
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@ -370,7 +353,6 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
// Add backward with input
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
@ -411,15 +393,8 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
} else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
at::MemoryFormat::Contiguous,
groups);
fill_depthwise_conv_desc(
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]);
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
dimension:-3
withDimension:-4
@ -454,14 +429,18 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
newCachedGraph->gradInputTensor_ = gradInputTensor;
});
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
auto gradOutputPlaceholder =
Placeholder(cachedGraph->gradOutputTensor_, grad_input_c ? grad_output_t.contiguous() : grad_output_t);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, grad_input_c ? weight_t.contiguous() : weight_t);
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_c ? *grad_input_c : grad_input_t);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
return *grad_input;
if (grad_input_c) {
grad_input_t.copy_(*grad_input_c);
}
return grad_input_t;
}
static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
@ -474,9 +453,11 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
bool bias_defined) {
using namespace at::native::mps;
using namespace mps;
bool is3DConv = input_t.dim() == 5;
const bool is3DConv = input_t.dim() == 5;
TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
CheckedFrom c = "mps_convolution_backward_weights";
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast;
bool is_channels_last = mps_conv_use_channels_last(input_t, grad_output_t) && !is3DConv;
// For uniformity with everything else, although it seems grad_weight
// would be unambiguous too.
@ -487,7 +468,8 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
checkAllSameGPU(c, {grad_output, input});
auto grad_weight_t =
at::empty(weight_size, grad_output_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
at::empty(weight_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt);
TensorArg grad_weight{grad_weight_t, "result", 0};
convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
@ -500,16 +482,23 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSGraphTensor* gradWeightTensor_ = nil;
};
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> grad_weight_c;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
grad_weight_c = at::empty_like(grad_weight_t, grad_weight_t.options().memory_format(MemoryFormat::Contiguous));
}
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
@ -541,15 +530,8 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
} else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
at::MemoryFormat::Contiguous,
groups);
fill_depthwise_conv_desc(
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]);
NSNumber* outputFeatChannelDim = mps_weight_shape[0];
MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ];
MPSGraphTensor* gradWeightTensorTranspose =
@ -583,14 +565,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
newCachedGraph->gradWeightTensor_ = gradWeightTensor;
});
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);
auto gradOutputPlaceholder =
Placeholder(cachedGraph->gradOutputTensor_, grad_weight_c ? grad_output_t.contiguous() : grad_output_t);
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, grad_weight_c ? input_t.contiguous() : input_t);
auto outputPlaceholder =
Placeholder(cachedGraph->gradWeightTensor_, grad_weight_c ? *grad_weight_c : grad_weight_t);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
if (grad_weight_c) {
grad_weight_t.copy_(*grad_weight_c);
}
return grad_weight_t;
}

View File

@ -66,11 +66,12 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0);
if (include_last_offset) {
TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
num_bags -= 1;
}
int64_t feature_size = weight.size(1);
auto bag_size = at::empty(offsets.sizes(), indices.options());
auto bag_size = at::empty({num_bags}, indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options());
@ -94,7 +95,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
params.num_bags = num_bags;

View File

@ -3858,7 +3858,7 @@
device_check: NoCheck # TensorIterator
structured: True
dispatch:
CPU, CUDA: aminmax_out
CPU, CUDA, MTIA: aminmax_out
MPS: aminmax_out_mps
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3909,7 @@
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: amax_out
CPU, CUDA, MTIA: amax_out
MPS: amax_out_mps
# Return: (Tensor output, Tensor indices)
@ -4090,7 +4090,7 @@
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: amin_out
CPU, CUDA, MTIA: amin_out
MPS: amin_out_mps
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in

View File

@ -158,12 +158,46 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
return packed_ptr;
}
#ifdef USE_FBGEMM
namespace {
/// Number of columns in the rowwise min/max buffer passed to the quantization function(s)
constexpr int kRowwiseMinMaxNumCols = 2;
bool _validate_rowwise_min_max(
const at::Tensor& weight,
const std::optional<at::Tensor>& rowwise_min_max_opt) {
const auto is_valid_rowwise_min_max = rowwise_min_max_opt.has_value();
if (is_valid_rowwise_min_max) {
TORCH_CHECK(
(rowwise_min_max_opt->dim() == 2 &&
rowwise_min_max_opt->size(0) == weight.size(0) &&
rowwise_min_max_opt->size(1) == kRowwiseMinMaxNumCols),
"'rowwise_min_max' must be a 2D tensor with shape [num_rows(weight), 2].");
}
return is_valid_rowwise_min_max;
}
auto _get_rowwise_min_max_contig(
const std::optional<at::Tensor>& rowwise_min_max_opt) {
return rowwise_min_max_opt.has_value()
? rowwise_min_max_opt->expect_contiguous(rowwise_min_max_opt->suggest_memory_format())
: at::borrow_from_optional_tensor(rowwise_min_max_opt);
}
}
#endif // USE_FBGEMM
namespace at::native {
// Note - This is a temporary pack function for embedding bag which quantizes
// and packs the float weight tensor. In the next step it will be replaced by a
// quantize and pack function once we support FP scale and FP zero_point
//
// The optional rowwise_min_max argument is to support callers to pass in the min/max
// values of the weight tensor. If the rowwise_min_max is not provided, the min/max
// values will be computed from the weight tensor.
//
// Python example examining a packed 8bit zero_point and scale:
//
// >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]],
@ -221,7 +255,10 @@ namespace at::native {
//
// [[50. , 60.00000035],
// [70. , 80.00000035]]])
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
Tensor& qembeddingbag_byte_prepack_out(
Tensor& output,
const Tensor& weight,
const std::optional<Tensor>& rowwise_min_max_opt) {
// The "last" dimension of an N-Dimensioned batch of embedding bags is
// quantization channel. E.g. for a 2D embedding bag, this has
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@ -256,9 +293,16 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM
// Move these outside of the ifdef when we support non-FBGEMM flow.
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
if (weight_contig->scalar_type() == at::ScalarType::Half) {
const auto weight_data =
static_cast<fbgemm::float16*>(weight_contig->data_ptr());
const auto rowwise_min_max_data = is_valid_rowwise_min_max
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
: nullptr;
at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<
@ -266,17 +310,21 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
weight_data + start_idx * embedding_cols,
end_idx - start_idx,
embedding_cols,
output_data + start_idx * output_columns);
output_data + start_idx * output_columns,
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
});
} else {
const auto weight_data = weight_contig->data_ptr<float>();
const auto rowwise_min_max_data =
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
weight_data + start_idx * embedding_cols,
end_idx - start_idx,
embedding_cols,
output_data + start_idx * output_columns);
output_data + start_idx * output_columns,
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
});
}
@ -326,6 +374,22 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
return output;
}
static Tensor qembeddingbag_byte_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max) {
const auto weight_contig =
weight.expect_contiguous(weight.suggest_memory_format());
Tensor output = at::detail::empty_cpu(
{0},
at::kByte,
weight_contig->layout(),
weight_contig->device(),
std::nullopt,
std::nullopt);
qembeddingbag_byte_prepack_out(output, weight, rowwise_min_max);
return output;
}
Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
const auto weight_contig =
weight.expect_contiguous(weight.suggest_memory_format());
@ -335,7 +399,7 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
"'embedding_bag_byte_prepack' only support float32 or float16.");
const auto weight_sizes = weight.sym_sizes();
const auto cols_dim = weight.ndimension() - 1;
const auto embedding_cols = weight_sizes[cols_dim];
const auto& embedding_cols = weight_sizes[cols_dim];
// Add 8 bytes per column to store FP32 scale and zero_point per row.
const auto output_columns = embedding_cols + 2 * sizeof(float);
@ -359,7 +423,8 @@ Tensor _qembeddingbag_nbit_prepack_helper(
int bit_width,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
const double ratio,
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt) {
TORCH_CHECK(
weight.scalar_type() == at::ScalarType::Float ||
weight.scalar_type() == at::ScalarType::Half,
@ -401,10 +466,17 @@ Tensor _qembeddingbag_nbit_prepack_helper(
auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM
// Move these outside of the ifdef when we support non-FBGEMM flow.
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
if (!optimized_qparams) {
if (weight_contig.scalar_type() == at::ScalarType::Half) {
const auto weight_data =
static_cast<fbgemm::float16*>(weight_contig.data_ptr());
const auto rowwise_min_max_data = is_valid_rowwise_min_max
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
: nullptr;
at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<
@ -413,10 +485,13 @@ Tensor _qembeddingbag_nbit_prepack_helper(
weight_data + start_idx * embedding_cols,
end_idx - start_idx,
static_cast<int>(embedding_cols),
output_data + start_idx * output_shape[1]);
output_data + start_idx * output_shape[1],
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
});
} else {
const auto weight_data = weight_contig.data_ptr<float>();
const auto rowwise_min_max_data =
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
@ -424,7 +499,8 @@ Tensor _qembeddingbag_nbit_prepack_helper(
weight_data + start_idx * embedding_cols,
end_idx - start_idx,
static_cast<int>(embedding_cols),
output_data + start_idx * output_shape[1]);
output_data + start_idx * output_shape[1],
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
});
}
} else {
@ -514,6 +590,16 @@ Tensor qembeddingbag_4bit_prepack(
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio);
}
Tensor qembeddingbag_4bit_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
return _qembeddingbag_nbit_prepack_helper(
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
}
// Applies 2-bit row-wise quantization by determining the range
// (maximum - minimum) and bias (minimum value) of each row in the input
// matrix, and then scaling each element to an 2-bit number between 0 and
@ -531,6 +617,16 @@ Tensor qembeddingbag_2bit_prepack(
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio);
}
Tensor qembeddingbag_2bit_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
return _qembeddingbag_nbit_prepack_helper(
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
}
class QEmbeddingPackWeights final {
public:
static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) {
@ -542,12 +638,21 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"),
TORCH_FN(qembeddingbag_byte_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_byte_prepack_with_rowwise_min_max));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"),
TORCH_FN(qembeddingbag_4bit_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_4bit_prepack_with_rowwise_min_max));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"),
TORCH_FN(qembeddingbag_2bit_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_2bit_prepack_with_rowwise_min_max));
}
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {

View File

@ -3,7 +3,10 @@
namespace at::native {
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
Tensor& qembeddingbag_byte_prepack_out(
Tensor& output,
const Tensor& weight,
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt);
Tensor qembeddingbag_byte_prepack(const Tensor& weight);

View File

@ -121,9 +121,12 @@ TORCH_LIBRARY(quantized, m) {
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});

View File

@ -120,7 +120,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
// buffer (in bytes)
size_t orig_m = sparse_input.size(0);
size_t div = orig_m * sparse_input.itemsize();
size_t new_n = (compressed_size + div - 1) / div; // floor
size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
@ -155,7 +155,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
handle_initialized = true;
}
// cupsarselt constructs
// cuSPARSELt constructs
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulPlan_t plan;
cusparseLtMatmulAlgSelection_t alg_sel;

View File

@ -176,6 +176,28 @@ bool check_head_dim_size_flash(sdp_params const& params, bool debug) {
}
return false;
}
if constexpr(caller_is_meff) {
bool is_half = (params.query.dtype() == at::kHalf) ||
(params.query.dtype() == at::kBFloat16);
const int64_t alignment = is_half ? 8 : 4;
if (!(query_size_last % alignment == 0 && query_size_last > 0 &&
value_size_last % alignment == 0 && value_size_last > 0)) {
if (debug) {
TORCH_WARN(
"Mem efficient attention requires last dimension of inputs to be divisible by ",
alignment,
". ",
"Got Query.size(-1): ",
query_size_last,
", Key.size(-1): ",
params.key.sym_size(-1),
", Value.size(-1): ",
params.value.sym_size(-1),
" instead.");
}
return false;
}
}
return true;
}

View File

@ -462,10 +462,11 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
using sdp::aotriton_adapter::mk_aotensor;
using sdp::aotriton_adapter::mk_aoscalartensor;
using sdp::aotriton_adapter::mk_philoxtensor;
using sdp::aotriton_adapter::mk_atomictensor;
using sdp::aotriton_adapter::cast_dtype;
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, q.options());
atomic_counter = at::zeros({1}, q.options().dtype(at::kInt));
}
aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
@ -474,7 +475,7 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
auto nullscalar = mk_philoxtensor(nullptr);
auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : nullscalar;
auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : nullscalar;
auto persistent_counter = is_causal ? mk_philoxtensor(atomic_counter.data_ptr<int64_t>()) : nullscalar;
auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr<int32_t>() : nullptr);
if (uses_swa || AOTRITON_ALWAYS_V3_API) {
#if AOTRITON_V3_API
using aotriton::v3::flash::CausalType;

View File

@ -2,22 +2,12 @@
// ${generated_comment}
#include <ATen/FunctionalStorageImpl.h>
#include <ATen/Tensor.h>
namespace at {
namespace functionalization {
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying) scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to handle.
ViewOrScatterInverse,
};
struct FunctionalInverses {
${view_inverse_declarations}

View File

@ -4,7 +4,7 @@
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/ViewMetaClasses.h>
#include <ATen/MemoryOverlap.h>
#include <torch/library.h>

View File

@ -0,0 +1,19 @@
// ${generated_comment}
#include <ATen/FunctionalInverses.h>
#include <ATen/ViewMetaClasses.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#include <ATen/NativeFunctions.h>
#else
${op_headers}
#endif
namespace at {
namespace functionalization {
${view_meta_implementations}
} // namespace functionalization
} // namespace at

View File

@ -0,0 +1,12 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include <ATen/FunctionalStorageImpl.h>
namespace at {
namespace functionalization {
${view_meta_declarations}
} // namespace functionalization
} // namespace at

View File

@ -0,0 +1,11 @@
#include <ATen/ViewMetaClasses.h>
#include <torch/csrc/functionalization/Module.h>
namespace torch::functionalization {
void initGenerated(PyObject* module) {
auto functionalization = py::handle(module).cast<py::module>();
$view_meta_bindings
}
} // namespace torch::functionalization

View File

@ -1561,6 +1561,38 @@ namespace {
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
#if defined(CPU_CAPABILITY_AVX512)
TYPED_TEST(Quantization8BitTests, TransposePackVNNI4) {
using VT = ValueType<TypeParam>;
constexpr auto K = 197;
constexpr auto N = 64;
constexpr auto L = K * N;
constexpr auto ld_src = N;
constexpr auto ld_dst = K * 4;
CACHE_ALIGN VT x[L];
CACHE_ALIGN VT y[L];
CACHE_ALIGN VT ref[L];
auto seed = TestSeed();
ValueGen<VT> generator(VT(-100), VT(100), seed);
for (const auto i : c10::irange(L)) {
x[i] = generator.get();
}
at::vec::transpose_pack_vnni4(x, y, ld_src, K, N);
int64_t _N = N / 4;
for (int64_t k = 0; k < K; k++) {
for(int64_t n = 0; n < _N; n++) {
for(int64_t l = 0; l < 4; l++) {
ref[n * ld_dst + k * 4 + l] =
c10::load(&(x[k * ld_src + n * 4 + l]));
}
}
}
for (const auto i : c10::irange(L)) {
ASSERT_EQ(y[i], ref[i])
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
TYPED_TEST(FunctionalTests, Map) {
using vec = TypeParam;

View File

@ -78,6 +78,8 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
"mistralai/Mistral-7B-Instruct-v0.3",
"openai/gpt-oss-20b",
}
)

View File

@ -61,6 +61,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
"mistralai/Mistral-7B-Instruct-v0.3",
"openai/gpt-oss-20b",
}
)

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -187,3 +187,11 @@ openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_to_run,0
mistralai/Mistral-7B-Instruct-v0.3,fail_to_run,0
openai/gpt-oss-20b,fail_to_run,0

1 name accuracy graph_breaks
187
188
189
190
191
192
193
194
195
196
197

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0
mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
openai/gpt-oss-20b,pass_due_to_skip,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0
mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
openai/gpt-oss-20b,pass_due_to_skip,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0
mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
openai/gpt-oss-20b,pass_due_to_skip,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -318,7 +318,7 @@ timm_vovnet,pass,0
torch_multimodal_clip,pass,3
torch_multimodal_clip,pass,0

1 name accuracy graph_breaks
318
319
320
321
322
323
324

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0
mistralai/Mistral-7B-Instruct-v0.3,pass,0
openai/gpt-oss-20b,pass,0

1 name accuracy graph_breaks
191
192
193
194
195
196
197
198
199
200
201

View File

@ -11,6 +11,8 @@ skip:
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it
- openai/gpt-oss-20b
- mistralai/Mistral-7B-Instruct-v0.3
device:
cpu:
@ -19,6 +21,8 @@ skip:
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
- mistralai/Mistral-7B-Instruct-v0.3
- openai/gpt-oss-20b
control_flow:
- AllenaiLongformerBase
@ -79,6 +83,8 @@ batch_size:
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8
mistralai/Mistral-7B-Instruct-v0.3: 8
openai/gpt-oss-20b: 8
tolerance:

View File

@ -99,4 +99,6 @@ HF_LLM_MODELS: dict[str, Benchmark] = {
"google/gemma-3-4b-it": TextGenerationBenchmark,
"openai/whisper-tiny": WhisperBenchmark,
"Qwen/Qwen3-0.6B": TextGenerationBenchmark,
"mistralai/Mistral-7B-Instruct-v0.3": TextGenerationBenchmark,
"openai/gpt-oss-20b": TextGenerationBenchmark,
}

View File

@ -51,3 +51,5 @@ google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8
mistralai/Mistral-7B-Instruct-v0.3, 8
openai/gpt-oss-20b, 8

View File

@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
# for targets in subfolders
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
# a dictionary maps third party library name to fbsource and oss target
THIRD_PARTY_LIBS = {
@ -391,6 +391,8 @@ def get_aten_generated_files(enabled_backends):
"CompositeExplicitAutogradFunctions_inl.h",
"CompositeExplicitAutogradNonFunctionalFunctions.h",
"CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
"ViewMetaClasses.h",
"ViewMetaClasses.cpp",
"VmapGeneratedPlumbing.h",
"core/ATenOpList.cpp",
"core/TensorBody.h",
@ -948,7 +950,6 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),
@ -1193,6 +1194,7 @@ def define_buck_targets(
"NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
"Operators.h": ":gen_aten[Operators.h]",
"RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
"ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
"core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
"core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
"core/enum_tag.h": ":gen_aten[core/enum_tag.h]",
@ -2048,7 +2050,6 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),

View File

@ -118,6 +118,9 @@ def define_targets(rules):
":LazyNonNativeIr.h",
":RegisterDispatchDefinitions.ini",
":RegisterDispatchKey.cpp",
":ViewMetaClassesPythonBinding.cpp",
":ViewMetaClasses.cpp",
":ViewMetaClasses.h",
":native_functions.yaml",
":shape_inference.h",
":tags.yaml",
@ -170,6 +173,7 @@ GENERATED_H = [
"FunctionalInverses.h",
"RedispatchFunctions.h",
"RegistrationDeclarations.h",
"ViewMetaClasses.h",
"VmapGeneratedPlumbing.h",
]
@ -246,6 +250,7 @@ GENERATED_CPP = [
"RegisterFunctionalization_1.cpp",
"RegisterFunctionalization_2.cpp",
"RegisterFunctionalization_3.cpp",
"ViewMetaClasses.cpp",
]
GENERATED_CPP_CORE = [
@ -307,6 +312,7 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
"torch/csrc/autograd/generated/python_variable_methods.cpp",
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
]
GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP

View File

@ -1010,6 +1010,7 @@ libtorch_python_core_sources = [
"torch/csrc/utils/disable_torch_function.cpp",
"torch/csrc/utils/verbose.cpp",
"torch/csrc/cpu/Module.cpp",
"torch/csrc/functionalization/Module.cpp",
"torch/csrc/instruction_counter/Module.cpp",
"torch/nativert/python/Bindings.cpp",
] + lazy_tensor_core_python_sources
@ -1052,6 +1053,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
"torch/csrc/autograd/generated/python_variable_methods.cpp",
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp",
]]
_libtorch_python_sources.extend(libtorch_python_core_sources)

View File

@ -3244,7 +3244,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
are_equal<sizeof(autograd_meta_), 4, FieldNameEnum::autograd_meta_>();
are_equal<sizeof(extra_meta_), 4, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 4, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 4, FieldNameEnum::pyobj_slot_>();
is_le<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();

Some files were not shown because too many files have changed in this diff Show More