Compare commits

..

132 Commits

Author SHA1 Message Date
c0d981fb92 try 2025-08-14 07:24:10 -07:00
f5f3fd155f try 2025-08-14 07:23:24 -07:00
161048c20a add test 2025-08-14 00:40:58 -07:00
d8bc95af0b add test 2025-08-14 00:27:33 -07:00
4e4e1f81b3 add test 2025-08-13 22:31:19 -07:00
24580f98c2 add test 2025-08-13 22:30:02 -07:00
de4c6b3f3f add test
ghstack-source-id: b11f5e19e1b622610668017d89feac01750675cf
Pull-Request: https://github.com/pytorch/pytorch/pull/160583
2025-08-13 22:29:59 -07:00
ef6573629e setup test ci
ghstack-source-id: 913ccdea2bf59332313cf34c2a85b7d01d886ff6
Pull-Request: https://github.com/pytorch/pytorch/pull/160361
2025-08-13 22:29:38 -07:00
36e115eca5 vllm build workflow
ghstack-source-id: 8acb25f33d5232b80db025a8b840cf3fa5f30266
Pull-Request: https://github.com/pytorch/pytorch/pull/160116
2025-08-13 22:29:38 -07:00
dad3e16379 vllmbuild
ghstack-source-id: 61c87bcac7a2b2f47c2a03b8a1da8999dd648acd
Pull-Request: https://github.com/pytorch/pytorch/pull/160089
2025-08-13 22:29:38 -07:00
95490a1ad7 [ghstack] setup torch_cli build
ghstack-source-id: 33be5fbc9eb5f6e6519afc6153f727020d77461d
Pull-Request: https://github.com/pytorch/pytorch/pull/160043
2025-08-13 22:29:38 -07:00
19f1f9960d [mps] Turn on inductor dynamic shapes tests (#159456)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159456
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-08-04 22:44:31 +00:00
fd6655a0f5 Feature: Implement support for cudnn_batch_norm_out kernel to replace the autogen approach. (#123020)
Fixes #115611

Autogen kernel may cause redundant copy, so we develop the kernel to improve efficiency.

Test Case:

```c++
#include <torch/torch.h>
#include <iostream>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

int main() {
    auto input = torch::rand({2, 3, 4, 4}, torch::device(torch::kCUDA));
    auto weight = torch::randn({3}, torch::device(torch::kCUDA));
    auto bias = torch::randn({3}, torch::device(torch::kCUDA));
    auto running_mean = torch::zeros({3}, torch::device(torch::kCUDA));
    auto running_var = torch::ones({3}, torch::device(torch::kCUDA));

    bool training = true;
    double exponential_average_factor = 0.1;
    double epsilon = 1e-5;

    auto output = torch::empty_like(input);
    auto save_mean = torch::empty({3}, torch::device(torch::kCUDA));
    auto save_var = torch::empty({3}, torch::device(torch::kCUDA));
    auto reserve = torch::empty({0}, torch::device(torch::kCUDA)); // empty place-holder

    at::native::cudnn_batch_norm_out(input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon, output, save_mean, save_var, reserve);
    auto outputs = at::native::cudnn_batch_norm(input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon);

    bool is_close_output = torch::allclose(output, std::get<0>(outputs));
    bool is_close_save_mean = torch::allclose(save_mean, std::get<1>(outputs));
    bool is_close_save_var = torch::allclose(save_var, std::get<2>(outputs));
    bool is_close_reserve = torch::allclose(reserve, std::get<3>(outputs));

    std::cout << "Is output close: " << is_close_output << std::endl;
    std::cout << "Is save_mean close: " << is_close_save_mean << std::endl;
    std::cout << "Is save_var close: " << is_close_save_var << std::endl;
    std::cout << "Is reserve close: " << is_close_reserve << std::endl;

    return 0;
}
```

Please CC @albanD

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123020
Approved by: https://github.com/andrewor14, https://github.com/eqy, https://github.com/albanD
2025-08-04 22:40:33 +00:00
a7f3bdf550 [Dynamo][Better Engineering] Type coverage for torch/_dynamo/utils.py (#159580)
As part of the better engineering effort, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to `torch/_dynamo/utils.py`

Running
```
mypy torch/_dynamo/utils.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Annotated | Lines Total | % lines covered | Funcs Annotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  2163 | 4792 | 45.14% | 121 | 268 | 45.15% |
| This PR | 4818 | 4818 | 100.00% | 268 | 268 | 100.00% |
| Delta    | +2655 | +26 | +54.84% | +147 | 0 | +54.85% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159580
Approved by: https://github.com/williamwen42
2025-08-04 21:51:53 +00:00
510e8b4ae0 [inductor] use writable temp file on windows (#159738)
Use `WritableTempFile` on Windows, reference to: https://github.com/pytorch/pytorch/pull/159342

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159738
Approved by: https://github.com/angelayi, https://github.com/Skylion007
2025-08-04 21:51:02 +00:00
83ba3f1101 Revert "[inductor] allocate non-blocking copy destinations in pinned memory (#155121) (#158758)"
This reverts commit 6085bf7565fec0d2ed26e8590001f09c05adbbe4.

Reverted https://github.com/pytorch/pytorch/pull/158758 on behalf of https://github.com/davidberard98 due to I need to revert #158462 (it causes device-side asserts), and this PR causes a merge conflict in the test file. Sorry about that! ([comment](https://github.com/pytorch/pytorch/pull/158758#issuecomment-3152490371))
2025-08-04 21:47:11 +00:00
1fad16aacb Revert "[inductor] move all cpu scalars using pinned memory for graph partition (#155360) (#158983)"
This reverts commit 444e2381d07a14cb501c00d11f9e63a3f1d2c86e.

Reverted https://github.com/pytorch/pytorch/pull/158983 on behalf of https://github.com/davidberard98 due to I need to revert #158462 (it causes device-side asserts), and this PR causes a merge conflict in the test file. Sorry about that! ([comment](https://github.com/pytorch/pytorch/pull/158758#issuecomment-3152490371))
2025-08-04 21:47:11 +00:00
444e2381d0 [inductor] move all cpu scalars using pinned memory for graph partition (#155360) (#158983)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158983
Approved by: https://github.com/eellison
ghstack dependencies: #158758
2025-08-04 21:42:05 +00:00
6085bf7565 [inductor] allocate non-blocking copy destinations in pinned memory (#155121) (#158758)
Fixes #155121

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158758
Approved by: https://github.com/EikanWang, https://github.com/eellison
2025-08-04 21:22:11 +00:00
8201dbf4bc check driver to be >=12.4 to use fabric handles (#159697)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159697
Approved by: https://github.com/malfet
2025-08-04 21:05:39 +00:00
26d045bb60 Linux py 3.14 wheel builds (#157559)
Related to https://github.com/pytorch/pytorch/issues/156856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157559
Approved by: https://github.com/malfet, https://github.com/albanD
2025-08-04 20:55:19 +00:00
356ac3103a Revert "Stop parsing command line arguments every time common_utils is imported. (#156703)"
This reverts commit 310f901a71e53688866b14bb2f2b4c8eef9979b3.

Reverted https://github.com/pytorch/pytorch/pull/156703 on behalf of https://github.com/izaitsevfb due to breaking tests internally with `assert common_utils.SEED is not None` ([comment](https://github.com/pytorch/pytorch/pull/156703#issuecomment-3152337518))
2025-08-04 20:37:39 +00:00
d4109a0f99 [MPS] Add max_unpool1d/2d/3d (#159789)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159789
Approved by: https://github.com/malfet
2025-08-04 20:00:59 +00:00
7ea789ccfb Revert #156868: Bring back symint check for sharding propagation cache (#159671)
Fixes #159601

Unfortunately #156868 introduced a couple regressions (see #159590 and #159601). This reverts the commit while I am working on a permanent fix. This means the `in_compiled_autograd_initial_trace` global flag will be removed and the `_are_we_tracing()` will instead be replaced with the symint preprocessing step during sharding prop post init.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159671
Approved by: https://github.com/xmfan
2025-08-04 19:58:48 +00:00
7e8197e34d Revert "Migrate ScalarType to headeronly (#159416)"
This reverts commit 1371a98b0e727f8a8916dd473b6dd0cff78c0449.

Reverted https://github.com/pytorch/pytorch/pull/159416 on behalf of https://github.com/izaitsevfb due to breaking internal builds, see D79452481 ([comment](https://github.com/pytorch/pytorch/pull/159416#issuecomment-3152138508))
2025-08-04 19:55:09 +00:00
50eac811a6 [typing] Constrain OrderedSet generic to be Hashable (#159684)
Ran across this typing bug while creating an OrderedSet from a type I didn't realize wasn't hashable, which failed at runtime. With this constraint, typing would've failed pre-runtime.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159684
Approved by: https://github.com/Skylion007
2025-08-04 18:08:01 +00:00
4e0f179d0b Update the signature and test of torch.hamming_window() (#152682)
Fixes #146590

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152682
Approved by: https://github.com/albanD
2025-08-04 17:50:42 +00:00
36e59d9b12 [c10d][nvshmem] fix missing override compilation error for nvshmem symmetric code (#159557)
Summary:
Fix error when compiling nvshmem code section `NVSHMEMSymmetricMemory.cu` with BUCK

```
fbcode/caffe2/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu:154:20: error: 'get_buffer' overrides a member function but is not marked 'override' [-Werror,-Winconsistent-missing-override]
  154 | virtual at::Tensor get_buffer(int
      |                    ^
fbcode/caffe2/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp:56:20: note: overridden virtual function is here
   56 | virtual at::Tensor get_buffer(int rank, c10::IntArrayRef sizes, c10::ScalarType dtype, int64_t storage_offset) = 0;
```

Test Plan:
Build test + CI

Rollback Plan:

Differential Revision: D78813586

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159557
Approved by: https://github.com/kwen2501
2025-08-04 17:46:30 +00:00
fc340d0ca3 [export] Allow comparing device w/o index with device w/ index (#159665)
In the case where we have expected device "cuda" and given device "cuda:0" I think we should succeed?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159665
Approved by: https://github.com/yushangdi
2025-08-04 17:00:07 +00:00
53e47af0f7 [dynamo][guards] Read the attr name from GetAttrGuardAccessor (#159754)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159754
Approved by: https://github.com/jansel
ghstack dependencies: #159752
2025-08-04 16:51:27 +00:00
66ad881fc7 [dynamo][guards][refactor] Simplify type extraction from GuardManager (#159752)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159752
Approved by: https://github.com/jansel
2025-08-04 16:51:27 +00:00
1d3eef27ac [ROCm CI] Migrate to MI325 Capacity (#159649)
Migrate mi300s to gfx942.

Related to https://github.com/pytorch/pytorch/pull/159059

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159649
Approved by: https://github.com/huydhn
2025-08-04 16:48:12 +00:00
dd95900cec [AOTI] normalize_path_separator file path for Windows. (#159726)
`normalize_path_separator` file path for Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159726
Approved by: https://github.com/angelayi, https://github.com/jansel
2025-08-04 15:57:19 +00:00
1cdd665526 fix test_verbose_logs_dynamic_shapes with MSVC (#159573)
The `typeid` operator produces different output with different compilers. There is a good example in [cppreference](https://www.en.cppreference.com/w/cpp/language/typeid.html).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159573
Approved by: https://github.com/angelayi, https://github.com/jansel
2025-08-04 15:56:53 +00:00
7cb2dcd2dd [c10d][nvshmem] modify is_nvshmem_available runtime check to work with static-linked library (#159558) (#159561)
Summary:

Currently this function relies on the logic that we load `libnvshmem_device.a` statically and load `libnvshmem_host.so` at runtime. When loading `libnvshmem.a` (the two combined together) statically, this will fail. Add a section that checks whether the symbol from the host API exists at runtime to determine if nvshmem is loaded statically.

Test Plan:
CI + sample run

Rollback Plan:

Differential Revision: D79177525

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159561
Approved by: https://github.com/kwen2501
2025-08-04 15:40:29 +00:00
e5a81aa7ba Fix conversion of values in libtorch agnostic tests (#155115)
Due to different byteorder,
when copying data, it has to be put into last bytes to ensure that int32_t converted to int64_t keeps same value. Same has to be done when it's converted back.

This change fixes test
TestLibtorchAgnosticCPU::test_my_ones_like_cpu
from
cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py on s390x.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155115
Approved by: https://github.com/huydhn
2025-08-04 13:40:22 +00:00
3e2aa4b0e3 Update pin to include Python 3.14 support (#159725)
Update Triton Pin to top of rel/3.4 branch : https://github.com/triton-lang/triton/tree/rel/3.4 . This is the same as release/3.4.x branch but also includes Python 3.14 support

This should unblock enablement of Python 3.14 support in this PR: https://github.com/pytorch/pytorch/pull/157559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159725
Approved by: https://github.com/davidberard98
2025-08-04 13:30:12 +00:00
6646461764 S390X: fix detection of magic number placeholder in inductor (#157784)
This change fixes multiple tests in
test/inductor/test_aot_inductor_arrayref.py
such as
test_cond_with_parameters_cpu_with_stack_allocation,
test_issue_140766_cpu_with_stack_allocation,
test_model_modified_weights_cpu_with_stack_allocation,
test_nested_tensor_from_jagged_cpu_with_stack_allocation.

Enable tests in test/inductor/test_aot_inductor_arrayref.py

This change is split off from https://github.com/pytorch/pytorch/pull/150116

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157784
Approved by: https://github.com/huydhn
2025-08-04 12:42:31 +00:00
f74da2a136 [xla hash update] update the pinned xla hash (#159758)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159758
Approved by: https://github.com/pytorchbot
2025-08-04 11:21:45 +00:00
eqy
d35b27dde5 [CUDA] Add some more missing @serialTest decorators (#159672)
Seems to fix #159663

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159672
Approved by: https://github.com/Skylion007
2025-08-04 07:44:35 +00:00
a9dc1566d4 [MTIA Aten Backend] Migrate arange.start_out (#159540)
Differential Revision: [D79317519](https://our.internmc.facebook.com/intern/diff/D79317519/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159540
Approved by: https://github.com/malfet, https://github.com/nautsimon
2025-08-04 07:38:05 +00:00
33a1996714 Fix perf downgrade by reverting template use in use_mkldnn_matmul (#159024)
This PR fixes the performance downgrade by reverting template use in `use_mkldnn_matmul` in #157520. Fixes https://github.com/pytorch/pytorch/issues/159031 and https://github.com/pytorch/pytorch/issues/159551.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159024
Approved by: https://github.com/mingfeima
2025-08-04 05:49:46 +00:00
ee62177c19 [dynamo] Be consistent with storing func source for UserMethodVariable (#159696)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159696
Approved by: https://github.com/jansel
ghstack dependencies: #159534
2025-08-04 05:12:44 +00:00
64cbaa876c [dynamo][guards] Make class members go through obj.__class__.__dict__ (#159534)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159534
Approved by: https://github.com/jansel
2025-08-04 05:12:44 +00:00
4516c59f5f [dynamo][source] Add special source for __code__ and __closure__ (#159722)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159722
Approved by: https://github.com/jansel
2025-08-04 05:02:05 +00:00
8bc843a9ec [vllm hash update] update the pinned vllm hash (#159610)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159610
Approved by: https://github.com/pytorchbot
2025-08-04 04:06:09 +00:00
e39a62c70d Fix warnings in triton_helpers.py (#159719)
```
  /home/jansel/pytorch/torch/_inductor/runtime/triton_helpers.py:152: UserWarning: Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '|' instead
    equal |= a_isnan and b_isnan
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159719
Approved by: https://github.com/Skylion007
2025-08-04 03:21:09 +00:00
978e3a9142 refresh expected results (#159727)
Just a regular update due to recent <10% changes; CI is stable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159727
Approved by: https://github.com/anijain2305
2025-08-03 22:47:50 +00:00
e2a5c42e7e [BE][MPS] Build metal kernels of MacOS-14+ (#159733)
Which makes `#if __METAL_VERSION__ >= 310` guards for `bfloat` use support unnecessary.
Rename `kernels_bfloat.metallib` into `kernels_basic` and remove custom build/selection logic.

Part of https://github.com/pytorch/pytorch/issues/159275
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159733
Approved by: https://github.com/dcci
ghstack dependencies: #159731, #159732
2025-08-03 20:53:58 +00:00
5116c49b52 [BE] Remove macos-13 guard from bench_mps_ops (#159732)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159732
Approved by: https://github.com/dcci
ghstack dependencies: #159731
2025-08-03 20:53:58 +00:00
fecdebe385 [CI][MPS] Fix compile benchmark correctness (#159731)
By passing `fullgraph=True` attribute and increasing cache size limit to 2**16

Otherwise, compiler might decide not to fall back to eager to avoid recompilations
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159731
Approved by: https://github.com/dcci
2025-08-03 20:53:50 +00:00
e136a9175b [BE] Fix dev warning in Dependencies.cmake (#159702)
Namely
```
CMake Warning (dev) in cmake/Dependencies.cmake:
  A logical block opening on the line

    /Users/nshulga/git/pytorch/pytorch/cmake/Dependencies.cmake:261 (if)

  closes on the line

    /Users/nshulga/git/pytorch/pytorch/cmake/Dependencies.cmake:263 (endif)

  with mis-matching arguments.
```

Introduced by https://github.com/pytorch/pytorch/pull/143846

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159702
Approved by: https://github.com/cyyever, https://github.com/Skylion007
2025-08-03 18:45:07 +00:00
9a680e14b7 [bucketing] Reduce CPU overhead for reduce_scatter_merge_fn_to_trace (#159723)
The previous implementation was creating `n_gpu * n_tensors` intermediate tensors, which was adding a lot of CPU overhead, especially given that inductor was generating a number of individual tensor copy kernels for `torch.cat`.

This PR changes the implementation so that only `n_tensors` are created, making the CPU overhead proportional to the number of tensors being bucketed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159723
Approved by: https://github.com/IvanKobzarev
2025-08-03 09:16:55 +00:00
805a102beb Revert "[dynamo][guards] Make class members go through obj.__class__.__dict__ (#159534)"
This reverts commit 1616777cd2a3170ff76afa3e7860b0969420c445.

Reverted https://github.com/pytorch/pytorch/pull/159534 on behalf of https://github.com/malfet due to Broke some inductor test and lint among other things, see 9c18901bfd/1 ([comment](https://github.com/pytorch/pytorch/pull/159534#issuecomment-3146983186))
2025-08-03 04:58:32 +00:00
6e8d705a22 Revert "[dynamo] Be consistent with storing func source for UserMethodVariable (#159696)"
This reverts commit be71000ff5292293d1976f313218e2df4d5046d3.

Reverted https://github.com/pytorch/pytorch/pull/159696 on behalf of https://github.com/malfet due to Broke some inductor test and lint among other things, see 9c18901bfd/1 ([comment](https://github.com/pytorch/pytorch/pull/159534#issuecomment-3146983186))
2025-08-03 04:58:32 +00:00
9c18901bfd [MTIA Aten Backend] Migrate all.out (#159539)
Differential Revision: [D79317033](https://our.internmc.facebook.com/intern/diff/D79317033/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159539
Approved by: https://github.com/malfet
ghstack dependencies: #159098
2025-08-03 02:08:35 +00:00
a29ed5e1ac Add torch compile force disable caches alias (#158072)
A bunch of people keep thinking the current alias only disables the inductor cache because it has the name inductor in it. Let's globalize the name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158072
Approved by: https://github.com/ezyang
2025-08-02 23:23:17 +00:00
d2792f51b2 [bucketing] Use max of input/output size for bucketing (#159717)
The output of a reduce_scatter is n_gpu times smaller than its input, while the output of an all_gather is n_gpu times larger than its input. This means that in the current heuristic for bucketing reduce_scatter, we would need to use a bucket size which is n_gpu times larger than the bucket for all_gather, making it gpu-dependent and less intuitive. This PR proposes to use instead the max between the input and output sizes, so that one can use the same bucket_size value for both passes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159717
Approved by: https://github.com/wconstab
2025-08-02 22:42:22 +00:00
be71000ff5 [dynamo] Be consistent with storing func source for UserMethodVariable (#159696)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159696
Approved by: https://github.com/jansel
ghstack dependencies: #159186, #159534
2025-08-02 21:40:38 +00:00
3f86076775 gc before warming up benchmarking (#159670)
#158649 turned off automatic GCs during cudagraph recording. This is causing a small uptick in some internal benchmark numbers because of memory the benchmark is leaving around before the benchmark starts - so GC before warming up the model.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159670
Approved by: https://github.com/oulgen
2025-08-02 19:37:24 +00:00
1616777cd2 [dynamo][guards] Make class members go through obj.__class__.__dict__ (#159534)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159534
Approved by: https://github.com/jansel
ghstack dependencies: #159186
2025-08-02 18:04:35 +00:00
38895c0ac2 Update RuntimeError message in is_nonzero(input) method from bool to Boolean (#159712)
RuntimeError message updated in is_nonzero(input) method from bool to Boolean.

**Case 1:**
t = torch.tensor([])
torch.is_nonzero(t)

**Case 2:**
t = torch.tensor([1,2])
torch.is_nonzero(t)

**Existing Error message in documentation:**

for case 1: RuntimeError: bool value of Tensor with no values is ambiguous
for case 2: RuntimeError: bool value of Tensor with more than one value is ambiguous

**Proposed Error message in documentation:**

for case 1: RuntimeError: Boolean value of Tensor with no values is ambiguous
for case 2: RuntimeError: Boolean value of Tensor with more than one value is ambiguous

Fixes #159710
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159712
Approved by: https://github.com/malfet
2025-08-02 17:23:45 +00:00
310f901a71 Stop parsing command line arguments every time common_utils is imported. (#156703)
Last PR in the series to re-submit https://github.com/pytorch/pytorch/pull/134592 as smaller PRs:

https://github.com/pytorch/pytorch/pull/154612
https://github.com/pytorch/pytorch/pull/154628
https://github.com/pytorch/pytorch/pull/154715
https://github.com/pytorch/pytorch/pull/154716
https://github.com/pytorch/pytorch/pull/154725
https://github.com/pytorch/pytorch/pull/154728

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156703
Approved by: https://github.com/clee2000
2025-08-02 16:38:54 +00:00
e11b1cd97e [ROCm] fix nightly wheel due to rocBLAS environment variable (#159570)
Fixes #159070

The TunableOp failure is due to missing rocBLAS files in our manywheels packaging. This bug has been present since June 7-8 time frame. It was caused by a typo in the rocBLAS environment variable that stores the list of files. It was introduced in this PR: https://github.com/pytorch/pytorch/pull/155388

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159570
Approved by: https://github.com/malfet
2025-08-02 06:54:43 +00:00
b599d91738 Log autotune choices and benchmark result to scuba/chrome trace (#159496)
Summary:
Report the kernel choices and benchmark data to better understand how kernels are selected and the performance gap between the best kernel (likely a CUDA kernel) and Triton kernels.

**Example**

Event: mm_template_autotuning
Column: autotune_choices

```json
{
  "num_choices": 52,
  "num_triton_choices": 19,
  "best_kernel": "cutlass_f6c25cf2",
  "best_kernel_desc": "cutlass3x_sm90_tensorop_gemm_f16_f16_f32_void_f16_128x256x64_2x1x1_0_tnn_align8_stream_k_warpspecialized_cooperative_epi_tma swizzle=8",
  "best_time": 0.6283040046691895,
  "best_triton_pos": 26,
  "best_triton_time": 0.6832960247993469,
  "best_triton_kernel": "triton_mm_17",
  "best_triton_kernel_desc": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0"
}
```

Test Plan:
```
TORCHINDUCTOR_MAX_AUTOTUNE_REPORT_CHOICES_STATS =1 buck2 run //scripts/wychi:test_autotune_mm 2>&1 > /tmp/mylog.txt
```

Rollback Plan:

Differential Revision: D79235037

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159496
Approved by: https://github.com/masnesral
2025-08-02 05:34:17 +00:00
fd6a6658c3 Enable _int_mm on Intel GPU (#157769)
# Motivation

This PR is used to enable _int_mm on Intel GPU. And _int_mm is used by int8 quantization on torchao.

# Model Test Result:
We run meta-llama/Llama-3.1-8B-Instruct on Intel GPU and A100 using torchao int8-dynamic-quantization. The model configs are as below:
Precision : torch.bfloat16
quantization configuration : Int8DynamicActivationInt8WeightConfig
dataset : wikitext

Result:
The perplexity values for Intel GPU and A100 are 9.582953453063965 and 9.57755184173584, respectively.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157769
Approved by: https://github.com/EikanWang, https://github.com/desertfire
2025-08-02 05:16:01 +00:00
04973496a8 [audio hash update] update the pinned audio hash (#159611)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159611
Approved by: https://github.com/pytorchbot
2025-08-02 05:15:47 +00:00
1548b011ea Fix rand_like decomposition to preserve strides (#159294)
Summary: Like https://github.com/pytorch/pytorch/pull/158898, the rand_like variants are not preserving strides. Followed the pattern established in https://github.com/pytorch/pytorch/pull/158898.

Test Plan: New unit test (fails before this PR; but fixed after)

Differential Revision: [D79472604](https://our.internmc.facebook.com/intern/diff/D79472604)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159294
Approved by: https://github.com/eellison
2025-08-02 03:54:41 +00:00
e57a92734d [export] Fix nn_module_stack of assert_tensor_metadata nodes (#159625)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159625
Approved by: https://github.com/yushangdi
2025-08-02 02:52:42 +00:00
79ff3b320b Back out "[ez] get rid of unused var" (#159677)
Summary: turns out i added this to reduce the frequency we'd call try_update_max_size_at_index when a new maximum is found before the replan is called. oops.

Test Plan:
backout

Rollback Plan:

Differential Revision: D79474114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159677
Approved by: https://github.com/georgiaphillips
2025-08-02 01:50:16 +00:00
426f249f20 Fix launch grid calculation (#159497)
Summary:

The launch grid calculation code is using a python trick to achieve CeilDiv() through negative integer division with FloorDiv(). This is language dependent behaviour that doesn't apply to all languages.

In the FXIR backend we negate this behaviour and replace the expression with a CeilDiv() operation so the computation is correct regardless of the language used. We are not directly changing the original computation, as that leads to a performance degradation.

Test Plan:
CI

Rollback Plan:

Differential Revision: D79275534

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159497
Approved by: https://github.com/blaine-rister
2025-08-02 01:12:58 +00:00
d33a484763 Use boxed_nop_preserve_node_meta for aot_export_joint_with_descriptors (#159545)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159545
Approved by: https://github.com/xmfan, https://github.com/wconstab
ghstack dependencies: #159336, #159337
2025-08-02 00:33:41 +00:00
a81ffbc5f5 improve shape checks for grouped_mm (#159666)
Check that contraction dimension matches between tensors if it's known, and do device-side checks for correct offsets
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159666
Approved by: https://github.com/danielvegamyhre, https://github.com/eqy
2025-08-02 00:12:25 +00:00
465fe4d9f7 Enable sample nightly PT2 benchmark on B200 (#158011)
Per the discussion with @nWEIdia, this resumes the work on https://github.com/pytorch/pytorch/pull/157870 to enable PT2 benchmark on B200

### Testing

https://github.com/pytorch/pytorch/actions/runs/16615101382

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158011
Approved by: https://github.com/nWEIdia, https://github.com/atalman
2025-08-01 23:47:44 +00:00
9477af1063 fix compilation on cuda < 12.3 (#159657)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159657
Approved by: https://github.com/kwen2501
2025-08-01 23:40:55 +00:00
dcc36e38bb [Graph Breaks] Remove unsupported Additional Info field (#159658)
Race condition when landing PR#158800 caused us to add this field when it is deprecated, so remove it

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159658
Approved by: https://github.com/williamwen42
2025-08-01 23:25:50 +00:00
efd78584a8 [EZ] Add linux-aarch64.yml workflow to the viable/strict blocking set (#159668)
Since it's required to be run on every PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159668
Approved by: https://github.com/malfet
2025-08-01 23:19:08 +00:00
135762ea20 Unpin helion (#159579)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159579
Approved by: https://github.com/jansel
2025-08-01 23:08:06 +00:00
e2ee9cfaa2 [NativeRT] Turn on enableStaticCPUKernels by default (#159422)
Summary: As title.

Test Plan:
Need to manually test on production models.

Rollback Plan:

Differential Revision: D78747742

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159422
Approved by: https://github.com/dolpm
2025-08-01 22:27:07 +00:00
06d28de17a Update CK Kernel generation and update ck submodule (#157964)
changes required to reduce the number of ck kernels generated. This change depends on https://github.com/ROCm/composable_kernel/pull/2480 to be merged first.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157964
Approved by: https://github.com/842974287
2025-08-01 22:24:27 +00:00
df9720b8b5 [MTIA Aten Backend] Migrate all foreach ops (#159098)
# Context

See the first PR https://github.com/pytorch/pytorch/pull/153670

# This diff

 Migrate all foreach operators to in-tree, including:
  - _foreach_abs
  - _foreach_abs_
  - _foreach_add.List
  - _foreach_add_.List
  - _foreach_add_.Scalar
  - _foreach_add_.Tensor
  - _foreach_addcmul.Scalar
  - _foreach_addcmul_.Scalar
  - _foreach_copy
  - _foreach_copy_
  - _foreach_mul.List
  - _foreach_mul_.List
  - _foreach_mul_.Scalar
  - _foreach_mul.Tensor
  - _foreach_mul_.Tensor
  - _foreach_norm.Scalar
  - _foreach_sqrt_

Differential Revision: [D78913847](https://our.internmc.facebook.com/intern/diff/D78913847/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159098
Approved by: https://github.com/malfet
2025-08-01 22:10:12 +00:00
85e74d5ace [inductor] Add logging for distributed collective ops for multi‑rank diagnostics (#159190)
This change introduces structured logging of the collective communication schedule, enabling downstream tools (e.g. TLParse) to ingest and analyze per‑rank collective‐order information for multi‑rank jobs.

- Iterates over scheduler.nodes, filters for _CollectiveKernel nodes
- Extracts each op’s python_kernel_name
- Emits a structured JSON payload under the inductor_collective_schedule artifact name
- Dumps the full schedule list to collective_schedule.json via the PyTorch trace‑structured artifact
- Added comprehensive unit tests for collective schedule tracing: Created test_collective_schedule_empty() and test_collective_schedule_real() tests to verify structured trace logging works correctly for both empty collective schedules and real collective operations (like all_reduce and wait_tensor from _c10d_functional ops).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159190
Approved by: https://github.com/yushangdi, https://github.com/xmfan
2025-08-01 21:51:42 +00:00
0450f05658 Output tensor meta data for FX graph node (#159311)
The FX graph segment in CompiledFxGraph does not include tensor meta data, for example, tensor shape, tensor stride, tensor data type, and tensor device. The AI system co-design team requested that this information be included in the FX graph segment so they can use the FX graph segment to project the performance on different hardware.
This DIFF is to modify the Graph::Node::format_node to include tensor meta data.
Before this DIFF, the triton kernel FX graph segment looks like the following:
```
# %mm : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=mm]
# %arg2_1 : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=arg2_1]
# %sin : Tensor "f32[4, 4][4, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%mm,), kwargs = {})
# %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%sin, [1, 0]), kwargs = {})
# %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg2_1, 1111), kwargs = {})
# %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%permute_1, %mul), kwargs = {})
# %cos : [num_users=1] = call_function[target=torch.ops.aten.cos.default](args = (%add,), kwargs = {})
# return %cos
```
After this DIFF:
```
# %mm : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=mm]
# %arg2_1 : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=arg2_1]
# %sin : Tensor "f32[4, 4][4, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%mm,), kwargs = {})
# %permute_1 : Tensor "f32[4, 4][1, 4]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%sin, [1, 0]), kwargs = {})
# %mul : Tensor "f32[4, 4][4, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg2_1, 1111), kwargs = {})
# %add : Tensor "f32[4, 4][1, 4]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%permute_1, %mul), kwargs = {})
# %cos : Tensor "f32[4, 4][1, 4]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cos.default](args = (%add,), kwargs = {})
# return %cos
```
If format_node can not be changed, I can copy the code to caffe2/torch/_inductor/utils.py.

Differential Revision: D77973076

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159311
Approved by: https://github.com/angelayi
2025-08-01 21:40:29 +00:00
595a65f5c2 [dynamo] Replace unimplemented with unimplemented_v2 in torch/_dynamo/variables/script_object.py (#159343)
Fixes part of #147913

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159343
Approved by: https://github.com/williamwen42

Co-authored-by: William Wen <william.wen42@gmail.com>
2025-08-01 21:30:41 +00:00
8c6c2e40eb Edit a test case to detect potential bugs in all-gathering noncontiguous inputs in the Gloo backend (#159542)
As suggested in the pull request #158903 by @H-huang, this pull request edits a test case to detect potential bugs in all-gathering noncontiguous inputs in the Gloo backend.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159542
Approved by: https://github.com/d4l3k, https://github.com/H-Huang
2025-08-01 21:20:25 +00:00
32840d19f9 [cutlass backend] skip stream k if shape is dynamic (#159442)
Differential Revision: [D79229210](https://our.internmc.facebook.com/intern/diff/D79229210/)

The motivation is that the workspace size is hard to determine and varies across shapes. I observed that the workspace can sometimes increase even when the shape gets smaller, so it is hard to put an upper bound on it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159442
Approved by: https://github.com/ColinPeppler
2025-08-01 20:42:24 +00:00
2040f00112 [BE][Easy] respect os.environ in subprocess calls in tools/nightly.py (#159572)
Respect parent shell's envvars, such as `UV_INDEX_STRATEGY`, `http{,s}_proxy`, etc.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159572
Approved by: https://github.com/Skylion007
2025-08-01 20:40:31 +00:00
c137f9da0b [Dynamo][Better Engineering] Add type coverage to dynamo/compiled_autograd.py (#159518)
As part of the better engineering effort, we would like to improve our type support to improve the dev experience in dynamo.

This PR adds strict typing support to `torch/_dynamo/compiled_autograd.py`

Running
```
mypy torch/_dynamo/compiled_autograd.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Annotated | Lines Total | % lines covered | Funcs Annotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  425 | 1553 | 27.37% | 17 | 62 | 27.42% |
| This PR | 1623 | 1623 | 100.00% | 62 | 62 | 100.00% |
| Delta    | +1198| +0 | +72.63% | +45 | 0 | +72.58% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159518
Approved by: https://github.com/xmfan
2025-08-01 20:24:58 +00:00
5e8b95605f [PP] Support OVERLAP_F_B computation type (#158978)
Some changes to validation code and visualizer to support a new computation type that will be used in DualPipeV (see https://github.com/pytorch/pytorch/pull/159591)

The IR looks like:

```
[0F0, 0F1, 0F2, 0F3, 0F4, 0F5, 0F6, 7F0, 7I0, 7W0, 7F1, 7I1, 7W1, 7F2, 7I2, 7W2, 7F3, (0F7;7B3)OVERLAP_F_B, (7F4;0B0)OVERLAP_F_B, (0F8;7B4)OVERLAP_F_B, (7F5;0B1)OVERLAP_F_B, (0F9;7B5)OVERLAP_F_B, (7F6;0B2)OVERLAP_F_B, 7B6, (7F7;0B3)OVERLAP_F_B, 7B7, (7F8;0B4)OVERLAP_F_B, 7B8, (7F9;0B5)OVERLAP_F_B, 7B9, 0I6, 0W6, 0I7, 0W7, 0I8, 0W8, 0I9, 0W9]
[1F0, 1F1, 1F2, 1F3, 1F4, 6F0, 1F5, 6F1, 6I0, 6W0, 6F2, 6I1, 6W1, 6F3, (1F6;6B2)OVERLAP_F_B, (6F4;1B0)OVERLAP_F_B, (1F7;6B3)OVERLAP_F_B, (6F5;1B1)OVERLAP_F_B, (1F8;6B4)OVERLAP_F_B, (6F6;1B2)OVERLAP_F_B, (1F9;6B5)OVERLAP_F_B, (6F7;1B3)OVERLAP_F_B, 6B6, (6F8;1B4)OVERLAP_F_B, 6B7, (6F9;1B5)OVERLAP_F_B, 6B8, 1B6, 6I9, 1I7, 6W9, 1I8, 1W7, 1I9, 1W8, 1W9]
[2F0, 2F1, 2F2, 5F0, 2F3, 5F1, 2F4, 5F2, 5I0, 5W0, 5F3, (2F5;5B1)OVERLAP_F_B, (5F4;2B0)OVERLAP_F_B, (2F6;5B2)OVERLAP_F_B, (5F5;2B1)OVERLAP_F_B, (2F7;5B3)OVERLAP_F_B, (5F6;2B2)OVERLAP_F_B, (2F8;5B4)OVERLAP_F_B, (5F7;2B3)OVERLAP_F_B, (2F9;5B5)OVERLAP_F_B, (5F8;2B4)OVERLAP_F_B, 5B6, (5F9;2B5)OVERLAP_F_B, 5B7, 2B6, 5B8, 2I7, 5I9, 2I8, 2W7, 2I9, 5W9, 2W8, 2W9]
[3F0, 4F0, 3F1, 4F1, 3F2, 4F2, 3F3, 4F3, 3F4, 4B0, (4F4;3B0)OVERLAP_F_B, (3F5;4B1)OVERLAP_F_B, (4F5;3B1)OVERLAP_F_B, (3F6;4B2)OVERLAP_F_B, (4F6;3B2)OVERLAP_F_B, (3F7;4B3)OVERLAP_F_B, (4F7;3B3)OVERLAP_F_B, (3F8;4B4)OVERLAP_F_B, (4F8;3B4)OVERLAP_F_B, (3F9;4B5)OVERLAP_F_B, (4F9;3B5)OVERLAP_F_B, 4B6, 3B6, 4B7, 3B7, 4I8, 3I8, 4I9, 3I9, 4W8, 3W8, 4W9, 3W9]
```

In this PR, the schedule execution will just treat the OVERLAP_F_B as two separate operations of F and B (so there is no actual overlap). The next step is to allow users to create a custom function to plug in what this operation does.

814629043a/torch/distributed/pipelining/schedules.py (L1205-L1216)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158978
Approved by: https://github.com/wconstab
2025-08-01 20:22:30 +00:00
8ea86a6e31 Actually test STD_TORCH_CHECK, add testfile to CMake (#159603)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159603
Approved by: https://github.com/Skylion007, https://github.com/albanD
2025-08-01 19:53:41 +00:00
acad808545 Revert "[inductor] consolidate common GEMM triton param retrieval (#159383)"
This reverts commit e7cc42df58a86bee05944f6e80c535aa1d099443.

Reverted https://github.com/pytorch/pytorch/pull/159383 on behalf of https://github.com/jataylo due to sorry but rocm CI is broken due to this PR ([comment](https://github.com/pytorch/pytorch/pull/159383#issuecomment-3145604831))
2025-08-01 19:49:21 +00:00
c687446374 Revert "Fix rand_like decomposition to preserve strides (#159294)"
This reverts commit 2c46922ce4b33c39b1c48c302604805510a3f889.

Reverted https://github.com/pytorch/pytorch/pull/159294 on behalf of https://github.com/yangw-dev due to breaking internal test ([comment](https://github.com/pytorch/pytorch/pull/159294#issuecomment-3145541845))
2025-08-01 19:19:51 +00:00
dd22ba09b4 [C10D] Document barrier interaction with device_id (#159389)
Addresses #159262

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159389
Approved by: https://github.com/malfet, https://github.com/H-Huang, https://github.com/kwen2501, https://github.com/fduwjj
2025-08-01 18:12:21 +00:00
c0e0126399 Remove unused input parameter in ExpandableSegment (#159356)
# Motivation
While refactoring the caching allocator, I noticed that the `ExpandableSegment` constructor on CUDA had an unused parameter. This change removes that unused argument to avoid potential confusion.

# Additional Context
I noticed that `ExpandableSegment` is defined in a cpp file, so it should be safe to make this change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159356
Approved by: https://github.com/ngimel, https://github.com/albanD
ghstack dependencies: #159159
2025-08-01 17:47:51 +00:00
e4b123b5e4 Revert direct updates (#159654)
reverts:
```

commit 5711a8f06948eeee56ed5f53f171fa519f78491c (tag: trunk/5711a8f06948eeee56ed5f53f171fa519f78491c, origin/main, main)
Author: Jovian Anthony Jaison <38627145+jovianjaison@users.noreply.github.com>
Date:   Fri Aug 1 09:32:52 2025 -0700

    Update test_utils.py

commit b4b71d011ed07a41c2086ff0dec2988a63662877 (tag: trunk/b4b71d011ed07a41c2086ff0dec2988a63662877)
Author: Jovian Anthony Jaison <38627145+jovianjaison@users.noreply.github.com>
Date:   Fri Aug 1 09:27:54 2025 -0700

    Update utils.py

commit 52376b9b6fbf9fe24f5d82038dc520f0c64b6f8d (tag: trunk/52376b9b6fbf9fe24f5d82038dc520f0c64b6f8d)
Author: Jovian Anthony Jaison <38627145+jovianjaison@users.noreply.github.com>
Date:   Fri Aug 1 09:26:05 2025 -0700
```

(commits pushed directly to main by mistake)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159654
Approved by: https://github.com/atalman
2025-08-01 16:54:51 +00:00
5711a8f069 Update test_utils.py 2025-08-01 09:32:52 -07:00
b4b71d011e Update utils.py 2025-08-01 09:27:54 -07:00
52376b9b6f Update convert_frame.py 2025-08-01 09:26:05 -07:00
1371a98b0e Migrate ScalarType to headeronly (#159416)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159416
Approved by: https://github.com/albanD
ghstack dependencies: #159415, #159411
2025-08-01 16:07:01 +00:00
2a286cbdf4 Allow register_buffer with Tensor-like object (#159455)
As torch allows extending the tensor with `__torch_function__`, it would be desirable to allow registering it as a buffer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159455
Approved by: https://github.com/mikaylagawarecki
2025-08-01 15:31:38 +00:00
7c37b8e1e0 [ROCm][Windows] Switch __builtin_clz ifdef from WIN32 to MSC_VER. (#159273)
PyTorch with ROCm on Windows is built with clang-cl and not MSVC. This code path is specific to the MSVC compiler so it should be checking for MSC_VER, not just WIN32. The change here is similar to https://github.com/pytorch/pytorch/pull/146606.

This fixes downstream build errors using clang-cl like https://github.com/ROCm/TheRock/actions/runs/16569646709/job/46858176812 (patched and tested downstream at https://github.com/ROCm/TheRock/pull/1140):
```
[7099/7147] Building CXX object functorch\CMakeFiles\functorch.dir\csrc\dim\dim.cpp.obj
FAILED: functorch/CMakeFiles/functorch.dir/csrc/dim/dim.cpp.obj
C:\home\runner\_work\_tool\Python\3.11.9\x64\Lib\site-packages\_rocm_sdk_devel\lib\llvm\bin\clang-cl.exe  /nologo -TP -DEXPORT_AOTI_FUNCTIONS -DFUNCTORCH_BUILD_MAIN_LIB -DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS -DNOMINMAX -DONNXIFI_ENABLE_EXT=1 -DONNX_ML=1 -DONNX_NAMESPACE=onnx_torch -DROCM_ON_WINDOWS -DROCM_USE_FLOAT16 -DROCM_VERSION=70000 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_C -DTORCH_HIP_VERSION=700 -DUSE_EXTERNAL_MZCRC -DUSE_MIMALLOC -DUSE_PROF_API=1 -DWIN32_LEAN_AND_MEAN -D_CRT_SECURE_NO_DEPRECATE=1 -D_UCRT_LEGACY_INFINITY -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_AMD__=1 -Dfunctorch_EXPORTS -IB:\src\torch\build\aten\src -IB:\src\torch\aten\src -IB:\src\torch\build -IB:\src\torch -IB:\src\torch\nlohmann -IB:\src\torch\moodycamel -IB:\src\torch\third_party\mimalloc\include -IB:\src\torch\functorch -IB:\src\torch\torch\csrc\api -IB:\src\torch\torch\csrc\api\include -IB:\src\torch\c10\.. -IB:\src\torch\c10\hip\..\.. -IB:\src\torch\torch\.. -IB:\src\torch\torch\..\aten\src -IB:\src\torch\torch\..\aten\src\TH -IB:\src\torch\build\caffe2\aten\src -IB:\src\torch\build\third_party -IB:\src\torch\build\third_party\onnx -IB:\src\torch\torch\..\third_party\valgrind-headers -IB:\src\torch\torch\..\third_party\gloo -IB:\src\torch\torch\..\third_party\onnx -IB:\src\torch\torch\..\third_party\flatbuffers\include -IB:\src\torch\torch\..\third_party\kineto\libkineto\include -IB:\src\torch\torch\..\third_party\cpp-httplib -IB:\src\torch\torch\..\third_party\nlohmann\include -IB:\src\torch\torch\csrc -IB:\src\torch\torch\lib -IB:\src\torch\torch\standalone -IB:\src\torch\torch\lib\libshm_windows -imsvcC:\home\runner\_work\_tool\Python\3.11.9\x64\Lib\site-packages\_rocm_sdk_devel\include -imsvcB:\src\torch\third_party\protobuf\src -imsvcB:\src\torch\third_party\XNNPACK\include -imsvcB:\src\torch\third_party\ittapi\include -imsvcB:\src\torch\cmake\..\third_party\eigen -imsvcB:\src\torch\third_party\ideep\mkl-dnn\include\oneapi\dnnl 
-imsvcB:\src\torch\third_party\ideep\include -imsvcB:\src\torch\INTERFACE -imsvcB:\src\torch\third_party\nlohmann\include -imsvcB:\src\torch\third_party\concurrentqueue -imsvcC:\home\runner\_work\_tool\Python\3.11.9\x64\Lib\site-packages\_rocm_sdk_devel\include\hiprand -imsvcC:\home\runner\_work\_tool\Python\3.11.9\x64\Lib\site-packages\_rocm_sdk_devel\include\rocrand -imsvcB:\src\torch\cmake\..\third_party\pybind11\include -imsvcC:\home\runner\_work\_tool\Python\3.11.9\x64\include /DWIN32 /D_WINDOWS /EHsc /Zc:__cplusplus /bigobj /FS /utf-8 -DUSE_PTHREADPOOL -DNDEBUG -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE /wd4624 /wd4068 /wd4067 /wd4267 /wd4661 /wd4717 /wd4244 /wd4804 /wd4273 /O2 /Ob2 /DNDEBUG /bigobj -DNDEBUG -std:c++17 -MD -Z7 -Wmissing-prototypes -Werror=missing-prototypes /permissive- /d2implyavx512upperregs- /EHsc /bigobj -fms-runtime-lib=dll -D__HIP_PLATFORM_AMD__=1 -DCUDA_HAS_FP16=1 -DUSE_ROCM -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -DTORCH_HIP_VERSION=700 -Wno-shift-count-negative -Wno-shift-count-overflow -Wno-duplicate-decl-specifier -DCAFFE2_USE_MIOPEN -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP -std=c++17 -DHIPBLAS_V2 -DHIP_ENABLE_WARP_SYNC_BUILTINS -fms-extensions -Wno-ignored-attributes /showIncludes /Fofunctorch\CMakeFiles\functorch.dir\csrc\dim\dim.cpp.obj /Fdfunctorch\CMakeFiles\functorch.dir\ -c -- B:\src\torch\functorch\csrc\dim\dim.cpp
clang-cl: warning: unknown argument ignored in clang-cl: '-std=c++17' [-Wunknown-argument]
clang-cl: warning: argument unused during compilation: '/d2implyavx512upperregs-' [-Wunused-command-line-argument]
In file included from B:\src\torch\functorch\csrc\dim\dim.cpp:36:
B:\src\torch\functorch\csrc\dim\arena.h(14,21): error: functions that differ only in their return type cannot be overloaded
   14 | inline unsigned int __builtin_clz(unsigned int x) {
      |        ~~~~~~~~~~~~ ^
C:\home\runner\_work\_tool\Python\3.11.9\x64\Lib\site-packages\_rocm_sdk_devel\lib\llvm\lib\clang\20\include\ia32intrin.h(60,15): note: '__builtin_clz' is a builtin with type 'int (unsigned int) noexcept'
   60 |   return 31 - __builtin_clz((unsigned int)__A);
      |               ^
1 error generated.
[7100/7147] Building CXX object caffe2\torch\CMakeFiles\torch_python.dir\csrc\utils\tensor_list.cpp.obj
```

> [!NOTE]
> I haven't been able to reproduce those errors locally, but we have CI jobs that consistently fail when building for Python 3.11 but not 3.12 or 3.13. I'm not sure what is different between those builds, but the code fix seems correct.

There are a few other variations on fixes to this floating around, such as:
* a97a957af0/lz4.c (L34-L43) (checking with `__has_builtin`)
* c98c55ec7e/lj92.c (L31-L46) (the same code as here, but with `_MSC_VER`)
* 2760e5a2bb/def.h (L23-L25) (using `__lzcnt` instead of a custom implementation)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159273
Approved by: https://github.com/Skylion007, https://github.com/m-gallus
2025-08-01 15:21:26 +00:00
ee2649219c Fix max_width computation in _tensor_str._Formatter (#126859)
Previous version of `torch._tensor_str._Formatter` was not using `PRINT_OPTS.sci_mode` for the `max_width` computation but was using it for the formatting of values leading to a weird discrepancy.

Now, the code first checks whether it should be in sci_mode, and then computes `max_width`.

Here is an example to test the behavior:
```python
A = torch.tensor([10, 1e-1, 1e-2])
B = torch.tensor([10, 1e-1, 1e-1])

print("================= Default =================")
print(A, f"Formatter max_width: {torch._tensor_str._Formatter(A).max_width}")
print(B, f"Formatter max_width: {torch._tensor_str._Formatter(B).max_width}")

print("================= sci_mode=False =================")
with torch._tensor_str.printoptions(sci_mode=False):
    print(A, f"Formatter max_width: {torch._tensor_str._Formatter(A).max_width}")
    print(B, f"Formatter max_width: {torch._tensor_str._Formatter(B).max_width}")

print("================= sci_mode=True =================")
with torch._tensor_str.printoptions(sci_mode=True):
    print(A, f"Formatter max_width: {torch._tensor_str._Formatter(A).max_width}")
    print(B, f"Formatter max_width: {torch._tensor_str._Formatter(B).max_width}")
```

In the current version this prints:
```
================= Default =================
tensor([1.0000e+01, 1.0000e-01, 1.0000e-02]) Formatter max_width: 10
tensor([10.0000,  0.1000,  0.1000]) Formatter max_width: 7
================= sci_mode=False =================
tensor([   10.0000,     0.1000,     0.0100]) Formatter max_width: 10
tensor([10.0000,  0.1000,  0.1000]) Formatter max_width: 7
================= sci_mode=True =================
tensor([1.0000e+01, 1.0000e-01, 1.0000e-02]) Formatter max_width: 10
tensor([1.0000e+01, 1.0000e-01, 1.0000e-01]) Formatter max_width: 7
```

One can see that with `sci_mode=False`, the values of A are prefixed with unneeded padding and do not have the same `max_width` as B (A keeps the `max_width` from `sci_mode=None`).

Also, with `sci_mode=True`, the `max_width` for B is 7 even though each value takes 10 characters. (This is harmless, as the code that uses `max_width` does not rely on it much, but it is still misleading.)

After this commit, this will print
```
================= Default =================
tensor([1.0000e+01, 1.0000e-01, 1.0000e-02]) Formatter max_width: 10
tensor([10.0000,  0.1000,  0.1000]) Formatter max_width: 7
================= sci_mode=False =================
tensor([10.0000,  0.1000,  0.0100]) Formatter max_width: 7
tensor([10.0000,  0.1000,  0.1000]) Formatter max_width: 7
================= sci_mode=True =================
tensor([1.0000e+01, 1.0000e-01, 1.0000e-02]) Formatter max_width: 10
tensor([1.0000e+01, 1.0000e-01, 1.0000e-01]) Formatter max_width: 10
```

This also makes the output of A align with that of B for `sci_mode=False`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126859
Approved by: https://github.com/malfet
2025-08-01 15:05:41 +00:00
b0b3e6e48b [PP] Refactor test_schedule_multiproc (#158780)
This refactors the pipelining schedule tests since a lot of them have the same repeated code of:
1. Create pipelined model and reference model
2. Run reference model and pipelined model
3. compare gradients

So this refactors those parts above into helper methods and reduces ~300 LOC. Also adds a better gradient check to resolve flakiness (fixes https://github.com/pytorch/pytorch/issues/154408).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158780
Approved by: https://github.com/wconstab
2025-08-01 15:02:18 +00:00
3967dbedf4 [ContextParallel][FlexAttention] Prototype of supporting FlexAttention in Context Parallel (#158692)
**Summary**
This PR adds an all-gather based FlexAttention and uses TorchFunctionMode to dispatch
`FlexAttentionHOP.__call__` to it.

This PR makes the following changes:

- add a user-facing API `create_cp_block_mask` for creating CP-specific `BlockMask`
which masks over the attention result of Q shard and KV global.
- add `_ContextParallelGlobalVars` to store all necessary global vars that CP FlexAttention
requires. `torch_function_mode` is critical to maintain singleton mode to avoid dynamo
recompilations.
- add a dispatch path for `FlexAttentionForwardHOP.__call__` (TorchFunctionMode dispatch
won't work correctly without this line)

What's not in this PR:
- QKV load balancing
- Test on other masking besides `causal_mask`.
- Support on small attention (i.e. qkv size is smaller than 128) because the block mask
rewrite function requires `Q_BLOCK_SIZE == KV_BLOCK_SIZE == 128`.

**Test**
`pytest test/distributed/tensor/test_attention.py -s -k test_ring_flex_attention`

**Followup**
1. create an issue to reproduce the error in `create_fw_bw_graph()` when trying to call `create_block_mask`
to re-write `block_mask` in `FlexAttentionHOP` dispatch in `TorchFunctionMode`.
2. Merge `_ContextParallelGlobalVars` and `_cp_options`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158692
Approved by: https://github.com/drisspg
2025-08-01 06:49:01 +00:00
4396b15aa7 remove co_lnotab in favor of co_linetable (#159227)
Fixes #158833
DeprecationWarning: remove co_lnotab in favor of co_linetable

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159227
Approved by: https://github.com/ezyang
2025-08-01 06:34:38 +00:00
bb6766053b fix strategy hashing arg mismatch (#159506)
Reland https://github.com/pytorch/pytorch/pull/159289.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159506
Approved by: https://github.com/XilunWu
2025-08-01 05:42:40 +00:00
a4fc051c9a Fix a bug of distributed 'gather' with noncontiguous tensors on the NCCL backend. (#159549)
Fixes #159548

* Throw an error message when the input tensors for the distributed `gather` are noncontiguous. This behaviour is consistent with the distributed `all_gather`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159549
Approved by: https://github.com/d4l3k
2025-08-01 03:26:06 +00:00
5cc6a0abc1 Revert "Refactor CUDAAllocatorConfig to reuse AcceleratorAllocatorConfig (#150312)"
This reverts commit dfacf11f66d6512396382bdf5088f0ba9de00406.

Reverted https://github.com/pytorch/pytorch/pull/150312 on behalf of https://github.com/guangyey due to Static initialization order issue impact the downstream repo ([comment](https://github.com/pytorch/pytorch/pull/150312#issuecomment-3142035444))
2025-08-01 03:24:54 +00:00
90f13f3b2a Revert "Deprecate overleap functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165)"
This reverts commit 1fc010a9d8ea95bb74e54b31d17eba56ef16c27c.

Reverted https://github.com/pytorch/pytorch/pull/156165 on behalf of https://github.com/guangyey due to Static initialization order issue impact the downstream repo ([comment](https://github.com/pytorch/pytorch/pull/150312#issuecomment-3142035444))
2025-08-01 03:24:54 +00:00
cb9b74872b Revert "Generalize torch._C._set_allocator_settings to be generic (#156175)"
This reverts commit d3ce45012ed42cd1e13d5048b046b781f0feabe0.

Reverted https://github.com/pytorch/pytorch/pull/156175 on behalf of https://github.com/guangyey due to Static initialization order issue impact the downstream repo ([comment](https://github.com/pytorch/pytorch/pull/150312#issuecomment-3142035444))
2025-08-01 03:24:54 +00:00
c964204829 [CI] Disable executorch jobs (#159595)
The current executorch pin needs to be updated

The next time the docker image gets rebuilt, the executorch docker build is going to fail like https://github.com/pytorch/pytorch/actions/runs/16626853655/job/47137807966

The failure is that the pin uses a version of the nightly that has been removed from the nightly index
```
#62 72.30 ERROR: Could not find a version that satisfies the requirement torch==2.8.0.dev20250601 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0.dev20250602+cpu, 2.8.0.dev20250603+cpu, 2.8.0.dev20250604+cpu, 2.8.0.dev20250605+cpu, 2.8.0.dev20250606+cpu, 2.8.0.dev20250607+cpu, 2.8.0.dev20250608+cpu, 2.8.0.dev20250609+cpu, 2.8.0.dev20250610+cpu, 2.8.0.dev20250611+cpu, 2.8.0.dev20250612+cpu, 2.8.0.dev20250613+cpu, 2.8.0.dev20250614+cpu, 2.8.0.dev20250615+cpu, 2.8.0.dev20250616+cpu, 2.8.0.dev20250617+cpu, 2.8.0.dev20250618+cpu, 2.8.0.dev20250619+cpu, 2.8.0.dev20250620+cpu, 2.8.0.dev20250621+cpu, 2.8.0.dev20250622+cpu, 2.8.0.dev20250623+cpu, 2.8.0.dev20250624+cpu, 2.8.0.dev20250625+cpu, 2.8.0.dev20250626+cpu, 2.8.0.dev20250627+cpu, 2.9.0.dev20250628+cpu, 2.9.0.dev20250629+cpu, 2.9.0.dev20250630+cpu, 2.9.0.dev20250701+cpu, 2.9.0.dev20250702+cpu, 2.9.0.dev20250703+cpu, 2.9.0.dev20250704+cpu, 2.9.0.dev20250705+cpu, 2.9.0.dev20250706+cpu, 2.9.0.dev20250707+cpu, 2.9.0.dev20250708+cpu, 2.9.0.dev20250709+cpu, 2.9.0.dev20250710+cpu, 2.9.0.dev20250711+cpu, 2.9.0.dev20250712+cpu, 2.9.0.dev20250713+cpu, 2.9.0.dev20250714+cpu, 2.9.0.dev20250715+cpu, 2.9.0.dev20250716+cpu, 2.9.0.dev20250717+cpu, 2.9.0.dev20250718+cpu, 2.9.0.dev20250719+cpu, 2.9.0.dev20250720+cpu, 2.9.0.dev20250722+cpu, 2.9.0.dev20250723+cpu, 2.9.0.dev20250724+cpu, 2.9.0.dev20250725+cpu, 2.9.0.dev20250726+cpu, 2.9.0.dev20250727+cpu, 2.9.0.dev20250728+cpu, 2.9.0.dev20250729+cpu, 2.9.0.dev20250730+cpu, 2.9.0.dev20250731+cpu)
#62 72.30 ERROR: No matching distribution found for torch==2.8.0.dev20250601
```

The executorch hash update currently fails due to https://github.com/pytorch/pytorch/actions/runs/16636773244/job/47079169392
```
2025-07-31T01:56:57.0249165Z + echo 'expecting triton to not be installed, but it is'
2025-07-31T01:56:57.0249614Z expecting triton to not be installed, but it is
2025-07-31T01:56:57.0249969Z + exit 1
2025-07-31T01:58:27.6764352Z ##[error]Final attempt failed. Child_process exited with error code 1
```
I believe the cause is https://github.com/pytorch/executorch/pull/11653, where the nightly pytorch is installed from our index, but then requirements-examples installs timm from pypi, which reinstalls pytorch, except it's the release build for cuda from pypi? That then causes triton to be installed.

I don't know what the intended behavior is so I'm disabling the executorch docker build, executorch build, and the nightly hash update, and apparently the test was already disabled because it was failing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159595
Approved by: https://github.com/malfet
2025-08-01 02:18:03 +00:00
2ac45c2752 Fix autocast context manager when there is exception (#159565)
Summary: When an exception occurs inside a context manager, we need to either return False or properly propagate the exception via __exit__(exc_type, exc_val). Previously, while tracing, we did not actually run the exit node, so we ended up swallowing the exception in a very weird way, as outlined in https://github.com/pytorch/pytorch/issues/153202. This PR fixes it.

Test Plan:
new test case

Rollback Plan:

Differential Revision: D79348382

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159565
Approved by: https://github.com/zou3519, https://github.com/yushangdi
2025-08-01 02:12:24 +00:00
83e2ea8135 [CPU] fix _weight_int8pack_mm with large output shape (#158341)
**Summary**
`_weight_int8pack_mm` on CPU may cause a segmentation fault if the output shape is large (i.e., M * N is large). This is because the kernel computes the output buffer address by
```c++
auto* C_ptr = C_data + mb_start * N + nb_start;
```
where both `mb_start` and `N` are `int` and when they are large their product may overflow.
The solution is simple: declare these variables as `int64_t` so that the product won't overflow.

**Test plan**
```
pytest -sv test/test_linalg.py -k test__int8_mm_large_shape
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158341
Approved by: https://github.com/mingfeima, https://github.com/drisspg
2025-08-01 01:55:48 +00:00
d994027a41 [Doc fix] fix spelling of enough (#159587)
Fixes the misspelling `enought` (correct: `enough`) in 3 places in these files:
```
aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu
aten/src/ATen/native/cuda/CuFFTPlanCache.h
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159587
Approved by: https://github.com/ezyang
2025-08-01 01:50:57 +00:00
cb4f41e125 Revert "[dynamo] [guard] Add caching for inside torch.compile.disable function to avoid unnecessary recompilation. (#157566)"
This reverts commit 8e07c9870d07c5a318ab21bb16b3fa27576851e6.

Reverted https://github.com/pytorch/pytorch/pull/157566 on behalf of https://github.com/yangw-dev due to failed an odd internal test, please reach out to metamate to fix it, D79112610 ([comment](https://github.com/pytorch/pytorch/pull/157566#issuecomment-3141840110))
2025-08-01 01:27:45 +00:00
690fc9cf88 [merge_rules] add some expected failure and skips (#159581)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159581
Approved by: https://github.com/anijain2305
2025-08-01 01:18:40 +00:00
eb853e222b [cutlass upgrade] Ignore unused-but-set-variable for AsyncMM.cu (#159578)
Fixes inductor-perf-nightly-h100. This was caused by cutlass upgrade https://github.com/pytorch/pytorch/pull/158854. I missed it in https://github.com/pytorch/pytorch/pull/159276

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159578
Approved by: https://github.com/Skylion007
2025-08-01 00:10:59 +00:00
06395276e4 Remove dynamo_timed from the CachingAutotuner.coordinate_descent_tuning() hot path. (#159588)
Summary: When coordinate_descent_tuning==True, CachingAutotuner.coordinate_descent_tuning() is called for every call of CachingAutotuner.run() (at least for Triton templates), but immediately returns the launcher. Move the dynamo_timed call after the check for triton template so we don't incur the context manager overhead on every call.

Fixes https://github.com/pytorch/pytorch/issues/159525

Test Plan: Used the repro in https://github.com/pytorch/pytorch/issues/159525 to make sure the overhead goes away.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159588
Approved by: https://github.com/eellison
2025-07-31 23:33:10 +00:00
8becf646ef [dynamo] Make filter handle None as filter function (#159500)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159500
Approved by: https://github.com/guilhermeleobas, https://github.com/zou3519
ghstack dependencies: #158774, #159102
2025-07-31 23:28:57 +00:00
fa68216ca1 [itertools] Implement itertools.cycle with a polyfill (#159102)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159102
Approved by: https://github.com/guilhermeleobas, https://github.com/zou3519
ghstack dependencies: #158774
2025-07-31 23:28:57 +00:00
25ef3d315d [aoti][mps] Dynamic reductions (#159355)
Dynamic kernel:
```cpp
[[max_total_threads_per_threadgroup(1024)]]
kernel void generated_kernel(
    device float* out_ptr0,
    constant float* in_ptr0,
    constant long& r0_numel,
    uint2 thread_pos [[thread_position_in_grid]],
    uint2 group_pos [[thread_position_in_threadgroup]]
) {
    auto xindex = thread_pos.x;
    auto r0_index = thread_pos.y;
    int x0 = xindex;
    threadgroup float tmp_acc_0[32];
    float tmp_acc_1 = 0;
    for(auto r0_1_cnt = 0; r0_1_cnt < static_cast<int>(metal::floor(static_cast<float>(0.99902343750000000 + 0.00097656250000000000*r0_numel))); ++r0_1_cnt) {
        int r0_1 = 1024 * r0_1_cnt + r0_index;
        if (r0_1 >= r0_numel) break;
        auto tmp0 = in_ptr0[x0 + 5*r0_1];
        tmp_acc_1 += tmp0;
    }
    auto tmp1 = c10::metal::threadgroup_sum(tmp_acc_0, tmp_acc_1, r0_index * 1, metal::min(static_cast<decltype(1024+r0_numel)>(1024), static_cast<decltype(1024+r0_numel)>(r0_numel)));
    if (r0_index == 0) out_ptr0[x0] = static_cast<float>(tmp1);
}

void AOTInductorModel::run_impl(...) {
    ...
    auto arg0_1_size = arg0_1.sizes();
    int64_t s77 = arg0_1_size[0];
    inputs.clear();
    [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
    static constexpr int64_t int_array_0[] = {5LL, };
    static constexpr int64_t int_array_1[] = {1LL, };
    AtenTensorHandle buf0_handle;
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_mps, this->device_idx_, &buf0_handle));
    RAIIAtenTensorHandle buf0(buf0_handle);
    auto mps_lib_0_func = mps_lib_0.getKernelFunction("generated_kernel");
    auto mps_lib_0_func_handle = AOTIMetalKernelFunctionHandle(mps_lib_0_func.get());
    mps_lib_0_func->runCommandBlock([&] {
        mps_lib_0_func->startEncoding();
        aoti_torch_mps_set_arg_tensor(mps_lib_0_func_handle, 0, buf0);
        aoti_torch_mps_set_arg_tensor(mps_lib_0_func_handle, 1, arg0_1);
        aoti_torch_mps_set_arg_int(mps_lib_0_func_handle, 2, s77);
        mps_lib_0_func->dispatch({static_cast<uint64_t>(5LL), static_cast<uint64_t>(std::min(static_cast<int64_t>(1024LL), static_cast<int64_t>(s77)))}, {static_cast<uint64_t>(1), static_cast<uint64_t>(std::min(static_cast<int64_t>(1024LL), static_cast<int64_t>(s77)))});

    });
    arg0_1.reset();
    output_handles[0] = buf0.release();
} // AOTInductorModel::run_impl
```

Static kernel:
```cpp
kernel void generated_kernel(
    device float* out_ptr0,
    constant float* in_ptr0,
    uint xindex [[thread_position_in_grid]]
) {
    int x0 = xindex;
    auto tmp0 = in_ptr0[x0];
    auto tmp1 = in_ptr0[5 + x0];
    auto tmp3 = in_ptr0[10 + x0];
    auto tmp5 = in_ptr0[15 + x0];
    auto tmp2 = tmp0 + tmp1;
    auto tmp4 = tmp2 + tmp3;
    auto tmp6 = tmp4 + tmp5;
    out_ptr0[x0] = static_cast<float>(tmp6);
}

void AOTInductorModel::run_impl(...) {
    ...
    static constexpr int64_t int_array_0[] = {5LL, };
    static constexpr int64_t int_array_1[] = {1LL, };
    AtenTensorHandle buf0_handle;
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_mps, this->device_idx_, &buf0_handle));
    RAIIAtenTensorHandle buf0(buf0_handle);
    auto mps_lib_0_func = mps_lib_0.getKernelFunction("generated_kernel");
    auto mps_lib_0_func_handle = AOTIMetalKernelFunctionHandle(mps_lib_0_func.get());
    mps_lib_0_func->runCommandBlock([&] {
        mps_lib_0_func->startEncoding();
        aoti_torch_mps_set_arg_tensor(mps_lib_0_func_handle, 0, buf0);
        aoti_torch_mps_set_arg_tensor(mps_lib_0_func_handle, 1, arg0_1);
        mps_lib_0_func->dispatch({static_cast<uint64_t>(5LL)});

    });
    arg0_1.reset();
    output_handles[0] = buf0.release();
} // AOTInductorModel::run_impl
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159355
Approved by: https://github.com/malfet
2025-07-31 23:15:02 +00:00
7e00f2ec9d [AOTI] add zero size consts asm handler (#159225)
Add `get_zero_consts_asm_code` to handle zero size consts to object.
This function is used to handle zero consts situation. Because cpp standard does not allow zero size array:
https://stackoverflow.com/questions/9722632/what-happens-if-i-define-a-0-size-array-in-c-c
1. On Windows, MSVC will report error C2466:
https://learn.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2466?view=msvc-170
So, we can use an assembly compiler to handle this situation.
2. On Windows, why not use Win32 asm to handle all path? Because ml64 only supports up to align `16`, it is
not aligned to pytorch's `64`. Reference: https://learn.microsoft.com/en-us/cpp/assembler/masm/ml-and-ml64-command-line-reference?view=msvc-170
```
Packs structures on the specified byte boundary. The alignment can be 1, 2, 4, 8, or 16.
```
3. This function can handle the zero-size case on both Windows and Linux, because:
    A. On Linux, we added `-pedantic` to disable zero size array on C++ compiler. 8e07c9870d/torch/_inductor/cpp_builder.py (L580)
    B. On Windows, MSVC does not support zero-size arrays by default.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159225
Approved by: https://github.com/desertfire
2025-07-31 22:46:33 +00:00
490cb3f1a4 Revert "[inductor] Add logging for distributed collective ops for multi‑rank diagnostics (#159190)"
This reverts commit bb62e1f769ef51e2ec149d7256c135d09425aaa0.

Reverted https://github.com/pytorch/pytorch/pull/159190 on behalf of https://github.com/clee2000 due to broke [GH job link](https://github.com/pytorch/pytorch/actions/runs/16658705097/job/47150840171) [HUD commit link](bb62e1f769) on mac ([comment](https://github.com/pytorch/pytorch/pull/159190#issuecomment-3141513921))
2025-07-31 22:22:13 +00:00
b95cf5c91d Move complex to headeronly (#159411)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159411
Approved by: https://github.com/albanD
ghstack dependencies: #159415
2025-07-31 22:05:43 +00:00
5e2ef2a465 Move Float8 variations to headeronly (#159415)
This PR is a big copy pasta from `c10/util/Float8*` -> `torch/headeronly/util/` which is why we are breaking PR sanity :C (sorry @albanD!).

Why is it not a clean copy paste?
- For BC reasons, we have to keep the old c10 file around so that OSS devs relying on those files can still get the same APIs
- Because we reexpose APIs that are headeronly through torch::headeronly, so there is an extra chunk of code in the new torch::headeronly files to do that.

Outside of the copy paste, I:
- changed the tests to call torch::headeronly instead of c10
- updated header_only_apis.txt
- added `// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)` to pass lint (which was previously skipped for -inl.h files)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159415
Approved by: https://github.com/albanD
2025-07-31 22:05:43 +00:00
9f753f8c0d [DTensor] Improve sort strategy (#159189)
- Sort strategy now supports sharding on non sorted dim.
~~- Fix histc xfail.~~
  - ~~Previously `python test/distributed/tensor/test_dtensor_ops.py TestDTensorOpsCPU.test_dtensor_op_db_histc_cpu_float32` will fail with `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=18`. However, if we run `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=18 python test/distributed/tensor/test_dtensor_ops.py TestDTensorOpsCPU.test_dtensor_op_db_histc_cpu_float32`, the test will pass. This kind of error is due to DTensor reuses the strategy schema hashing. It turns out that not only the strategy,  the result correctness also depends on `static_argnum` or the op will reuse the previous args from hashed schema and output wrong results. I updated the document also.~~ (fixed in https://github.com/pytorch/pytorch/pull/159289)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159189
Approved by: https://github.com/XilunWu
2025-07-31 21:52:42 +00:00
db437690d1 Add myself as a reviewer for when someone touches headeronly or stable (#159583)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159583
Approved by: https://github.com/mikaylagawarecki
2025-07-31 21:30:05 +00:00
669009bcd1 [inductor] respect layout tags for ops with registered lowerings (#159134)
scaled_grouped_mm's kernel only supports column-major on the second operand. I -think- this is just for efficiency reasons. But inductor treats that buffer as flexible and may tweak the strides to be row-major instead, as seen in the issue.

~Tagging the op as "needs_fixed_stride_order"/"needs_exact_strides" does not work. Inductor only considers those tags for ops that don't have registered lowering (not sure if this is intended). scaled_grouped_mm does have a lowering, so we never check its tags.~ From discussion below, the op tags are expected to work.

FIXES https://github.com/pytorch/pytorch/issues/159097

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159134
Approved by: https://github.com/eellison
2025-07-31 21:29:40 +00:00
e4e2701429 Add the RunLLM widget to the website (#152055)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152055
Approved by: https://github.com/albanD
2025-07-31 20:53:53 +00:00
64cc649275 [itertools] Fix accumulate (#158774)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158774
Approved by: https://github.com/guilhermeleobas, https://github.com/zou3519
2025-07-31 20:32:02 +00:00
b1fb552974 Revert "Fix ep deepcopy when there is python builitin name (#159478)"
This reverts commit de7376537f2a11783169fee2b3bc276d266898bf.

Reverted https://github.com/pytorch/pytorch/pull/159478 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/159478#issuecomment-3141228423))
2025-07-31 20:20:53 +00:00
bb62e1f769 [inductor] Add logging for distributed collective ops for multi‑rank diagnostics (#159190)
This change introduces structured logging of the collective communication schedule, enabling downstream tools (e.g. TLParse) to ingest and analyze per‑rank collective‐order information for multi‑rank jobs.

- Iterates over scheduler.nodes, filters for _CollectiveKernel nodes
- Extracts each op’s python_kernel_name
- Emits a structured JSON payload under the inductor_collective_schedule artifact name
- Dumps the full schedule list to collective_schedule.json via the PyTorch trace‑structured artifact
- Added comprehensive unit tests for collective schedule tracing: Created test_collective_schedule_empty() and test_collective_schedule_real() tests to verify structured trace logging works correctly for both empty collective schedules and real collective operations (like all_reduce and wait_tensor from _c10d_functional ops).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159190
Approved by: https://github.com/yushangdi, https://github.com/xmfan
2025-07-31 19:58:07 +00:00
282 changed files with 12924 additions and 8703 deletions

View File

@ -1 +1 @@
11ec6354315768a85da41032535e3b7b99c5f706
f7888497a1eb9e98d4c07537f0d0bcfe180d1363

View File

@ -103,5 +103,5 @@ fi
# It depends on torch and triton. We don't want to install
# triton and torch from production on Docker CI images
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
pip_install helion==0.0.10 --no-deps
pip_install helion --no-deps
fi

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@ -50,7 +50,7 @@ IPython==8.12.0
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs

31
.ci/lumen_cli/README.md Normal file
View File

@ -0,0 +1,31 @@
# 🔧 Lumen_cli
A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.
## Features
- **Build**
- external projects (e.g. vLLM)
## 📦 Installation
at the root of the pytorch repo
```bash
pip install -e .ci/lumen_cli
```
## Run the cli tool
The CLI tool must be run from the root of the PyTorch repo. For example, to build the external vLLM target:
```bash
python -m cli.run build external vllm
```
This runs the build steps with the default behaviour for the vLLM project.
To see the help messages, run:
```bash
python3 -m cli.run --help
```
## Add customized external build logics
To add build logic for a new external target:
1. create the build function in cli/lib folder
2. register your target and the main build function at EXTERNAL_BUILD_TARGET_DISPATCH in `cli/build_cli/register_build.py`
3. [optional] create your ci config file in .github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml

View File

@ -0,0 +1,37 @@
import argparse
import logging
from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
from cli.lib.core.vllm import VllmBuildRunner
logger = logging.getLogger(__name__)
# Maps each external build target to its runner class and help text.
# Every entry becomes available as `python -m cli.run build external {target}`.
_TARGETS: dict[str, TargetSpec] = {
"vllm": {
"runner": VllmBuildRunner,
"help": "Build vLLM using docker buildx.",
}
# add yours ...
}
def register_build_commands(subparsers: argparse._SubParsersAction) -> None:
    """
    Register the ``build`` command and its ``external`` sub-command.

    Args:
        subparsers: sub-parser collection of the parent parser; the ``build``
            parser is added to it.
    """
    build_cmd = subparsers.add_parser(
        "build", help="Build related commands", formatter_class=RichHelp
    )
    build_kinds = build_cmd.add_subparsers(dest="build_command", required=True)

    # One line per registered target, listed in the `external` description.
    target_lines = []
    for target_name, target_spec in _TARGETS.items():
        target_lines.append(f" {target_name:12} {target_spec.get('help', '')}")
    description = "Build third-party targets.\n\nAvailable targets:\n" + "\n".join(
        target_lines
    )

    external_cmd = build_kinds.add_parser(
        "external",
        help="Build external targets",
        description=description,
        formatter_class=RichHelp,
    )
    register_targets(external_cmd, _TARGETS)

View File

@ -0,0 +1,71 @@
"""
Cli Argparser Utility helpers for CLI tasks.
"""
import argparse
from abc import ABC, abstractmethod
try:
from typing import Any, Callable, Required, TypedDict # Python 3.11+
except ImportError:
from typing import Any, Callable, TypedDict
from typing_extensions import Required # Fallback for Python <3.11
class BaseRunner(ABC):
    """Abstract base for CLI runners: stores the parsed args, subclasses implement run()."""

    def __init__(self, args: Any) -> None:
        """Keep the parsed command-line arguments on the instance."""
        self.args = args

    @abstractmethod
    def run(self) -> None:
        """Execute the runner's main logic; every concrete runner must override this."""
# Help formatter that preserves newlines in descriptions and shows argument defaults.
class RichHelp(
    argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
):
    """Combination of argparse's defaults-showing and raw-description formatters."""
class TargetSpec(TypedDict, total=False):
    """
    Specification of one CLI target sub-command.

    Only ``runner`` is required; all other keys are optional.
    """

    # Runner class; it is instantiated with the parsed args and its run() is called.
    runner: Required[type[BaseRunner]]
    # Short help text for the target.
    help: str
    # Longer description for the target's own help page.
    description: str
    # Optional hook that adds target-specific arguments to the target's parser.
    add_arguments: Callable[[argparse.ArgumentParser], None]
def register_targets(
    parser: argparse.ArgumentParser,
    target_specs: dict[str, TargetSpec],
    common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None,
) -> None:
    """
    Add one sub-command per target to ``parser``.

    Args:
        parser: parser that receives the ``target`` sub-commands.
        target_specs: mapping of target name to its specification.
        common_args: hook called on every target parser to add shared arguments.
    """
    target_names = ",".join(target_specs.keys())
    subcommands = parser.add_subparsers(
        dest="target",
        required=True,
        metavar="{" + target_names + "}",
    )

    for target_name, target_spec in target_specs.items():
        runner_cls = target_spec["runner"]
        # Fall back to the runner's docstring when no description is given.
        description = target_spec.get("description") or runner_cls.__doc__ or ""
        target_parser = subcommands.add_parser(
            target_name,
            help=target_spec.get("help", ""),
            description=description.strip(),
            formatter_class=RichHelp,
        )
        # Bind the runner class as a default argument so each target keeps its own class.
        target_parser.set_defaults(
            func=lambda args, cls=runner_cls: cls(args).run(),
            _runner_class=runner_cls,
        )

        extra_args_hook = target_spec.get("add_arguments")
        if callable(extra_args_hook):
            extra_args_hook(target_parser)
        if common_args:
            common_args(target_parser)

View File

@ -0,0 +1,42 @@
"""
Docker Utility helpers for CLI tasks.
"""
import logging
from typing import Optional
import docker
from docker.errors import APIError, NotFound
logger = logging.getLogger(__name__)
# lazy singleton so we don't reconnect every call
_docker_client: Optional[docker.DockerClient] = None
def _get_client() -> docker.DockerClient:
    """Return the module-wide Docker client, creating it from the environment on first use."""
    global _docker_client
    if _docker_client is not None:
        return _docker_client
    _docker_client = docker.from_env()
    return _docker_client
def local_image_exists(
    image_name: str, client: Optional[docker.DockerClient] = None
) -> bool:
    """
    Tell whether ``image_name`` can be looked up through the Docker client.

    Args:
        image_name: image reference to look up; an empty name yields False.
        client: Docker client to use; the shared module client is used when omitted.

    Returns:
        True when the lookup succeeds; False when the name is empty or the
        lookup raises NotFound/APIError (the error is logged).
    """
    if not image_name:
        return False

    docker_client = client if client else _get_client()
    try:
        docker_client.images.get(image_name)
    except (NotFound, APIError) as err:
        reason = err.explanation if hasattr(err, "explanation") else str(err)
        logger.error(
            "Error when checking Docker image '%s': %s",
            image_name,
            reason,
        )
        return False
    return True

View File

@ -0,0 +1,110 @@
"""
Environment Variables and Dataclasses Utility helpers for CLI tasks.
"""
import os
from dataclasses import field, fields, is_dataclass, MISSING
from pathlib import Path
from textwrap import indent
from typing import Optional, Union
from cli.lib.common.utils import str2bool
def get_env(name: str, default: str = "") -> str:
    """Return the value of environment variable ``name``, or ``default`` when unset or empty."""
    value = os.environ.get(name)
    if value:
        return value
    return default
def env_path_optional(
    name: str,
    default: Optional[Union[str, Path]] = None,
    resolve: bool = True,
) -> Optional[Path]:
    """
    Read environment variable ``name`` as a Path.

    Falls back to ``default`` when the variable is unset or empty and returns
    None when neither provides a value. The path is resolved when ``resolve``
    is true.
    """
    raw = get_env(name) or default
    if not raw:
        return None
    candidate = Path(raw)
    if resolve:
        return candidate.resolve()
    return candidate
def env_path(
    name: str,
    default: Optional[Union[str, Path]] = None,
    resolve: bool = True,
) -> Path:
    """
    Read environment variable ``name`` as a Path, falling back to ``default``.

    Raises:
        ValueError: when neither the variable nor ``default`` provides a value.
    """
    found = env_path_optional(name, default, resolve)
    if found:
        return found
    raise ValueError(f"Missing path value for {name}")
def env_bool(
    name: str,
    default: bool = False,
) -> bool:
    """Read environment variable ``name`` as a boolean; ``default`` when unset or empty."""
    raw = get_env(name)
    return str2bool(raw) if raw else default
def env_bool_field(
    name: str,
    default: bool = False,
):
    """Build a dataclass field whose default is read from env var ``name`` at instantiation."""

    def _read_flag() -> bool:
        return env_bool(name, default)

    return field(default_factory=_read_flag)
def env_path_field(
    name: str,
    default: Union[str, Path] = "",
    *,
    resolve: bool = True,
) -> Path:
    """Build a dataclass field whose default Path is read from env var ``name`` at instantiation."""

    def _read_path() -> Path:
        return env_path(name, default, resolve=resolve)

    return field(default_factory=_read_path)
def env_str_field(
    name: str,
    default: str = "",
) -> str:
    """Build a dataclass field whose default string is read from env var ``name`` at instantiation."""

    def _read_value() -> str:
        return get_env(name, default)

    return field(default_factory=_read_value)
def generate_dataclass_help(cls) -> str:
    """
    Render one ``name = default`` line per field of a dataclass.

    Raises:
        TypeError: when ``cls`` is not a dataclass.
    """
    if not is_dataclass(cls):
        raise TypeError(f"{cls} is not a dataclass")

    def _default_of(spec):
        # Plain default first, then the factory; fields with neither are required.
        if spec.default is not MISSING:
            return spec.default
        if spec.default_factory is MISSING:
            return "<required>"
        try:
            return spec.default_factory()
        except Exception as e:
            # A failing factory is reported in the table instead of aborting.
            return f"<error: {e}>"

    rows = []
    for spec in fields(cls):
        rows.append(f"{spec.name:<22} = {_default_of(spec)!r}")
    return indent("\n".join(rows), " ")
def with_params_help(params_cls: type, title: str = "Parameter defaults"):
    """
    Class decorator factory: append a defaults table for ``params_cls`` to the
    decorated class's docstring.

    Raises:
        TypeError: when ``params_cls`` is not a dataclass.
    """
    if not is_dataclass(params_cls):
        raise TypeError(f"{params_cls} must be a dataclass")

    def _decorator(cls: type) -> type:
        existing_doc = cls.__doc__ or ""
        table = generate_dataclass_help(params_cls)
        cls.__doc__ = existing_doc + f"\n\n{title}:\n{table}"
        return cls

    return _decorator

View File

@ -0,0 +1,84 @@
"""
Git Utility helpers for CLI tasks.
"""
import logging
from pathlib import Path
from cli.lib.common.path_helper import remove_dir
from cli.lib.common.utils import run_command
from git import GitCommandError, RemoteProgress, Repo
logger = logging.getLogger(__name__)
class PrintProgress(RemoteProgress):
    """Progress handler for git operations that logs at fixed percentage steps."""

    def __init__(self, interval: int = 5):
        """
        Args:
            interval: only percentages divisible by this value are logged.
        """
        super().__init__()
        self._last_percent = -1
        self._interval = interval

    def update(self, op_code, cur, max=None, message=""):
        """Log progress; without usable counts, log the current line or message if any."""
        text = self._cur_line or message
        if not (max and cur):
            if text:
                logger.info(text)
            return

        pct = int(cur / max * 100)
        # Skip repeated percentages and those that are not on an interval step.
        if pct == self._last_percent or pct % self._interval != 0:
            return
        self._last_percent = pct
        logger.info("Progress: %d%% - %s", pct, text)
def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False):
    """
    Clone ``repo`` and check out the commit pinned for ``target``.

    Args:
        target: target name; used to look up the pinned commit and as the
            default destination.
        repo: URL of the repository to clone.
        dst: destination directory; defaults to ``target``. Any existing
            directory there is removed first.
        update_submodules: when true, initialise and update the submodules.

    Returns:
        The cloned repository object.

    Raises:
        GitCommandError: re-raised after logging when a git operation fails.
    """
    dst = dst or target
    try:
        logger.info("Cloning %s to %s", target, dst)

        # Start from a clean destination, then clone and fetch everything.
        remove_dir(dst)
        cloned = Repo.clone_from(repo, dst, progress=PrintProgress())
        cloned.git.fetch("--all", "--tags")

        pinned = get_post_build_pinned_commit(target)
        logger.info("Checking out pinned commit %s", pinned)
        cloned.git.checkout(pinned)

        if update_submodules and cloned.submodules:
            logger.info("Updating %d submodule(s)", len(cloned.submodules))
            for submodule in cloned.submodules:
                submodule.update(init=True, recursive=True, progress=PrintProgress())

        logger.info("Successfully cloned %s", target)
        return cloned
    except GitCommandError as e:
        logger.error("Git operation failed: %s", e)
        raise
def clone_vllm_pure(commit: str):
    """
    Clone vllm into ./vllm with the git CLI and check out ``commit``.

    An existing ./vllm directory is deleted first; submodules are initialised
    and updated recursively after the checkout.
    """
    print("clonening vllm....", flush=True)
    checkout_dir = "vllm"

    # Start from a clean checkout directory.
    remove_dir(checkout_dir)

    run_command("git clone https://github.com/vllm-project/vllm.git")
    for git_cmd in (
        f"git checkout {commit}",
        "git submodule update --init --recursive",
    ):
        run_command(git_cmd, cwd=checkout_dir)
def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str:
    """
    Read the pinned commit for ``name`` from ``<prefix>/<name>.txt``.

    Returns:
        The file content with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: when the pin file does not exist.
    """
    path = Path(prefix).joinpath(f"{name}.txt")
    if path.exists():
        return path.read_text(encoding="utf-8").strip()
    raise FileNotFoundError(f"Pin file not found: {path}")

View File

@ -0,0 +1,14 @@
"""
Logger Utility helpers for CLI tasks.
"""
import logging
import sys
def setup_logging(level: int = logging.INFO):
    """Configure root logging to write timestamped records to stdout at ``level``."""
    record_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    logging.basicConfig(stream=sys.stdout, format=record_format, level=level)

View File

@ -0,0 +1,62 @@
"""Path utility helpers for CLI tasks."""
import logging
import shutil
from pathlib import Path
from typing import Union
logger = logging.getLogger(__name__)
def get_path(path: Union[str, Path], resolve: bool = False) -> Path:
    """
    Convert ``path`` to a Path, resolving it when ``resolve`` is true.

    Raises:
        ValueError: when ``path`` is None or empty.
    """
    if not path:
        raise ValueError("Path cannot be None or empty")
    converted = Path(path)
    if resolve:
        converted = converted.resolve()
    return converted
def ensure_dir_exists(path: Union[str, Path]) -> Path:
    """Create the directory (and missing parents) when absent and return it as a Path."""
    directory = get_path(path)
    directory.mkdir(exist_ok=True, parents=True)
    return directory
def remove_dir(path: Union[str, Path, None]) -> None:
    """Delete the directory tree at ``path``; do nothing when ``path`` is empty or missing."""
    if path:
        target = get_path(path)
        if target.exists():
            shutil.rmtree(target)
def force_create_dir(path: Union[str, Path]) -> Path:
    """Return a fresh, empty directory at ``path``, deleting whatever tree was there."""
    remove_dir(path)
    fresh_dir = ensure_dir_exists(path)
    return fresh_dir
def copy(src: Union[str, Path], dst: Union[str, Path]) -> None:
    """
    Copy a file or a directory tree from ``src`` to ``dst``.

    The parent directory of ``dst`` is created when missing.

    Raises:
        FileNotFoundError: when ``src`` does not exist.
        ValueError: when ``src`` is neither a regular file nor a directory.
    """
    source = get_path(src, resolve=True)
    destination = get_path(dst, resolve=True)
    if not source.exists():
        raise FileNotFoundError(f"Source does not exist: {source}")

    destination.parent.mkdir(parents=True, exist_ok=True)
    if source.is_file():
        shutil.copy2(source, destination)
        return
    if source.is_dir():
        # Merge into an existing destination directory instead of failing.
        shutil.copytree(source, destination, dirs_exist_ok=True)
        return
    raise ValueError(f"Unsupported path type: {source}")
def is_path_exist(path: Union[str, Path, None]) -> bool:
    """Return True when ``path`` is non-empty and exists on disk."""
    if not path:
        return False
    return get_path(path).exists()

View File

@ -0,0 +1,69 @@
import glob
import logging
import shlex
import shutil
import sys
from collections.abc import Iterable
from typing import Optional, Union
from cli.lib.common.utils import run_command
logger = logging.getLogger(__name__)
def pip_install_packages(
    packages: Iterable[str] = (),
    env=None,
    *,
    requirements: Optional[str] = None,
    constraints: Optional[str] = None,
    prefer_uv: bool = False,
) -> None:
    """
    Install packages with the current interpreter's pip (or ``uv pip``).

    Args:
        packages: package specifiers or wheel paths to install.
        env: extra environment variables passed through to the command.
        requirements: optional requirements file (``-r``).
        constraints: optional constraints file (``-c``).
        prefer_uv: use ``uv pip`` when a ``uv`` executable is found on PATH.
    """
    use_uv = prefer_uv and shutil.which("uv") is not None
    if use_uv:
        cmd = [sys.executable, "-m", "uv", "pip", "install"]
        logger.info("Installing packages using uv pip")
    else:
        cmd = [sys.executable, "-m", "pip", "install"]

    if requirements:
        cmd.extend(["-r", requirements])
    if constraints:
        cmd.extend(["-c", constraints])
    cmd.extend(packages)

    # run_command takes a single string, so quote every argument before joining.
    quoted_cmd = " ".join(shlex.quote(part) for part in cmd)
    logger.info("pip installing packages: %s", quoted_cmd)
    run_command(quoted_cmd, env=env)
    logger.info("Done installing packages")
def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False):
    """
    Install the first local wheel, in sorted order, that matches a glob pattern.

    Args:
        pattern (str): Glob pattern for the wheel file(s).
        extras (str | None): Optional extras (e.g., "opt_einsum") to install with the wheel.
        pref_uv (bool): Forwarded to pip_install_packages as ``prefer_uv``.

    Raises:
        FileNotFoundError: when no file matches ``pattern``.
    """
    candidates = sorted(glob.glob(pattern))
    if not candidates:
        raise FileNotFoundError(f"No files match: {pattern}")

    first_wheel = candidates[0]
    if extras:
        target = f"{first_wheel}[{extras}]"
    else:
        target = first_wheel
    logger.info("Installing wheel: %s", target)
    pip_install_packages([target], prefer_uv=pref_uv)
def run_python(args: Union[str, list[str]], env=None):
    """
    Run the current Python interpreter with ``args``.

    Args:
        args: arguments as a list, or as one string that is split shell-style.
        env: extra environment variables passed through to the command.
    """
    if isinstance(args, str):
        args = shlex.split(args)
    full_cmd = [sys.executable] + args
    # run_command takes a single string, so quote every argument before joining.
    run_command(" ".join(shlex.quote(part) for part in full_cmd), env=env)

View File

@ -0,0 +1,117 @@
"""
General Utility helpers for CLI tasks.
"""
import logging
import os
import shlex
import subprocess
import sys
from contextlib import contextmanager
from typing import Optional
logger = logging.getLogger(__name__)
def run_command(
    cmd: str,
    use_shell: bool = False,
    log_cmd: bool = True,
    cwd: Optional[str] = None,
    env: Optional[dict] = None,
    check: bool = True,
) -> int:
    """
    Run a command and return its exit code.

    Args:
        cmd: command line as one string.
        use_shell: run through /bin/bash instead of splitting into arguments.
        log_cmd: log the command before running it.
        cwd: working directory for the command.
        env: variables layered on top of the current environment.
        check: raise when the command exits with a non-zero code.

    Raises:
        subprocess.CalledProcessError: when ``check`` is true and the command fails.
    """
    if use_shell:
        args, log_prefix, executable = cmd, "[shell]", "/bin/bash"
    else:
        args, log_prefix, executable = shlex.split(cmd), "[cmd]", None

    if log_cmd:
        logger.info("%s %s", log_prefix, cmd if use_shell else " ".join(args))

    # Caller-supplied variables override the inherited environment.
    merged_env = dict(os.environ)
    merged_env.update(env or {})

    completed = subprocess.run(
        args,
        shell=use_shell,
        executable=executable,
        stdout=sys.stdout,
        stderr=sys.stderr,
        cwd=cwd,
        env=merged_env,
        check=False,
    )
    exit_code = completed.returncode
    if exit_code == 0 or not check:
        return exit_code

    logger.error("%s Command failed (exit %s): %s", log_prefix, exit_code, cmd)
    raise subprocess.CalledProcessError(exit_code, cmd if use_shell else args)
def str2bool(value: Optional[str]) -> bool:
    """
    Convert an environment-variable style string to a boolean.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        value: text to convert; None and the empty string count as False.

    Returns:
        True for "1", "true", "t", "yes", "y", "on", "enable", "enabled",
        "found"; False for "0", "false", "f", "no", "n", "off", "disable",
        "disabled".

    Raises:
        ValueError: when ``value`` is a truthy non-string or an unrecognised string.
    """
    if not value:
        return False
    if not isinstance(value, str):
        raise ValueError(
            f"Expected a string value for boolean conversion, got {type(value)}"
        )
    value = value.strip().lower()

    true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"}
    # "disabled" mirrors "enabled" in the true set.
    false_value_set = {"0", "false", "f", "no", "n", "off", "disable", "disabled"}

    if value in true_value_set:
        return True
    if value in false_value_set:
        return False
    raise ValueError(f"Invalid string value for boolean conversion: {value}")
@contextmanager
def temp_environ(updates: dict[str, str]):
    """
    Set environment variables for the duration of the block, then restore them.

    Args:
        updates: Dict of environment variables to set.
    """
    # Sentinel marks variables that did not exist before the block.
    absent = object()
    saved = {}
    for key in updates:
        saved[key] = os.environ.get(key, absent)
    try:
        os.environ.update(updates)
        yield
    finally:
        for key, previous in saved.items():
            if previous is absent:
                os.environ.pop(key, None)
                continue
            os.environ[key] = previous  # type: ignore[assignment]
@contextmanager
def working_directory(path: str):
    """
    Run the block with ``path`` as the working directory, then switch back.

    An empty ``path`` leaves the working directory untouched.
    """
    if path:
        origin = os.getcwd()
        try:
            os.chdir(path)
            yield
        finally:
            os.chdir(origin)
    else:
        # No-op context
        yield

View File

@ -0,0 +1,650 @@
import logging
import os
import re
import subprocess
import sys
import textwrap
from collections.abc import Iterable
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Optional
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.docker_helper import local_image_exists
from cli.lib.common.envs_helper import (
env_bool_field,
env_path_field,
env_str_field,
get_env,
with_params_help,
)
from cli.lib.common.git_helper import clone_vllm_pure, get_post_build_pinned_commit
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
force_create_dir,
get_path,
is_path_exist,
remove_dir,
)
from cli.lib.common.pip_helper import (
pip_install_first_match,
pip_install_packages,
run_python,
)
from cli.lib.common.utils import run_command, temp_environ, working_directory
logger = logging.getLogger(__name__)
# Default path for docker build artifacts
_DEFAULT_RESULT_PATH = "./shared"
# Temp folder inside the vllm work directory that receives a copy of the torch wheels for the docker build
_VLLM_TEMP_FOLDER = "tmp"
@dataclass
class VllmBuildParameters:
    """
    Inputs for the vllm docker build, read from environment variables.

    Each field's default is produced by an ``env_*_field`` helper, and
    ``__post_init__`` validates the resource required by every enabled
    ``use_*`` flag.
    """

    # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH.
    # Otherwise docker build pull torch nightly during build
    # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True
    use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True)
    torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")

    # USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE
    # Otherwise, pull dockerfile's default image remotely
    # BASE_IMAGE: name:tag (only needed when use_local_base_image is True)
    use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True)
    base_image: str = env_str_field("BASE_IMAGE")

    # USE_LOCAL_DOCKERFILE: when true("1"), use a local Dockerfile; requires DOCKERFILE_PATH.
    # otherwise, use vllm's default dockerfile.torch_nightly for build
    # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True
    use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
    dockerfile_path: Path = env_path_field(
        "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
    )

    # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
    output_dir: Path = env_path_field("OUTPUT_DIR", "shared")

    # --- Build args ----------------------------------------------------------
    target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")
    tag_name: str = env_str_field("TAG", "vllm-wheels")
    cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1")
    python_version: str = env_str_field("PYTHON_VERSION", "3.12")
    max_jobs: str = env_str_field("MAX_JOBS", "64")
    sccache_bucket: str = env_str_field("SCCACHE_BUCKET")
    sccache_region: str = env_str_field("SCCACHE_REGION")
    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

    def __post_init__(self):
        """
        Validate the resource required by each enabled flag.

        Raises:
            ValueError: when an enabled flag's resource is empty or fails its
                existence check, or when ``output_dir`` is empty.
        """
        # (env flag name, flag value, resource attribute, check function, error message)
        checks = [
            (
                "USE_TORCH_WHEEL",
                self.use_torch_whl,
                "torch_whls_path",
                is_path_exist,
                "TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1",
            ),
            (
                "USE_LOCAL_BASE_IMAGE",
                self.use_local_base_image,
                "base_image",
                local_image_exists,
                f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1",
            ),
            (
                "USE_LOCAL_DOCKERFILE",
                self.use_local_dockerfile,
                "dockerfile_path",
                is_path_exist,
                " DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1",
            ),
        ]
        for flag_name, enabled, attr_name, check_func, error_msg in checks:
            if not enabled:
                # Name the flag so the log line says which option is off.
                logger.info("flag %s is not set", flag_name)
                continue
            value = getattr(self, attr_name)
            if not value or not check_func(value):
                raise ValueError(error_msg)
        if not self.output_dir:
            raise ValueError("missing required output_dir")
@with_params_help(VllmBuildParameters)
class VllmBuildRunner(BaseRunner):
"""
Build vLLM using docker buildx.
Environment variable options:
"USE_TORCH_WHEEL": "1: use local wheels; 0: pull nightly from pypi",
"TORCH_WHEELS_PATH": "Path to local wheels (when USE_TORCH_WHEEL=1)",
"USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image",
"BASE_IMAGE": "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)",
"USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly",
"DOCKERFILE_PATH": "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)",
"OUTPUT_DIR": "e.g. './shared'",
"TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'",
"CUDA_VERSION": "e.g. '12.8.1'",
"PYTHON_VERSION": "e.g. '3.12'",
"MAX_JOBS": "e.g. '64'",
"SCCACHE_BUCKET": "e.g. 'my-bucket'",
"SCCACHE_REGION": "e.g. 'us-west-2'",
"""
def __init__(self, args=None):
    """
    Args:
        args: parsed command-line arguments, stored by the base runner.
    """
    # Initialise the base class so self.args is set like on any other runner.
    super().__init__(args)
    # Directory of the vllm checkout, relative to the current working directory.
    self.work_directory = "vllm"
def run(self):
    """
    main function to run vllm build
    1. prepare vllm build environment
    2. prepare the docker build command args
    3. run docker build
    """
    inputs = VllmBuildParameters()
    logger.info("Running vllm build with inputs: %s", inputs)
    # NOTE(review): clone_vllm is neither imported nor defined in the visible
    # part of this module; presumably it is defined further down. Confirm.
    clone_vllm()
    self.cp_dockerfile_if_exist(inputs)
    # cp torch wheels from root direct to vllm workspace if exist
    self.cp_torch_whls_if_exist(inputs)
    ensure_dir_exists(inputs.output_dir)
    cmd = self._generate_docker_build_cmd(inputs)
    logger.info("Running docker build: \n %s", cmd)
    # Build inside the same work directory the copy helpers write into.
    run_command(cmd, cwd=self.work_directory, env=os.environ.copy())
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
if not inputs.use_torch_whl:
return ""
tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}"
tmp_path = Path(tmp_dir)
force_create_dir(tmp_path)
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir
def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")
return
dockerfile_path = get_path(inputs.dockerfile_path, resolve=True)
vllm_torch_dockerfile = Path(
f"./{self.work_directory}/docker/Dockerfile.nightly_torch"
)
copy(dockerfile_path, vllm_torch_dockerfile)
def get_result_path(self, path):
"""
Get the absolute path of the result path
"""
if not path:
path = _DEFAULT_RESULT_PATH
abs_path = get_path(path, resolve=True)
return abs_path
def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str:
if not torch_whl_dir:
return ""
return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}"
def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]:
"""
Returns:
- base_image_arg: docker buildx arg string for base image
- final_base_image_arg: docker buildx arg string for vllm-base stage
- pull_flag: --pull=true or --pull=false depending on whether the image exists locally
"""
if not inputs.use_local_base_image:
return "", "", ""
base_image = inputs.base_image
# set both base image and final base image to the same local image
base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}"
final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}"
if local_image_exists(base_image):
pull_flag = "--pull=false"
return base_image_arg, final_base_image_arg, pull_flag
logger.info(
"[INFO] Local image not found:%s will try to pull from remote", {base_image}
)
return base_image_arg, final_base_image_arg, ""
def _generate_docker_build_cmd(
self,
inputs: VllmBuildParameters,
) -> str:
base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args(
inputs
)
torch_arg = self._get_torch_wheel_path_arg(inputs.torch_whls_path)
return textwrap.dedent(
f"""
docker buildx build \
--output type=local,dest={inputs.output_dir} \
-f docker/Dockerfile.nightly_torch \
{pull_flag} \
{torch_arg} \
{base_image_arg} \
{final_base_image_arg} \
--build-arg max_jobs={inputs.max_jobs} \
--build-arg CUDA_VERSION={inputs.cuda_version} \
--build-arg PYTHON_VERSION={inputs.python_version} \
--build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \
--build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \
--build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \
--build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \
--target {inputs.target_stage} \
-t {inputs.tag_name} \
--progress=plain .
"""
).strip()
@dataclass
class VllmTestParameters:
    """
    Inputs for the vllm external test, each resolved from an environment variable.

    !!!DO NOT ADD SECRETS IN THIS CLASS!!!
    If needed, store only the *name* of an environment variable here (when it
    differs from the secret's own name) and fetch secrets directly from the
    environment at runtime.
    """

    torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")
    vllm_whls_path: Path = env_path_field("VLLM_WHEELS_PATH", "./shared")
    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

    def __post_init__(self):
        # Both wheel locations must already exist on disk; torch is checked first.
        for attr_name in ("torch_whls_path", "vllm_whls_path"):
            if not getattr(self, attr_name).exists():
                raise ValueError(f"missing {attr_name}")
# Kind of input that selects what the vllm test runner executes.
class TestInpuType(Enum):
    # A named, pre-defined test plan was supplied.
    TEST_PLAN = "test_plan"
    # No recognised input was supplied; the runner's run() raises for this value.
    UNKNOWN = "unknown"
# Runner that prepares a vllm checkout with locally built wheels and then
# executes a named test plan inside it.
class VllmTestRunner(BaseRunner):
    def __init__(self, args: Any):
        self.work_directory = "vllm"
        self.test_plan = ""
        self.test_type = TestInpuType.UNKNOWN

        if args.test_plan:
            self.test_plan = args.test_plan
            self.test_type = TestInpuType.TEST_PLAN

        # Matches the structure of the artifacts.zip from the torch build
        self.TORCH_WHL_PATH_REGEX = "torch*.whl"
        self.TORCH_WHL_EXTRA = "opt-einsum"
        self.TORCH_ADDITIONAL_WHLS_REGEX = [
            "vision/torchvision*.whl",
            "audio/torchaudio*.whl",
        ]

        # Match the structure of the artifacts.zip from vllm external build
        self.VLLM_TEST_WHLS_REGEX = [
            "wheels/xformers/xformers*.whl",
            "wheels/vllm/vllm*.whl",
            "wheels/flashinfer-python/flashinfer*.whl",
        ]

    def prepare(self):
        """
        prepare test environment for vllm. This includes clone vllm repo, install all wheels, test dependencies and set env
        """
        params = VllmTestParameters()
        logger.info("Display VllmTestParameters %s", params)
        self._set_envs(params)

        clone_vllm(dst=self.work_directory)
        with working_directory(self.work_directory):
            # Remove the "vllm" directory inside the checkout before installing wheels.
            remove_dir(Path("vllm"))
            self._install_wheels(params)
            self._install_dependencies()
            # verify the torches are not overridden by test dependencies
            self.check_versions()

    def run(self):
        """
        main function to run vllm test

        Raises:
            ValueError: when no supported test input (test plan) was provided.
        """
        self.prepare()
        with working_directory(self.work_directory):
            if self.test_type == TestInpuType.TEST_PLAN:
                self.run_test_plan(self.test_plan)
            else:
                raise ValueError(f"Unknown test type {self.test_type}")

    def _install_wheels(self, params: VllmTestParameters):
        """Install the torch-related wheels, then the vllm wheels, from the paths in `params`."""
        logger.info("Running vllm test with inputs: %s", params)
        logger.info("Installing torch wheel")
        # NOTE: installation of the main torch wheel is currently disabled:
        # torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}"
        # pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA)

        logger.info("Installing other torch-related wheels")
        torch_whls_path = [
            f"{str(params.torch_whls_path)}/{whl_path}"
            for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX
        ]
        for torch_whl in torch_whls_path:
            pip_install_first_match(torch_whl)
        logger.info("Done. Installed torch and other torch-related wheels ")

        logger.info("Installing vllm wheels")
        vllm_whls_path = [
            f"{str(params.vllm_whls_path)}/{whl_path}"
            for whl_path in self.VLLM_TEST_WHLS_REGEX
        ]
        for vllm_whl in vllm_whls_path:
            pip_install_first_match(vllm_whl)
        logger.info("Done. Installed vllm wheels")

    def _install_test_dependencies(self):
        """
        Install test dependencies for vllm test
        This method replaces torch dependencies with local torch wheel info in
        requirements/test.in file from vllm repo.
        Then generates the test.txt file using uv pip compile, along with requirements/test.txt,
        which is generated by the test.in with torch stable as soft constraint to match
        packages' version
        """
        # TODO(elainewy): move this as part of vllm build, to generate the test.txt file
        logger.info("generate test.txt from requirements/test.in with local torch whls")
        preprocess_test_in()
        # Keep the repo's pinned test.txt as a constraint file for the compile step below.
        copy(
            Path("requirements/test.txt"),
            Path("snapshot_constraint.txt"),
        )

        run_command(
            f"{sys.executable} -m uv pip compile requirements/test.in "
            "-o test.txt "
            "--index-strategy unsafe-best-match "
            "--constraint snapshot_constraint.txt "
            "--torch-backend cu128"
        )
        logger.info("install requirements from test.txt")
        pip_install_packages(requirements="test.txt", prefer_uv=True)
        logger.info("Done. install requirements from test.txt")

        # install mamba from source since it does not work now with pip
        # TODO(elainewy): move this as part of vllm build
        pip_install_packages(
            packages=[
                "--no-build-isolation",
                "git+https://github.com/state-spaces/mamba@v2.2.4",
            ],
            prefer_uv=True,
        )
        logger.info("Done. installed requirements from test.txt")

    def _install_dependencies(self):
        """Install vllm's test utilities, common/build requirements and test requirements."""
        pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True)
        pip_install_packages(packages=["hf_transfer"], prefer_uv=True)
        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

        # using script from vllm repo to remove all torch packages from requirements txt
        run_python("use_existing_torch.py")

        # install common packages
        for requirements in ["requirements/common.txt", "requirements/build.txt"]:
            pip_install_packages(
                requirements=requirements,
                prefer_uv=True,
            )
        # install test packages
        self._install_test_dependencies()

    def check_versions(self):
        """
        check installed packages version
        """
        logger.info("double check installed packages")
        patterns = ["torch", "xformers", "torchvision", "torchaudio", "vllm"]
        for pkg in patterns:
            try:
                module = __import__(pkg)
                version = getattr(module, "__version__", None)
                version = version if version else "Unknown version"
                logger.info("%s: %s", pkg, version)
            except ImportError:
                logger.info(" %s: Not installed", pkg)
        logger.info("Done. checked installed packages")

    def _set_envs(self, inputs: VllmTestParameters):
        """
        Export TORCH_CUDA_ARCH_LIST and HF_TOKEN for the test processes.

        Raises:
            ValueError: when HF_TOKEN or TORCH_CUDA_ARCH_LIST ends up empty.
        """
        os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list
        # An unsupported arch list only warns; it does not abort the run.
        if not self.validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")):
            logger.warning(
                "Missing supported TORCH_CUDA_ARCH_LIST. "
                "Currently support TORCH_CUDA_ARCH_LIST env var "
                "with supported arch [8.0, 8.9, 9.0]"
            )

        os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "")
        if not get_env("HF_TOKEN"):
            raise ValueError(
                "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var"
            )
        if not get_env("TORCH_CUDA_ARCH_LIST"):
            raise ValueError(
                "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var"
            )

    def run_test_plan(self, test_plan: str):
        """
        a method to run list of tests based on the test plan. currently this only
        used to run vllm tests.

        Every step is executed even if an earlier one fails; failures are
        collected and reported together.

        Raises:
            RuntimeError: when the plan is unknown or at least one step fails.
        """
        logger.info("run vllm tests.....")
        tests_map = sample_test_plans()
        if test_plan not in tests_map:
            raise RuntimeError(
                f"test {test_plan} not found, please add it to test plan pool"
            )
        tests = tests_map[test_plan]
        logger.info("Running tests: %s", tests["title"])

        pkgs = tests.get("package_install", [])
        if pkgs:
            logger.info("Installing packages: %s", pkgs)
            pip_install_packages(packages=pkgs, prefer_uv=True)

        with (
            temp_environ(tests.get("env_var", {})),
            working_directory(tests.get("working_directory", "tests")),
        ):
            failures = []
            for step in tests["steps"]:
                # Step-level env vars apply only while that step runs.
                with temp_environ(step.get("env_var", {})):
                    code = run_command(cmd=step["command"], check=False)
                    if code != 0:
                        failures.append(step)
            if failures:
                logger.error("Failed tests: %s", failures)
                raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}")
            logger.info("Done. All tests passed")

    def validate_cuda(self, value: str) -> bool:
        """
        Return True when every arch in `value` is one of the supported ones.

        Entries may be separated by whitespace or by ';' (e.g. "8.0 9.0" or
        "8.0;9.0"). An empty value yields True; emptiness is checked by the caller.
        """
        VALID_VALUES = {"8.0", "8.9", "9.0"}
        archs = value.replace(";", " ").split()
        return all(arch in VALID_VALUES for arch in archs)
def clone_vllm(dst: str = "vllm"):
    """Clone vllm through clone_vllm_pure at the commit looked up for `dst`."""
    pinned_commit = get_post_build_pinned_commit(dst)
    clone_vllm_pure(pinned_commit)
    # NOTE: the earlier implementation is currently disabled:
    #   clone_external_repo(
    #       target="vllm",
    #       repo="https://github.com/vllm-project/vllm.git",
    #       dst=dst,
    #       update_submodules=True,
    #   )
def preprocess_test_in(
    target_file: str = "requirements/test.in", additional_packages: Iterable[str] = ()
):
    """
    This modifies the target_file file in place. by default, it points to vllm's requirements/test.in
    It removes torch packages in target_file and replace with local torch whls

    Args:
        target_file: requirements file to rewrite.
        additional_packages: extra package names whose pinned lines are removed too.
    """
    additional_package_to_move = list(additional_packages or ())
    pkgs_to_remove = [
        "torch",
        "torchvision",
        "torchaudio",
        "xformers",
        "mamba_ssm",
    ] + additional_package_to_move
    # Read current requirements
    target_path = Path(target_file)
    lines = target_path.read_text().splitlines()

    # Remove lines starting with the package names (==, @, >=) — case-insensitive.
    # Names are escaped so caller-supplied packages containing regex
    # metacharacters (e.g. ".") are matched literally.
    pkg_alternation = "|".join(re.escape(pkg) for pkg in pkgs_to_remove)
    pattern = re.compile(rf"^({pkg_alternation})\s*(==|@|>=)", re.IGNORECASE)
    kept_lines = [line for line in lines if not pattern.match(line)]

    # Get local torch/vision/audio installs from pip freeze
    # this is hacky, but it works
    pip_freeze = subprocess.check_output(["pip", "freeze"], text=True)
    header_lines = [
        line
        for line in pip_freeze.splitlines()
        if re.match(
            r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE
        )
    ]

    # Write back: header_lines + blank + kept_lines
    out = "\n".join(header_lines + [""] + kept_lines) + "\n"
    target_path.write_text(out)
    logger.info("[INFO] Updated %s", target_file)
def sample_test_plans():
    """
    Return a small hard-coded pool of vllm test plans, keyed by plan id.

    This exists to unblock vllm ci development and mimics
    https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml

    Plan fields:
        required: id, title, steps
        optional: env_var, package_install, working_directory
        (callers default working_directory to "tests"; it can be overridden
        per plan, for instance vllm sample tests run in samples/)
    Step fields:
        required: command
        optional: env_var (only set within the scope of that step)
    """
    # TODO(elainewy): Read from yaml file to handle the env and tests for vllm
    # TODO(elainewy): implement logics to handle package_install
    pytest = "pytest -v -s"

    basic_correctness = {
        "title": "Basic Correctness Test",
        "id": "vllm_basic_correctness_test",
        "env_var": {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"},
        "steps": [
            {"command": f"{pytest} basic_correctness/test_cumem.py"},
            {"command": f"{pytest} basic_correctness/test_basic_correctness.py"},
            {"command": f"{pytest} basic_correctness/test_cpu_offload.py"},
            {
                "command": f"{pytest} basic_correctness/test_preemption.py",
                "env_var": {"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT": "1"},
            },
        ],
    }

    model_files = (
        "test_transformers.py",
        "test_registry.py",
        "test_utils.py",
        "test_vision.py",
        "test_initialization.py",
    )
    basic_models = {
        "title": "Basic models test",
        "id": "vllm_basic_models_test",
        "steps": [{"command": f"{pytest} models/{name}"} for name in model_files],
    }

    # These files are excluded from the bulk entrypoints/llm run; the first
    # three are then run on their own as separate steps.
    ignored = (
        "test_lazy_outlines.py",
        "test_generate.py",
        "test_generate_multiple_loras.py",
        "test_collective_rpc.py",
    )
    ignore_flags = " ".join(f"--ignore=entrypoints/llm/{name}" for name in ignored)
    entrypoints = {
        "title": "Entrypoints Test ",
        "id": "vllm_entrypoints_test",
        "env_var": {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"},
        "steps": [
            {"command": f"{pytest} entrypoints/llm {ignore_flags}"},
            {"command": f"{pytest} entrypoints/llm/test_lazy_outlines.py"},
            {"command": f"{pytest} entrypoints/llm/test_generate.py "},
            {"command": f"{pytest} entrypoints/llm/test_generate_multiple_loras.py"},
            {
                "env_var": {"VLLM_USE_V1": "0"},
                "command": f"{pytest} entrypoints/offline_mode",
            },
        ],
    }

    regression = {
        "title": "Regression Test",
        "id": "vllm_regression_test",
        "package_install": ["modelscope"],
        "steps": [{"command": f"{pytest} test_regression.py"}],
    }

    return {
        plan["id"]: plan
        for plan in (basic_correctness, basic_models, entrypoints, regression)
    }

40
.ci/lumen_cli/cli/run.py Normal file
View File

@ -0,0 +1,40 @@
# main.py
import argparse
import logging
from cli.build_cli.register_build import register_build_commands
from cli.lib.common.logger import setup_logging
from cli.test_cli.register_test import register_test_commands
logger = logging.getLogger(__name__)
def main():
    """Build the top-level parser, register all sub-commands and dispatch to the chosen one."""
    # Top-level parser; a sub-command is mandatory.
    parser = argparse.ArgumentParser(description="Lumos CLI")
    subparsers = parser.add_subparsers(dest="command", required=True)
    parser.add_argument(
        "--log-level", default="INFO", help="Log level (DEBUG, INFO, WARNING, ERROR)"
    )

    # Second-level sub-commands: build first, then test.
    for register in (register_build_commands, register_test_commands):
        register(subparsers)

    # Parse only after every option has been registered.
    args = parser.parse_args()

    # Configure global logging; unrecognised level names fall back to INFO.
    level = getattr(logging, args.log_level.upper(), logging.INFO)
    setup_logging(level)
    logger.debug("Parsed args: %s", args)

    if not hasattr(args, "func"):
        parser.print_help()
        return
    args.func(args)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,51 @@
import argparse
import logging
from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
from cli.lib.core.vllm import VllmTestRunner
logger = logging.getLogger(__name__)
# Maps test targets to their argparse configuration and runner.
# Each entry adds a new target, reachable via
#   python -m cli.run test external {target}
# which is handled by the target's test runner.
_TARGETS: dict[str, TargetSpec] = {
    "vllm": {
        "runner": VllmTestRunner,
        "help": "test vLLM unittests",
    }
    # add yours ...
}
def common_args(parser: argparse.ArgumentParser) -> None:
    """
    Add common CLI arguments to the given parser.

    Exactly one option of the (required) mutually exclusive group must be
    supplied; currently the group only contains -tp/--test-plan.
    """
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-tp",
        "--test-plan",
        type=str,
        # The example uses a real plan id so copying it from --help works.
        help="a pre-defined test plan to run, e.g. 'vllm_basic_correctness_test'",
    )
    # TODO(elainewy):add another common option that user can trigger a specific test with test config
def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
    """Register the `test` command and its `external` sub-command with all known targets."""
    test_parser = subparsers.add_parser(
        "test",
        help="test related commands",
        formatter_class=RichHelp,
    )
    test_subparsers = test_parser.add_subparsers(dest="test_command", required=True)

    # One "name  help" row per target, shown in the description of `external`.
    overview_rows = [
        f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
    ]
    description = "Test third-party targets.\n\nAvailable targets:\n" + "\n".join(
        overview_rows
    )

    external_parser = test_subparsers.add_parser(
        "external",
        help="Test external targets",
        description=description,
        formatter_class=RichHelp,
    )
    register_targets(external_parser, _TARGETS, common_args=common_args)

View File

@ -0,0 +1,23 @@
[project]
name = "lumen-ci"
version = "0.1.0"
dependencies = [
"pyyaml==6.0.2",
"GitPython==3.1.45",
"docker==7.1.0",
"pytest==7.3.2",
"uv==0.8.4"
]
[tool.setuptools]
packages = ["cli"]
[tool.setuptools.package-dir]
cli = "cli"
[tool.ruff.lint]
# Enable preview mode for linting
preview = true
# Now you can select your preview rules, like RUF048
extend-select = ["RUF048"]

View File

@ -0,0 +1,47 @@
# tests/test_cli.py
import io
import sys
import unittest
from contextlib import redirect_stderr, redirect_stdout
from unittest.mock import patch
from cli.run import main
class TestArgparseCLI(unittest.TestCase):
    # Checks of the top-level argparse wiring in cli.run.

    @patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None)
    @patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None)
    def test_cli_run_build_external(self, mock_init, mock_run):
        from cli.run import main  # import after patches if needed

        argv = ["cli.run", "build", "external", "vllm"]
        with patch.object(sys, "argv", argv):
            # argparse may call sys.exit on error; swallow it so the mock
            # assertions below report the real problem.
            try:
                main()
            except SystemExit:
                pass

        # The runner must have been constructed once and run() called without args.
        mock_init.assert_called_once()
        mock_run.assert_called_once_with()

    def test_build_help(self):
        argv = ["cli.run", "build", "--help"]
        out_buf, err_buf = io.StringIO(), io.StringIO()

        with patch.object(sys, "argv", argv):
            # --help always raises SystemExit(0)
            with (
                self.assertRaises(SystemExit) as cm,
                redirect_stdout(out_buf),
                redirect_stderr(err_buf),
            ):
                main()

        self.assertEqual(cm.exception.code, 0)
        help_text = out_buf.getvalue()
        for expected in ("usage", "external"):
            self.assertIn(expected, help_text)
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,115 @@
import argparse
import io
import unittest
from contextlib import redirect_stderr
from unittest.mock import patch
from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec
# ---- Dummy runners for unittests----
class FooRunner(BaseRunner):
    """Foo description from docstring."""

    # The docstring above is asserted verbatim by a help-text test in this
    # module, so it must stay unchanged.
    def run(self) -> None:  # replaced by mock
        pass
# Dummy runner that deliberately has no docstring.
class BarRunner(BaseRunner):
    def run(self) -> None:  # replaced by mock
        pass
def add_foo_args(p: argparse.ArgumentParser) -> None:
    """Register the mandatory integer option ``--x`` on the given parser."""
    x_option = {"type": int, "required": True, "help": "x value"}
    p.add_argument("--x", **x_option)
def common_args(p: argparse.ArgumentParser) -> None:
    """Register the boolean ``--verbose`` switch shared by every target."""
    verbose_option = {"action": "store_true", "help": "verbose flag"}
    p.add_argument("--verbose", **verbose_option)
def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser:
    """Create an ``app`` parser with `specs` registered as targets plus the common args."""
    app_parser = argparse.ArgumentParser(prog="app", formatter_class=RichHelp)
    register_targets(parser=app_parser, target_specs=specs, common_args=common_args)
    return app_parser
def get_subparser(
    parser: argparse.ArgumentParser, name: str
) -> argparse.ArgumentParser:
    """Return the sub-parser registered under `name` on `parser`'s first sub-parsers action."""
    for action in parser._subparsers._group_actions:  # type: ignore[attr-defined]
        if isinstance(action, argparse._SubParsersAction):
            return action.choices[name]
    # No sub-parsers action was found: same exception a bare next() would raise.
    raise StopIteration
class TestRegisterTargets(unittest.TestCase):
    # Behaviour checks for register_targets using the dummy Foo/Bar runners.

    def test_metavar_lists_targets(self):
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
            "bar": {"runner": BarRunner},
        }
        parser = build_parser(specs)

        group_actions = parser._subparsers._group_actions  # type: ignore[attr-defined]
        subparsers_action = next(
            filter(lambda a: isinstance(a, argparse._SubParsersAction), group_actions)
        )
        self.assertEqual(subparsers_action.metavar, "{foo,bar}")

    def test_add_arguments_and_common_args_present(self):
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
        }
        help_text = get_subparser(build_parser(specs), "foo").format_help()

        # Both the target-specific and the common option must show up.
        for option in ("--x", "--verbose"):
            self.assertIn(option, help_text)

    def test_runner_constructed_with_ns_and_run_called(self):
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
        }
        parser = build_parser(specs)

        with (
            patch.object(FooRunner, "__init__", return_value=None) as mock_init,
            patch.object(FooRunner, "run", return_value=None) as mock_run,
        ):
            ns = parser.parse_args(["foo", "--x", "3", "--verbose"])
            ns.func(ns)  # set by register_targets

            # __init__ received the Namespace as its single positional argument
            self.assertEqual(mock_init.call_count, 1)
            (called_ns,), _ = mock_init.call_args
            self.assertIsInstance(called_ns, argparse.Namespace)

            # run() called with no args
            mock_run.assert_called_once_with()

    def test_runner_docstring_used_as_description_when_missing(self):
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
        }
        foo_parser = get_subparser(build_parser(specs), "foo")
        self.assertIn("Foo description from docstring.", foo_parser.format_help())

    def test_missing_target_raises_systemexit_with_usage(self):
        specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}}
        parser = build_parser(specs)

        captured = io.StringIO()
        with self.assertRaises(SystemExit), redirect_stderr(captured):
            parser.parse_args([])
        self.assertIn("usage:", captured.getvalue())
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,75 @@
import unittest
from unittest import mock
from unittest.mock import MagicMock
import docker.errors as derr
from cli.lib.common.docker_helper import _get_client, local_image_exists
class TestDockerImageHelpers(unittest.TestCase):
    # Tests for local_image_exists and the cached client returned by _get_client.

    def setUp(self):
        # Reset the singleton in the target module for the duration of each test
        singleton_patch = mock.patch("cli.lib.common.docker_helper._docker_client", None)
        self.addCleanup(singleton_patch.stop)
        singleton_patch.start()

    def test_local_image_exists_true(self):
        # images.get returning any object (no exception) means the image exists
        client = MagicMock()
        client.images.get.return_value = object()
        self.assertTrue(local_image_exists("repo:tag", client=client))

    def test_local_image_exists_not_found_false(self):
        client = MagicMock()
        # Raise docker.errors.NotFound
        client.images.get.side_effect = derr.NotFound("nope")
        self.assertFalse(local_image_exists("missing:latest", client=client))

    def test_local_image_exists_api_error_false(self):
        client = MagicMock()
        client.images.get.side_effect = derr.APIError("boom", None)
        self.assertFalse(local_image_exists("broken:tag", client=client))

    def test_local_image_exists_uses_lazy_singleton(self):
        # Patch docker.from_env used by _get_client()
        target = "cli.lib.common.docker_helper.docker.from_env"
        with mock.patch(target) as mock_from_env:
            fake_client = MagicMock()
            mock_from_env.return_value = fake_client

            # First call should create and cache the client
            first = _get_client()
            self.assertIs(first, fake_client)
            mock_from_env.assert_called_once()

            # Second call should reuse cached client (no extra from_env calls)
            second = _get_client()
            self.assertIs(second, fake_client)
            mock_from_env.assert_called_once()  # still once

    def test_local_image_exists_without_client_param_calls_get_client_once(self):
        # Without an explicit client, local_image_exists obtains one via _get_client
        with mock.patch("cli.lib.common.docker_helper._get_client") as mock_get_client:
            fake_client = MagicMock()
            mock_get_client.return_value = fake_client

            images = ("repo:tag", "repo:tag2")
            for image in images:
                local_image_exists(image)

            # local_image_exists calls _get_client on every invocation; the
            # caching of docker.from_env lives inside _get_client itself.
            self.assertEqual(mock_get_client.call_count, 2)
            self.assertEqual(fake_client.images.get.call_count, 2)
            for image in images:
                fake_client.images.get.assert_any_call(image)
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,149 @@
import os
import unittest
from dataclasses import dataclass
from pathlib import Path
from unittest.mock import patch
import cli.lib.common.envs_helper as m
class TestEnvHelpers(unittest.TestCase):
    # Tests for the environment-variable helpers in cli.lib.common.envs_helper.

    def setUp(self):
        # Snapshot the environment so tearDown can restore it
        self._env_backup = dict(os.environ)

    def tearDown(self):
        # Put the environment back exactly as it was before the test
        os.environ.clear()
        os.environ.update(self._env_backup)

    # -------- get_env --------
    def test_get_env_unset_returns_default(self):
        with patch.dict(os.environ, {}, clear=True):
            value = m.get_env("FOO", "default")
            self.assertEqual(value, "default")

    def test_get_env_empty_returns_default(self):
        with patch.dict(os.environ, {"FOO": ""}, clear=True):
            value = m.get_env("FOO", "default")
            self.assertEqual(value, "default")

    def test_get_env_set_returns_value(self):
        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
            value = m.get_env("FOO", "default")
            self.assertEqual(value, "bar")

    def test_get_env_not_exist_returns_default(self):
        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
            value = m.get_env("TEST_NOT_EXIST", "default")
            self.assertEqual(value, "default")

    def test_get_env_not_exist_without_default(self):
        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
            value = m.get_env("TEST_NOT_EXIST")
            self.assertEqual(value, "")

    # -------- env_bool --------
    def test_env_bool_uses_default_when_unset(self):
        with patch.dict(os.environ, {}, clear=True):
            self.assertTrue(m.env_bool("FLAG", default=True))
            self.assertFalse(m.env_bool("FLAG", default=False))

    def test_env_bool_uses_str2bool_when_set(self):
        # Patch str2bool used by env_bool so we don't depend on its exact behavior
        truthy = {"1", "true", "yes", "on", "y"}

        def fake_str2bool(s: str) -> bool:
            return s.lower() in truthy

        with (
            patch.dict(os.environ, {"FLAG": "yEs"}, clear=True),
            patch.object(m, "str2bool", fake_str2bool),
        ):
            self.assertTrue(m.env_bool("FLAG", default=False))

    # -------- env_path_optional / env_path --------
    def test_env_path_optional_unset_returns_none_by_default(self):
        with patch.dict(os.environ, {}, clear=True):
            result = m.env_path_optional("P")
            self.assertIsNone(result)

    def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self):
        with patch.dict(os.environ, {"P": ""}, clear=True):
            result = m.env_path_optional("P")
            self.assertIsNone(result)

    def test_env_path_optional_unset_returns_default_str(self):
        # default as string; resolve=True by default -> absolute path
        with patch.dict(os.environ, {}, clear=True):
            result = m.env_path_optional("P", default="x/y")
            self.assertIsInstance(result, Path)
            self.assertIsNotNone(result)
            if result:
                self.assertTrue(result.is_absolute())
                self.assertEqual(result.parts[-2:], ("x", "y"))

    def test_env_path_optional_unset_returns_default_path_no_resolve(self):
        default_path = Path("z")
        with patch.dict(os.environ, {}, clear=True):
            result = m.env_path_optional("P", default=default_path, resolve=False)
            self.assertEqual(result, default_path)

    def test_env_path_optional_respects_resolve_true(self):
        with patch.dict(os.environ, {"P": "a/b"}, clear=True):
            result = m.env_path_optional("P", resolve=True)
            self.assertIsInstance(result, Path)
            if result:
                self.assertTrue(result.is_absolute())

    def test_env_path_optional_respects_resolve_false(self):
        with patch.dict(os.environ, {"P": "rel/dir"}, clear=True):
            result = m.env_path_optional("P", resolve=False)
            self.assertEqual(result, Path("rel/dir"))
            if result:
                self.assertFalse(result.is_absolute())

    def test_env_path_raises_when_missing_and_default_none(self):
        with patch.dict(os.environ, {}, clear=True), self.assertRaises(ValueError):
            m.env_path("P", None, resolve=True)

    def test_env_path_returns_path_when_present(self):
        expected = Path("./b").resolve()
        with patch.dict(os.environ, {"P": str(expected)}, clear=True):
            result = m.env_path("P", None, resolve=True)
            self.assertEqual(result, expected)

    # -------- dataclass field helpers --------
    def test_dataclass_fields_read_env_at_instantiation(self):
        @dataclass
        class Cfg:
            flag: bool = m.env_bool_field("FLAG", default=False)
            out: Path = m.env_path_field("OUT", default="ab", resolve=True)
            name: str = m.env_str_field("NAME", default="anon")

        # First instantiation picks up the values present at that moment
        first_env = {"FLAG": "true", "OUT": "outdir", "NAME": "alice"}
        with patch.dict(os.environ, first_env, clear=True):
            cfg1 = Cfg()
            self.assertTrue(cfg1.flag)
            self.assertIsInstance(cfg1.out, Path)
            self.assertTrue(cfg1.out.is_absolute())
            self.assertEqual(cfg1.name, "alice")
            cfg1.name = "bob"  # change instance value
            self.assertEqual(cfg1.name, "bob")  # change is reflected

        # Change env; new instance should reflect new values
        second_env = {"FLAG": "false", "NAME": ""}
        with patch.dict(os.environ, second_env, clear=True):
            cfg2 = Cfg()
            self.assertFalse(cfg2.flag)  # str2bool("false") -> False
            self.assertIn("ab", str(cfg2.out))
            self.assertIsInstance(cfg2.out, Path)
            self.assertTrue(cfg2.out.is_absolute())
            self.assertEqual(cfg2.name, "anon")  # empty -> fallback to default

    def test_dataclass_path_field_with_default_value(self):
        @dataclass
        class C2:
            out: Path = m.env_path_field("OUT", default="some/dir", resolve=False)

        with patch.dict(os.environ, {}, clear=True):
            instance = C2()
            self.assertEqual(instance.out, Path("some/dir"))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,122 @@
# test_path_utils.py
# Run: pytest -q
import os
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
force_create_dir,
get_path,
is_path_exist,
remove_dir,
)
class TestPathHelper(unittest.TestCase):
def setUp(self):
self.tmpdir = TemporaryDirectory()
self.tmp_path = Path(self.tmpdir.name)
def tearDown(self):
self.tmpdir.cleanup()
# -------- get_path --------
def test_get_path_returns_path_for_str(self):
# Use relative path to avoid absolute-ness
rel_str = "sub/f.txt"
os.chdir(self.tmp_path)
p = get_path(rel_str, resolve=False)
self.assertIsInstance(p, Path)
self.assertFalse(p.is_absolute())
self.assertEqual(str(p), rel_str)
def test_get_path_resolves(self):
rel_str = "sub/f.txt"
p = get_path(str(self.tmp_path / rel_str), resolve=True)
self.assertTrue(p.is_absolute())
self.assertTrue(str(p).endswith(rel_str))
def test_get_path_with_path_input(self):
p_in = self.tmp_path / "sub/f.txt"
p_out = get_path(p_in, resolve=False)
self.assertTrue(str(p_out) == str(p_in))
def test_get_path_with_none_raises(self):
with self.assertRaises(ValueError):
get_path(None) # type: ignore[arg-type]
def test_get_path_invalid_type_raises(self):
with self.assertRaises(TypeError):
get_path(123) # type: ignore[arg-type]
# -------- ensure_dir_exists / force_create_dir / remove_dir --------
def test_ensure_dir_exists_creates_and_is_idempotent(self):
d = self.tmp_path / "made"
ensure_dir_exists(d)
self.assertTrue(d.exists() and d.is_dir())
ensure_dir_exists(d)
def test_force_create_dir_clears_existing(self):
d = self.tmp_path / "fresh"
(d / "inner").mkdir(parents=True)
(d / "inner" / "f.txt").write_text("x")
force_create_dir(d)
self.assertTrue(d.exists())
self.assertEqual(list(d.iterdir()), [])
def test_remove_dir_none_is_noop(self):
remove_dir(None) # type: ignore[arg-type]
def test_remove_dir_nonexistent_is_noop(self):
ghost = self.tmp_path / "ghost"
remove_dir(ghost)
def test_remove_dir_accepts_str(self):
d = self.tmp_path / "to_rm"
d.mkdir()
remove_dir(str(d))
self.assertFalse(d.exists())
# -------- copy --------
def test_copy_file_to_file(self):
src = self.tmp_path / "src.txt"
dst = self.tmp_path / "out" / "dst.txt"
src.write_text("hello")
copy(src, dst)
self.assertEqual(dst.read_text(), "hello")
def test_copy_dir_to_new_dir(self):
src = self.tmp_path / "srcdir"
(src / "a").mkdir(parents=True)
(src / "a" / "f.txt").write_text("content")
dst = self.tmp_path / "destdir"
copy(src, dst)
self.assertEqual((dst / "a" / "f.txt").read_text(), "content")
def test_copy_dir_into_existing_dir_overwrite_true_merges(self):
src = self.tmp_path / "srcdir"
dst = self.tmp_path / "destdir"
(src / "x").mkdir(parents=True)
(src / "x" / "new.txt").write_text("new")
dst.mkdir()
(dst / "existing.txt").write_text("old")
copy(src, dst)
self.assertEqual((dst / "existing.txt").read_text(), "old")
self.assertEqual((dst / "x" / "new.txt").read_text(), "new")
def test_is_str_path_exist(self):
    """is_path_exist handles both str and Path inputs, and treats "" as missing."""
    present = self.tmp_path / "x.txt"
    present.write_text("1")
    absent = self.tmp_path / "missing"

    for candidate in (str(present), present):
        self.assertTrue(is_path_exist(candidate))

    for candidate in (str(absent), absent, ""):
        self.assertFalse(is_path_exist(candidate))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,181 @@
import os
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
import cli.lib.core.vllm as vllm
class TestVllmBuildParameters(unittest.TestCase):
    """Checks how ``vllm.VllmBuildParameters`` validates its inputs.

    Every test patches ``os.environ``; the existence probes
    (``is_path_exist`` / ``local_image_exists``) are patched as needed so no
    real filesystem layout or Docker daemon is required.
    """

    def _enter_temp_cwd(self):
        """Run the rest of the test from a fresh temporary working directory.

        Cleanups run in LIFO order, so the original working directory is
        restored *before* the temporary directory is deleted. Without this the
        process would be left sitting in a removed directory, which breaks any
        later ``os.getcwd()`` call (and prevents the removal on Windows).
        """
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self.addCleanup(os.chdir, os.getcwd())
        os.chdir(tmp.name)

    @patch("cli.lib.core.vllm.local_image_exists", return_value=True)
    @patch("cli.lib.core.vllm.is_path_exist", return_value=True)
    @patch(
        "cli.lib.common.envs_helper.env_path_optional",
        side_effect=lambda name, default=None, resolve=True: {
            "DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"),
            "TORCH_WHEELS_PATH": Path("/abs/dist"),
            "OUTPUT_DIR": Path("/abs/shared"),
        }.get(name, Path(default) if default is not None else None),
    )
    @patch.dict(
        os.environ,
        {
            "USE_TORCH_WHEEL": "1",
            "USE_LOCAL_BASE_IMAGE": "1",
            "USE_LOCAL_DOCKERFILE": "1",
            "BASE_IMAGE": "my/image:tag",
            "DOCKERFILE_PATH": "vllm/Dockerfile",
            "TORCH_WHEELS_PATH": "dist",
            "OUTPUT_DIR": "shared",
        },
        clear=True,
    )
    def test_params_success_normalizes_and_validates(
        self, mock_env_path, mock_is_path, mock_local_img
    ):
        """With every probe succeeding, the parameters expose the patched absolute paths."""
        params = vllm.VllmBuildParameters()
        self.assertEqual(params.torch_whls_path, Path("/abs/dist"))
        self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile"))
        self.assertEqual(params.output_dir, Path("/abs/shared"))
        self.assertEqual(params.base_image, "my/image:tag")

    @patch("cli.lib.core.vllm.is_path_exist", return_value=False)
    @patch.dict(
        os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True
    )
    def test_params_missing_torch_whls_raises(self, _is_path):
        """A missing wheels path raises ValueError mentioning TORCH_WHEELS_PATH."""
        self._enter_temp_cwd()
        with self.assertRaises(ValueError) as cm:
            vllm.VllmBuildParameters(
                use_local_base_image=False,
                use_local_dockerfile=False,
            )
        err = cm.exception
        self.assertIn("TORCH_WHEELS_PATH", str(err))

    @patch("cli.lib.core.vllm.local_image_exists", return_value=False)
    @patch.dict(
        os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True
    )
    def test_params_missing_local_base_image_raises(self, _local_img):
        """A base image that is not available locally raises ValueError mentioning BASE_IMAGE."""
        self._enter_temp_cwd()
        with self.assertRaises(ValueError) as cm:
            vllm.VllmBuildParameters(
                use_torch_whl=False,
                use_local_dockerfile=False,
            )
        err = cm.exception
        self.assertIn("BASE_IMAGE", str(err))

    @patch("cli.lib.core.vllm.is_path_exist", return_value=False)
    @patch.dict(
        os.environ,
        {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"},
        clear=True,
    )
    def test_params_missing_dockerfile_raises(self, _is_path):
        """A missing local Dockerfile raises ValueError mentioning DOCKERFILE_PATH."""
        self._enter_temp_cwd()
        with self.assertRaises(ValueError) as cm:
            vllm.VllmBuildParameters(
                use_torch_whl=False,
                use_local_base_image=False,
            )
        err = cm.exception
        self.assertIn("DOCKERFILE_PATH", str(err))

    @patch("cli.lib.core.vllm.is_path_exist", return_value=False)
    @patch.dict(
        os.environ,
        {"OUTPUT_DIR": ""},
        clear=True,
    )
    def test_params_missing_output_dir(self, _is_path):
        """An empty OUTPUT_DIR raises FileNotFoundError."""
        with self.assertRaises(FileNotFoundError):
            vllm.VllmBuildParameters()
class TestBuildCmdAndRun(unittest.TestCase):
    """Tests for ``vllm.VllmBuildRunner``: command generation and the ``run()`` orchestration."""

    @patch("cli.lib.core.vllm.local_image_exists", return_value=True)
    def test_generate_docker_build_cmd_includes_bits(self, _exists):
        """The generated docker command contains every expected flag and build-arg."""
        runner = vllm.VllmBuildRunner()

        # Craft inputs that simulate a prepared build
        inputs = MagicMock()
        inputs.output_dir = Path("/abs/out")
        inputs.use_local_base_image = True
        inputs.base_image = "img:tag"
        inputs.torch_whls_path = Path("./vllm/tmp")
        inputs.max_jobs = 64
        inputs.cuda_version = "12.8.1"
        inputs.python_version = "3.12"
        inputs.sccache_bucket = "my-bucket"
        inputs.sccache_region = "us-west-2"
        inputs.torch_cuda_arch_list = "8.0;9.0"
        inputs.target_stage = "export-wheels"
        inputs.tag_name = "vllm-wheels"

        cmd = runner._generate_docker_build_cmd(inputs)
        squashed = " ".join(cmd.split())  # normalize whitespace for matching

        expected_fragments = (
            "--output type=local,dest=/abs/out",
            "-f docker/Dockerfile.nightly_torch",
            "--pull=false",
            "--build-arg TORCH_WHEELS_PATH=tmp",
            "--build-arg BUILD_BASE_IMAGE=img:tag",
            "--build-arg FINAL_BASE_IMAGE=img:tag",
            "--build-arg max_jobs=64",
            "--build-arg CUDA_VERSION=12.8.1",
            "--build-arg PYTHON_VERSION=3.12",
            "--build-arg USE_SCCACHE=1",
            "--build-arg SCCACHE_BUCKET_NAME=my-bucket",
            "--build-arg SCCACHE_REGION_NAME=us-west-2",
            "--build-arg torch_cuda_arch_list='8.0;9.0'",
            "--target export-wheels",
            "-t vllm-wheels",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, squashed)

    @patch("cli.lib.core.vllm.run_command")
    @patch("cli.lib.core.vllm.ensure_dir_exists")
    @patch("cli.lib.core.vllm.clone_vllm")
    @patch.object(
        vllm.VllmBuildRunner,
        "_generate_docker_build_cmd",
        return_value="docker buildx ...",
    )
    @patch.dict(
        os.environ,
        {
            # Make __post_init__ validations pass cheaply
            "USE_TORCH_WHEEL": "0",
            "USE_LOCAL_BASE_IMAGE": "0",
            "USE_LOCAL_DOCKERFILE": "0",
            "OUTPUT_DIR": "shared",
        },
        clear=True,
    )
    def test_run_calls_clone_prepare_and_build(
        self, mock_gen, mock_clone, mock_ensure, mock_run
    ):
        """run() clones vllm, ensures the output dir, builds the command and executes it in ``vllm``."""
        # Stub parameters instance so we avoid FS/Docker accesses in run()
        params = MagicMock()
        params.output_dir = Path("shared")
        params.use_local_dockerfile = False
        params.use_torch_whl = False

        with patch("cli.lib.core.vllm.VllmBuildParameters", return_value=params):
            runner = vllm.VllmBuildRunner()
            runner.run()

        mock_clone.assert_called_once()
        mock_ensure.assert_called_once_with(Path("shared"))
        mock_gen.assert_called_once_with(params)
        mock_run.assert_called_once()

        # ensure we run in vllm workdir
        _, kwargs = mock_run.call_args
        # Use a unittest assertion: a bare `assert` is stripped under `python -O`
        # and reports no expected/actual values on failure.
        self.assertEqual(kwargs.get("cwd"), "vllm")
if __name__ == "__main__":
unittest.main()

View File

@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
# hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library

View File

@ -627,6 +627,8 @@ test_perf_for_dashboard() {
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -801,6 +803,16 @@ test_dynamo_benchmark() {
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
if [[ "${suite}" == "huggingface" ]]; then
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@ -1627,6 +1639,16 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
install_torchvision
build_xla
test_xla
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
(cd .ci/lumen_cli && python -m pip install -e .)
if [[ "$BUILD_ENVIRONMENT" == *sm80* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0"
elif [[ "$BUILD_ENVIRONMENT" == *sm90* ]]; then
export TORCH_CUDA_ARCH_LIST="9.0"
else
export TORCH_CUDA_ARCH_LIST="8.9"
fi
python -m cli.run test external vllm --test-plan "$TEST_CONFIG"
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then

View File

@ -1 +1 @@
bf305f538005f2e900f8850ed57146024a8bc559
9b57c7bd5ad4db093c5bb31c802df9f04d933ac9

View File

@ -1 +1 @@
ca9e2be3ed6320b51f52f536595cd24e254f8bb2
53d7c39271aeb0568afcae337396a972e1848586

View File

@ -1 +1 @@
29ae4c76c026185f417a25e841d2cd5e65f087a3
b6a5b82b9948b610fa4c304d0d869c82b8f17db1

View File

@ -0,0 +1,414 @@
# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo
# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
# BUILD_BASE_IMAGE: used to set up Python and build the xformers and vllm wheels. It can be replaced with a different base image from the local machine;
# by default, it uses the torch-nightly-base stage from this docker image
ARG BUILD_BASE_IMAGE=torch-nightly-base
# FINAL_BASE_IMAGE: used to set up the vllm-installed environment and build flashinfer,
# by default, it uses devel-ubuntu22.04 official image.
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
#################### TORCH NIGHTLY BASE IMAGE ####################
# A base image for building vLLM with devel ubuntu 22.04; this is mainly used to build vllm in the vllm buildkite CI
# Stage header for the torch-nightly base image. Keywords are upper-cased to
# match every other instruction in this file.
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS torch-nightly-base
# Re-declared inside the stage: ARGs defined before the first FROM are only
# visible to FROM lines.
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies if they do not already exist
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
echo "Installing Python ${PYTHON_VERSION}..." && \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
apt-get update -y && \
apt-get install -y ccache software-properties-common git curl sudo && \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done && \
apt-get update -y && \
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
else \
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
fi \
&& python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
if [ "$current_gcc_version" -lt 10 ]; then \
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
apt-get update && \
apt-get install -y gcc-10 g++-10 && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
else \
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
fi && \
gcc --version && g++ --version
# install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
#################### TORCH NIGHTLY BASE IMAGE ####################
#################### BASE BUILD IMAGE ####################
# A base image for building vLLM with torch nightly or torch wheels
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
USER root
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# install build and runtime dependencies without stable torch version
RUN python3 use_existing_torch.py
# default mount file as placeholder, this just avoid the mount error
# change to a different vllm folder if this does not exist anymore
ARG TORCH_WHEELS_PATH="./requirements"
ARG PINNED_TORCH_VERSION
# Install torch, torchaudio and torchvision based on the input
# if TORCH_WHEELS_PATH is the default "./requirements", it will pull the nightly versions using pip
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
uv pip install --system "${torch_whl}[opt-einsum]"; \
uv pip install --system "${vision_whl}"; \
uv pip install --system "${audio_whl}"; \
elif [ -n "$PINNED_TORCH_VERSION" ]; then \
echo "[INFO] Installing pinned torch nightly version: $PINNED_TORCH_VERSION"; \
uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
else \
echo "[INFO] Installing torch nightly with latest one"; \
uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
fi
# Install numba 0.61.2 for cuda environment
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system numba==0.61.2
# Install common dependencies from vllm common.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# Must be set before installing xformers, so that the correct version of xformers is installed.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
# Build xformers with cuda and torch nightly/wheel
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl --verbose
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
# Image used to build vllm wheel
FROM base AS build
ARG TARGETPLATFORM
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
COPY . .
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
fi
RUN echo "[DEBUG] Listing current directory:" && \
ls -al && \
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
#################### WHEEL BUILD IMAGE ####################
################### VLLM INSTALLED IMAGE ####################
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
FROM ${FINAL_BASE_IMAGE} AS vllm-base
USER root
# ARGs declared before the first FROM are out of scope inside a stage unless
# re-declared. Without these two lines ${PYTHON_VERSION} and $CUDA_VERSION
# expand to empty strings in the RUN instructions of this stage (unless the
# base image happens to export them as ENV), so the Python version check below
# would match anything and the install branch would request "python" with no version.
ARG CUDA_VERSION
ARG PYTHON_VERSION
# prepare for environment starts
WORKDIR /workspace
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies if they do not already exist
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
echo "Installing Python ${PYTHON_VERSION}..." && \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
apt-get update -y && \
apt-get install -y ccache software-properties-common git curl sudo && \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done && \
apt-get update -y && \
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
else \
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
fi \
&& python3 --version && python3 -m pip --version
# Get the torch versions and wheels used in previous stages for consistency
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
RUN echo "[DEBUG] Listing current directory before torch install step:" && \
ls -al && \
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Default mount file as placeholder, this just avoid the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision
# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
echo "Found: '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
uv pip install --system "${torch_whl}[opt-einsum]"; \
uv pip install --system "${vision_whl}"; \
uv pip install --system "${audio_whl}"; \
else \
echo "[INFO] Installing torch versions from torch_build_versions.txt"; \
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \
fi
# Install the vllm wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/vllm/*.whl --verbose
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install build==1.3.0
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build flashinfer for torch nightly from source around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer \
&& echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \
&& cd flashinfer \
&& python3 -m flashinfer.aot \
&& python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \
&& cd .. \
&& rm -rf flashinfer
# install flashinfer python
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system wheels/flashinfer/*.whl --verbose
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
################### VLLM INSTALLED IMAGE ####################
#################### UNITTEST IMAGE #############################
# Test image: built on top of the vllm-installed stage.
FROM vllm-base AS test
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
COPY tests/ tests/
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# Install build and runtime dependencies without stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
RUN python3 use_existing_torch.py
# install packages
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
# key=value form; the whitespace-separated `ENV key value` syntax is legacy
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Workaround for #17068
# pinned commit for v2.2.4
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm"
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
#################### UNITTEST IMAGE #############################
#################### EXPORT STAGE ####################
# Empty image that only carries the wheels out of the build.
FROM scratch AS export-wheels
# Just copy the wheels we prepared in previous stages
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python

View File

@ -488,6 +488,10 @@
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
- test/dynamo_expected_failures/**
- test/dynamo_skips/**
- test/inductor_expected_failures/**
- test/inductor_skips/**
approved_by:
- guilhermeleobas
mandatory_checks_name:

View File

@ -26,6 +26,7 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull

View File

@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -315,6 +315,11 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
):
continue
if use_split_build and (
arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"

View File

@ -0,0 +1,292 @@
name: linux-external-build
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
build-target:
required: true
type: string
description: target library to build
build-generates-artifacts:
required: false
type: boolean
default: true
description: If set, upload generated build artifacts.
artifacts-folder-name:
required: false
type: string
description: must be different from build-environment
default: ""
docker-image:
required: true
type: string
description: Docker image to run in or replace the external base image.
cuda-arch-list:
required: false
type: string
default: "8.9"
description: |
List of CUDA architectures CI build should target.
runner_prefix:
required: false
default: ""
type: string
description: Prefix for runner label
runner:
required: false
type: string
default: "linux.2xlarge"
description: |
Label of the runner this job should run on.
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
use-gha:
required: false
type: string
default: ""
description: If set to any value, use GHA to download the artifact. Otherwise use s3.
aws-role-to-assume:
description: Role to assume for downloading artifacts
required: false
type: string
default: ""
disable-monitor:
description: |
Disable utilization monitoring for build job
required: false
type: boolean
default: false
monitor-log-interval:
description: |
Set the interval for the monitor script to log utilization.
required: false
type: number
default: 5
monitor-data-collect-interval:
description: |
Set the interval for the monitor script to collect data.
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
SCRIBE_GRAPHQL_ACCESS_TOKEN:
required: false
description: |
FB app token to write to scribe endpoint
jobs:
build-external-lib:
environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }}
timeout-minutes: 240
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
Build is done inside the container, to start an interactive session run:
docker exec -it $(docker container ps --format '{{.ID}}') bash
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local
# checkout because when we run this action we don't *have* a local
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
if: ${{ inputs.aws-role-to-assume != ''}}
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-build
aws-region: us-east-1
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != ''}}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
WORKFLOW_NAME: ${{ github.workflow }}
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
mkdir -p ../../usage_logs
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor \
--log-interval "$MONITOR_LOG_INTERVAL" \
--data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
> "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*:}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Download pytorch build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
use-gha: ${{ inputs.use-gha }}
- name: Download TD artifacts
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Build external project
id: build
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGET: ${{ inputs.build-target }}
run: |
set -euo pipefail
python3 --version
docker images
START_TIME=$(date +%s)
(
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
python3 -m cli.run build external "$BUILD_TARGET"
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Archive artifacts into zip
if: ${{ inputs.build-generates-artifacts && steps.build.outcome && steps.build.outcome != 'skipped'}}
run: |
zip -1 -r artifacts.zip shared/
# By default it will upload the artifacts to <github_org>/<github_repo>/<workflow_id>/<name>-<target>-additional-build/
# to avoid overwriting the pytorch build artifacts
- name: Store External Build Artifacts on S3
if: ${{ inputs.build-generates-artifacts }}
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
with:
name: ${{ inputs.artifacts-folder-name || format('{0}-{1}-additional-build', inputs.build-environment, inputs.build-target) }}
retention-days: 14
if-no-files-found: warn
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Copy logs
shell: bash
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor}}
continue-on-error: true
run: |
rm -f ./usage_logs
mkdir -p ./usage_logs
cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/
- name: Upload raw usage log to s3
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor}}
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: warn
path: usage_logs/usage_log_build_*.txt
- name: Upload utilization stats
if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
- name: Cleanup docker
if: always()
shell: bash
run: |
docker stop -a || true
docker kill -a || true

View File

@ -47,6 +47,12 @@ on:
required: false
type: string
default: ""
additional-artifact-name:
description: |
additional artifacts needed to be downloaded for testing
required: false
type: string
default: ""
disable-monitor:
description: |
[Experimental] Disable utilization monitoring for tests.
@ -72,6 +78,10 @@ on:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
VLLM_TEST_HUGGING_FACE_TOKEN:
required: false
description: |
HF Auth token to test vllm
SCRIBE_GRAPHQL_ACCESS_TOKEN:
required: false
description: |
@ -96,7 +106,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@ -109,7 +119,7 @@ jobs:
no-sudo: true
- name: Setup Python
if: matrix.runner == 'B200'
if: contains(matrix.runner, 'b200')
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
@ -117,7 +127,7 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@ -128,7 +138,7 @@ jobs:
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@ -166,17 +176,17 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@ -216,6 +226,14 @@ jobs:
s3-bucket: ${{ inputs.s3-bucket }}
use-gha: ${{ inputs.use-gha }}
- name: Download additional build artifacts
if: ${{ inputs.additional-artifact-name != ''}}
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.additional-artifact-name }}
s3-bucket: ${{ inputs.s3-bucket }}
use-gha: ${{ inputs.use-gha }}
- name: Download TD artifacts
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
@ -277,8 +295,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -286,6 +304,7 @@ jobs:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
@ -362,6 +381,7 @@ jobs:
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e HUGGING_FACE_HUB_TOKEN \
-e VLLM_TEST_HUGGING_FACE_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e DASHBOARD_TAG \
-e ARTIFACTS_FILE_SUFFIX \
@ -403,7 +423,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ matrix.runner == 'B200' }}
if: ${{ contains(matrix.runner, 'b200') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results

View File

@ -76,7 +76,8 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu
]
include:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,154 @@
name: inductor-perf-b200
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 10.0 is only built for B200
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
# Manually dispatched benchmark run. The dashboard tag is assembled from the
# workflow_dispatch inputs; cudagraphs_low_precision deliberately follows the
# `cudagraphs` input because of the 10-input limit noted at the top of this
# workflow. Scheduled runs are served by test-periodically / test-weekly, and
# the `inputs` context is empty on schedule events, so this job is restricted
# to manual dispatch to avoid occupying a runner with an empty dashboard tag.
test:
  name: cuda12.8-py3.10-gcc9-sm100
  uses: ./.github/workflows/_linux-test.yml
  needs: build
  if: github.event_name == 'workflow_dispatch'
  with:
    build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
    dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
    docker-image: ${{ needs.build.outputs.docker-image }}
    test-matrix: ${{ needs.build.outputs.test-matrix }}
    aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
    timeout-minutes: 720
    disable-monitor: false
    monitor-log-interval: 15
    monitor-data-collect-interval: 4
  secrets: inherit

View File

@ -81,21 +81,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
]}
secrets: inherit

View File

@ -75,10 +75,11 @@ jobs:
repo-owner: pytorch
branch: main
pin-folder: .github/ci_commit_pins
- repo-name: executorch
repo-owner: pytorch
branch: main
pin-folder: .ci/docker/ci_commit_pins
# executorch jobs are disabled since it needs some manual work for the hash update
# - repo-name: executorch
# repo-owner: pytorch
# branch: main
# pin-folder: .ci/docker/ci_commit_pins
- repo-name: triton
repo-owner: triton-lang
branch: main

View File

@ -434,6 +434,7 @@ jobs:
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type

70
.github/workflows/tools-unit-tests.yml vendored Normal file
View File

@ -0,0 +1,70 @@
name: test-scripts-and-ci-tools
# Trigger on pushes to main and on pull requests that touch the lumen_cli
# sources or this workflow file itself.
on:
  push:
    branches:
      - main
    paths:
      # The jobs in this workflow install and test the package from
      # .ci/lumen_cli/, so that tree must be part of the path filter;
      # otherwise changes to the tested code never run these tests.
      - .ci/lumen_cli/**
      # Kept for backward compatibility with the previous filter.
      - scripts/lumen_cli/**
      - .github/workflows/tools-unit-tests.yml
  pull_request:
    paths:
      - .ci/lumen_cli/**
      - scripts/lumen_cli/**
      - .github/workflows/tools-unit-tests.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
lumen-cli-unit-tests-python312:
permissions:
contents: read
pull-requests: write
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-latest
steps:
- name: Checkout pytorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: true
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
cache: pip
- name: Run tests
continue-on-error: true
run: |
set -ex
python3 -m venv /tmp/venv
source /tmp/venv/bin/activate
pip install -e .ci/lumen_cli/
pytest -v -s .ci/lumen_cli/tests/*
lumen-cli-compatible-python39:
permissions:
contents: read
pull-requests: write
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-latest
steps:
- name: Checkout pytorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: true
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.9'
cache: 'pip'
- name: Run tests
continue-on-error: true
run: |
set -ex
python3 -m venv /tmp/venv
source /tmp/venv/bin/activate
pip install -e .ci/lumen_cli/

View File

@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

123
.github/workflows/vllm.yml vendored Normal file
View File

@ -0,0 +1,123 @@
name: vllm-test
on:
push:
tags:
- ciflow/vllm/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
torch-build-sm89:
name: ci-vllm-test-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-additional-packages: "vision audio torchao"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.9'
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
torch-build-sm80:
name: ci-vllm-test-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-additional-packages: "vision audio torchao"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "vllm_entrypoints_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit
vllm-build-sm89:
name: ci-vllm-test-sm89
uses: ./.github/workflows/_linux-external-build-main.yml
needs: [
get-label-type,
torch-build-sm89
]
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
build-target: vllm
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
docker-image: ${{ needs.torch-build-sm89.outputs.docker-image }}
cuda-arch-list: '8.9'
runner: linux.24xlarge.memory
secrets: inherit
vllm-test-sm89:
name: ci-vllm-test-sm89
uses: ./.github/workflows/_linux-test.yml
needs: [
torch-build-sm89,
vllm-build-sm89
]
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
docker-image: ${{ needs.torch-build-sm89.outputs.docker-image }}
test-matrix: ${{ needs.torch-build-sm89.outputs.test-matrix }}
additional-artifact-name: linux-jammy-cuda12.8-py3.12-gcc11-sm89-vllm-additional-build
secrets: inherit
vllm-build-sm80:
name: ci-vllm-test-sm80
uses: ./.github/workflows/_linux-external-build-main.yml
needs: [
get-label-type,
torch-build-sm80
]
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
build-target: vllm
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
docker-image: ${{ needs.torch-build-sm80.outputs.docker-image }}
runner: linux.24xlarge.memory
cuda-arch-list: '8.0'
secrets: inherit
vllm-test-sm80:
name: ci-vllm-test-sm80
uses: ./.github/workflows/_linux-test.yml
needs: [
torch-build-sm80,
vllm-build-sm80
]
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
docker-image: ${{ needs.torch-build-sm80.outputs.docker-image }}
test-matrix: ${{ needs.torch-build-sm80.outputs.test-matrix }}
additional-artifact-name: linux-jammy-cuda12.8-py3.12-gcc11-sm80-vllm-additional-build
secrets: inherit

View File

@ -14,7 +14,6 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD
@ -196,3 +195,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan
# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99

View File

@ -439,6 +439,7 @@ if(USE_ROCM)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
@ -703,21 +704,17 @@ if(USE_MPS)
if(CAN_COMPILE_METAL)
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
list(APPEND AIR_BASIC ${TGT_BASIC})
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
endforeach()
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
add_custom_command(
COMMAND echo "// $$(date)" > metallib_dummy.cpp
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
DEPENDS kernels_basic.metallib
OUTPUT metallib_dummy.cpp
COMMENT "Updating metallibs timestamp")
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
else()
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
foreach(SHADER ${native_mps_metal})

View File

@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
}
bool pinned_use_background_threads() override {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}

View File

@ -24,6 +24,29 @@ static void _assert_match(const O& original, const C& compared, const std::strin
}
}
// Device specialization of _assert_match.
//
// Does nothing when `compared` is empty. Otherwise throws std::runtime_error
// when the device types differ, or when both devices carry an index and the
// indices differ. If either side has no index (e.g. a bare "cuda"), only the
// device type is compared.
template<>
void _assert_match<c10::Device, std::optional<c10::Device>>(
    const c10::Device& original,
    const std::optional<c10::Device>& compared,
    const std::string& name) {
  if (!compared.has_value()) {
    return;
  }
  const c10::Device& expected = *compared;

  const bool type_differs = original.type() != expected.type();
  // Short-circuit so the index is only inspected once the types agree.
  const bool index_differs = !type_differs && expected.has_index() &&
      original.has_index() && expected.index() != original.index();

  if (type_differs || index_differs) {
    std::stringstream msg;
    msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
    throw std::runtime_error(msg.str());
  }
}
void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional<c10::ScalarType> dtype, std::optional<c10::Device> device, std::optional<c10::Layout> layout) {
_assert_match(tensor.sym_sizes(), sizes, "sizes");
_assert_match(tensor.sym_strides(), strides, "strides");

View File

@ -367,27 +367,27 @@ void int8pack_mm_kernel_(
auto* C_data = C.data_ptr<T>();
const auto* S_data = scales.const_data_ptr<T>();
int M = A.size(0);
int N = B.size(0);
int K = A.size(1);
int lda = A.stride(0);
constexpr int BLOCK_M = 4;
constexpr int BLOCK_N = 4;
int64_t M = A.size(0);
int64_t N = B.size(0);
int64_t K = A.size(1);
int64_t lda = A.stride(0);
constexpr int64_t BLOCK_M = 4;
constexpr int64_t BLOCK_N = 4;
const int MB = (M + BLOCK_M - 1) / BLOCK_M;
const int NB = (N + BLOCK_N - 1) / BLOCK_N;
const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;
const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;
at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
int mb{0}, nb{0};
at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
int64_t mb{0}, nb{0};
data_index_init(begin, mb, MB, nb, NB);
for (const auto i : c10::irange(begin, end)) {
(void)i;
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start);
int64_t mb_start = mb * BLOCK_M;
int64_t mb_size = std::min(BLOCK_M, M - mb_start);
int64_t nb_start = nb * BLOCK_N;
int64_t nb_size = std::min(BLOCK_N, N - nb_start);
const auto* A_ptr = A_data + mb_start * lda;
const auto* B_ptr = B_data + nb_start * K;

View File

@ -526,7 +526,7 @@ namespace {
// we are dealing with packed tensor here. max index is the same as numel.
// TODO: to really support input tensor large enought to go beyond int32,
// TODO: to really support input tensor large enough to go beyond int32,
// we will need to restrict out shared memory usage and adjust the launch
// config;
AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max());
@ -681,7 +681,7 @@ namespace {
const dim3 grid(grid_x, grid_y, grid_z);
// we are dealing with packed tensor here. max index is the same as numel.
// TODO: to really support input tensor large enought to go beyond int32,
// TODO: to really support input tensor large enough to go beyond int32,
// we will need to restrict out shared memory usage and adjust the launch
// config;
AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max());

View File

@ -1634,6 +1634,9 @@ bool use_fast_accum) {
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
TORCH_CHECK(
mat_a.size(-1) % 16 == 0,
"Expected trailing dimension of mat_a to be divisible by 16 ",
@ -1716,6 +1719,9 @@ std::optional<c10::ScalarType> out_dtype) {
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);

View File

@ -223,7 +223,7 @@ inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bo
class CuFFTConfig {
public:
// Only move semantics is enought for this class. Although we already use
// Only move semantics is enough for this class. Although we already use
// unique_ptr for the plan, still remove copy constructor and assignment op so
// we don't accidentally copy and take perf hit.
CuFFTConfig(const CuFFTConfig&) = delete;

View File

@ -241,6 +241,8 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
Strides tensor_StrideA = make_strides(mat_a.strides());
Strides tensor_StrideB = make_strides(mat_b.strides());
Strides tensor_StrideOutput = make_strides(out.strides());
Strides tensor_ShapeA = make_strides(mat_a.sizes());
Strides tensor_ShapeB = make_strides(mat_b.sizes());
at::cuda::detail::prepare_grouped_gemm_data<<<1, group_count, 0, stream>>>(
reinterpret_cast<DtypeA*>(mat_a.data_ptr()),
@ -264,6 +266,8 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
tensor_StrideA,
tensor_StrideB,
tensor_StrideOutput,
tensor_ShapeA,
tensor_ShapeB,
0,
0,
a_row_major,

View File

@ -38,18 +38,20 @@ __global__ void prepare_grouped_gemm_data(
Strides tensor_StrideA,
Strides tensor_StrideB,
Strides tensor_StrideOutput,
Strides tensor_ShapeA,
Strides tensor_ShapeB,
int64_t a_scale_stride,
int64_t b_scale_stride,
bool a_row_major = true,
bool b_row_major = false) {
int32_t tid = threadIdx.x;
int32_t delta = 0;
int32_t offset = 0;
if (offs != nullptr) {
int32_t start = tid == 0 ? 0 : offs[tid - 1];
delta = offs[tid] - start;
if (K < 0) {
CUDA_KERNEL_ASSERT(delta >=0 && "expected ofsets to be greater or equal 0\n");
}
offset = offs[tid];
delta = offset - start;
CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n");
// TMA transfers require global memory tensor addresses to be
// aligned to 16 bytes.
@ -84,6 +86,7 @@ __global__ void prepare_grouped_gemm_data(
int64_t lda, ldb, ldoutput;
if (M < 0) {
// A and output is 2d
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n");
M = delta;
lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2];
@ -96,6 +99,7 @@ __global__ void prepare_grouped_gemm_data(
output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput;
B_ptrs[tid] = B + tid * tensor_StrideB[0];
} else if (N < 0) {
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n");
N = delta;
lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2];
ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed
@ -108,6 +112,7 @@ __global__ void prepare_grouped_gemm_data(
inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1];
}
} else if (K < 0) {
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n");
// A, B is 2d, output is 3d
K = delta;
lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];

View File

@ -298,6 +298,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
Strides tensor_StrideA = make_strides(mat_a.strides());
Strides tensor_StrideB = make_strides(mat_b.strides());
Strides tensor_StrideOutput = make_strides(out.strides());
Strides tensor_ShapeA = make_strides(mat_a.sizes());
Strides tensor_ShapeB = make_strides(mat_b.sizes());
// scale stride will be used inside the kernel only if needed,
// so for 1d scales the "1" assigned here won't be used
int64_t a_scale_stride = scale_a.stride(0);
@ -325,6 +328,8 @@ void f8f8bf16_grouped_gemm_impl_sm90(
tensor_StrideA,
tensor_StrideB,
tensor_StrideOutput,
tensor_ShapeA,
tensor_ShapeB,
a_scale_stride,
b_scale_stride);

View File

@ -28,6 +28,22 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
TORCH_CHECK(false, "cudnn_batch_norm: ATen not compiled with cuDNN support");
}
// Stub for builds where ATen was not compiled with cuDNN support.
// It has the same signature as the real out-variant and unconditionally
// raises, so none of the arguments are read and nothing is returned.
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
    const Tensor& input,
    const Tensor& weight,
    const std::optional<Tensor>& bias,
    const std::optional<Tensor>& running_mean,
    const std::optional<Tensor>& running_var,
    bool training,
    double exponential_average_factor,
    double epsilon,
    Tensor& out,
    Tensor& save_mean,
    Tensor& save_var,
    Tensor& reserve) {
  // Use TORCH_CHECK(false, ...) rather than the deprecated AT_ERROR macro so
  // this stub reports the failure the same way as the cudnn_batch_norm stub.
  TORCH_CHECK(
      false, "cudnn_batch_norm_out: ATen not compiled with cuDNN support");
}
std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
const Tensor& input,
const Tensor& grad_output,
@ -120,7 +136,12 @@ size_t _get_cudnn_batch_norm_reserve_space_size(
return reserve_size;
}
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
// The `reserve` parameter is a placeholder; callers just pass an empty tensor.
// Usage:
// auto reserve = torch::empty({0}, torch::device(torch::kCUDA));
// at::native::cudnn_batch_norm_out(..., epsilon, output, save_mean, save_var,
// reserve);
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t_opt,
@ -128,7 +149,11 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
const std::optional<Tensor>& running_var_t_opt,
bool training,
double exponential_average_factor,
double epsilon) {
double epsilon,
Tensor& output_t,
Tensor& save_mean,
Tensor& save_var,
Tensor& reserve) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_t_maybe_owned =
at::borrow_from_optional_tensor(bias_t_opt);
@ -168,9 +193,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
cudnnBatchNormMode_t mode = getCudnnBatchNormMode(
training, input->suggest_memory_format(), input->dim());
auto output_t =
at::empty_like(*input, input->options(), input->suggest_memory_format());
TensorArg output{output_t, "output", 0};
auto handle = getCudnnHandle();
@ -182,15 +204,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
Constant one(dataType, 1);
Constant zero(dataType, 0);
Tensor save_mean, save_var;
Tensor reserve;
if (training) {
int64_t num_features = input_t.size(1);
save_mean = at::empty({num_features}, weight_t.options());
save_var = at::empty({num_features}, weight_t.options());
auto op = CUDNN_BATCHNORM_OPS_BN;
size_t workspace_size;
AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
@ -238,9 +253,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
reserve_size));
} else {
reserve = at::empty({0}, input->options().dtype(kByte));
// This keeps a consistent output with native_batch_norm
save_mean = at::empty({0}, weight_t.options());
save_var = at::empty({0}, weight_t.options());
AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
handle,
mode,
@ -261,10 +273,48 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
// save_mean and save_var can be undefined
// If this causes problems, we can initialize them to empty tensors
// of the correct type
return std::tuple<Tensor, Tensor, Tensor, Tensor>{
return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>{
output_t, save_mean, save_var, reserve};
}
// Allocating variant of cudnn_batch_norm_out: creates the output and the
// saved-statistics tensors, then forwards everything to the out-variant.
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
    const Tensor& input_t,
    const Tensor& weight_t,
    const std::optional<Tensor>& bias_t_opt,
    const std::optional<Tensor>& running_mean_t_opt,
    const std::optional<Tensor>& running_var_t_opt,
    bool training,
    double exponential_average_factor,
    double epsilon) {
  // The output mirrors the input's options and suggested memory format.
  Tensor output_t = at::empty_like(
      input_t, input_t.options(), input_t.suggest_memory_format());

  // In training mode the saved statistics hold input_t.size(1) entries each.
  // Otherwise they are zero-length, which keeps a consistent output with
  // native_batch_norm.
  const int64_t stat_numel = training ? input_t.size(1) : 0;
  Tensor save_mean = at::empty({stat_numel}, weight_t.options());
  Tensor save_var = at::empty({stat_numel}, weight_t.options());

  // Left undefined here and handed to the out-variant as a placeholder.
  Tensor reserve;

  return cudnn_batch_norm_out(
      input_t,
      weight_t,
      bias_t_opt,
      running_mean_t_opt,
      running_var_t_opt,
      training,
      exponential_average_factor,
      epsilon,
      output_t,
      save_mean,
      save_var,
      reserve);
}
// NB: CuDNN only implements the backward algorithm for batchnorm
// in training mode (evaluation mode batchnorm has a different algorithm),
// which is why this doesn't accept a 'training' parameter.

View File

@ -1,7 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Config.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/mkldnn/Matmul.h>
@ -428,56 +427,74 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
}
}
template <typename T>
bool use_mkldnn_typed_matmul(
bool use_mkldnn_bf16_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
bool dtype_check = false;
if constexpr (std::is_same_v<T, c10::BFloat16>) {
#if defined(__aarch64__)
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
// Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
// inputs, allow it for float as well
dtype_check = use_mkldnn_bf16_matmul() &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16));
}
#else
dtype_check = dtype_check && use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == kBFloat16);
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
// Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
// inputs, allow it for float as well
return (
use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
} else
#endif
} else if constexpr (std::is_same_v<T, c10::Half>) {
dtype_check = dtype_check && use_mkldnn_fp16_matmul() &&
(mat1.scalar_type() == kHalf);
} else if constexpr (std::is_same_v<T, float>) {
dtype_check = dtype_check &&
(use_mkldnn_bf32_matmul() || use_mkldnn_tf32_matmul()) &&
(mat1.scalar_type() == kFloat);
{
return (
use_mkldnn_bf16_matmul() && mat1.scalar_type() == kBFloat16 &&
mat2.scalar_type() == kBFloat16 &&
(!result.defined() || result.scalar_type() == kBFloat16) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}
if (!dtype_check) {
return false;
}
bool size_check =
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2);
dtype_check = (mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || result.scalar_type() == mat1.scalar_type());
return dtype_check && size_check;
}
// Returns true when this matmul may take the mkldnn fp16 path: the
// zero-argument use_mkldnn_fp16_matmul() overload (defined elsewhere) must
// allow it, both operands must be kHalf, `result` must be kHalf if defined,
// neither operand may be empty, and checksize() must accept the shapes.
bool use_mkldnn_fp16_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  if (!use_mkldnn_fp16_matmul()) {
    return false;
  }
  if (mat1.scalar_type() != kHalf || mat2.scalar_type() != kHalf) {
    return false;
  }
  // An undefined result places no constraint on the output dtype.
  if (result.defined() && result.scalar_type() != kHalf) {
    return false;
  }
  if (mat1.numel() == 0 || mat2.numel() == 0) {
    return false;
  }
  return checksize(mat1, mat2);
}
// Returns true when this matmul may take the mkldnn bf32 path: the
// zero-argument use_mkldnn_bf32_matmul() overload (defined elsewhere) must
// allow it, both operands must be kFloat, `result` must be kFloat if defined,
// neither operand may be empty, and checksize() must accept the shapes.
bool use_mkldnn_bf32_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  if (!use_mkldnn_bf32_matmul()) {
    return false;
  }
  if (mat1.scalar_type() != kFloat || mat2.scalar_type() != kFloat) {
    return false;
  }
  // An undefined result places no constraint on the output dtype.
  if (result.defined() && result.scalar_type() != kFloat) {
    return false;
  }
  if (mat1.numel() == 0 || mat2.numel() == 0) {
    return false;
  }
  return checksize(mat1, mat2);
}
// Returns true when this matmul may take the mkldnn tf32 path: the
// zero-argument use_mkldnn_tf32_matmul() overload (defined elsewhere) must
// allow it, both operands must be kFloat, `result` must be kFloat if defined,
// neither operand may be empty, and checksize() must accept the shapes.
bool use_mkldnn_tf32_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  if (!use_mkldnn_tf32_matmul()) {
    return false;
  }
  if (mat1.scalar_type() != kFloat || mat2.scalar_type() != kFloat) {
    return false;
  }
  // An undefined result places no constraint on the output dtype.
  if (result.defined() && result.scalar_type() != kFloat) {
    return false;
  }
  if (mat1.numel() == 0 || mat2.numel() == 0) {
    return false;
  }
  return checksize(mat1, mat2);
}
bool use_mkldnn_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
auto mat1_type = mat1.scalar_type();
if (mat1_type != kBFloat16 || mat1_type != kHalf || mat1_type != kFloat) {
return false;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
return use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
});
return false;
return (
use_mkldnn_bf16_matmul(mat1, mat2, result) ||
use_mkldnn_fp16_matmul(mat1, mat2, result) ||
use_mkldnn_bf32_matmul(mat1, mat2, result) ||
use_mkldnn_tf32_matmul(mat1, mat2, result));
}
static void _mkldnn_matmul_i8i8i32_with_primitive(

View File

@ -469,4 +469,94 @@ Tensor _weight_int4pack_mm_xpu(
return C;
}
// Out-variant of the int8 x int8 -> int32 matrix multiply for XPU.
//
// `self` and `mat2` must be 2-D int8 (at::kChar) tensors with
// self.size(1) == mat2.size(0). `result` must be a contiguous 2-D at::kInt
// tensor of shape {self.size(0), mat2.size(1)}. Any violated precondition
// raises through TORCH_CHECK. Returns `result`.
Tensor& _int_mm_out_xpu(
    const Tensor& self,
    const Tensor& mat2,
    Tensor& result) {
  TORCH_CHECK(
      self.dim() == 2,
      "Expected self to be of dimension 2 but got ",
      self.dim());
  TORCH_CHECK(
      mat2.dim() == 2,
      "Expected mat2 to be of dimension 2 but got ",
      mat2.dim());
  TORCH_CHECK(
      self.size(1) == mat2.size(0),
      "self.size(1) needs to match mat2.size(0) but got ",
      self.size(1),
      " and ",
      mat2.size(0));

  TORCH_CHECK(
      self.dtype() == at::kChar,
      "Expected self dtype to be of type int8 but got ",
      self.dtype());
  TORCH_CHECK(
      mat2.dtype() == at::kChar,
      "Expected mat2 dtype to be of type int8 but got ",
      mat2.dtype());
  TORCH_CHECK(
      result.dtype() == at::kInt,
      "Expected result dtype to be of type kInt but got ",
      result.dtype());

  // Check the rank of `result` before indexing its sizes. Otherwise a
  // non-2-D result fails inside result.size(...) with a generic
  // out-of-range dimension error and this message is never reached.
  TORCH_CHECK(
      result.dim() == 2,
      "Expected result to be of dimension 2 but got ",
      result.dim());
  TORCH_CHECK(
      result.size(0) == self.size(0),
      "Expected result.size(0) to be ",
      self.size(0),
      " but got ",
      result.size(0));
  TORCH_CHECK(
      result.size(1) == mat2.size(1),
      "Expected result.size(1) to be ",
      mat2.size(1),
      " but got ",
      result.size(1));

  TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");

  // Nothing to multiply: either the output is empty or the contraction
  // dimension is zero, so the result is all zeros.
  if (result.numel() == 0 || self.size(1) == 0) {
    return result.zero_();
  }

  // No bias and no zero points; mat2 gets a single float scale of 1. The
  // bare 1.0 / 0 literals below are presumably the scale / zero-point pairs
  // for the input and the output -- confirm against quantized_matmul.
  Tensor bias = at::Tensor();
  Tensor mat2_scales = at::ones({1}, mat2.options().dtype(at::kFloat));
  Tensor mat2_zero_points = at::Tensor();
  auto post_op_args = torch::List<std::optional<at::Scalar>>();

  at::native::onednn::quantized_matmul(
      self.contiguous(),
      1.0,
      0,
      mat2.contiguous(),
      mat2_scales,
      mat2_zero_points,
      bias,
      result,
      1.0,
      0,
      result.scalar_type(),
      /*other*/ std::nullopt,
      /*other scale*/ 1.0,
      /*other zp*/ 0,
      /*binary post op*/ "none",
      /*binary alpha*/ 1.0,
      /*post_op_name*/ "none",
      post_op_args,
      /*post_op_algorithm*/ "none",
      /*m2_trans*/ true);
  return result;
}
// Allocating variant of _int_mm_out_xpu: creates an at::kInt tensor of shape
// {self.size(0), mat2.size(1)} with self's options and fills it through the
// out-variant, which also performs all argument validation.
Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
  const auto out_options = self.options().dtype(at::kInt);
  Tensor out = at::empty({self.size(0), mat2.size(1)}, out_options);
  _int_mm_out_xpu(self, mat2, out);
  return out;
}
} // namespace at::native

View File

@ -953,8 +953,7 @@ class BundledShaderLibary : public MetalShaderLibrary {
if (C10_UNLIKELY(!library)) {
auto device = MPSDevice::getInstance()->device();
NSError* error = nil;
auto section_name = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? "metal_bfloat" : "metal_basic";
library = [device newLibraryWithData:getSectionData(section_name) error:&error];
library = [device newLibraryWithData:getSectionData("metal_basic") error:&error];
TORCH_CHECK(library, "Failed to create metal library, error: ", [[error description] UTF8String]);
}
return library;

View File

@ -33,21 +33,15 @@ struct shrink_backward_functor {
REGISTER_UNARY_ALPHA_OP(hardshrink, float, float, float);
REGISTER_UNARY_ALPHA_OP(hardshrink, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(hardshrink, bfloat, bfloat, bfloat);
#endif
REGISTER_UNARY_ALPHA_OP(softshrink, float, float, float);
REGISTER_UNARY_ALPHA_OP(softshrink, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(softshrink, bfloat, bfloat, bfloat);
#endif
REGISTER_BINARY_ALPHA_OP(shrink_backward, float, float, float);
REGISTER_BINARY_ALPHA_OP(shrink_backward, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(shrink_backward, bfloat, bfloat, bfloat);
#endif
struct hardsigmoid_functor {
template <typename T>
@ -67,15 +61,11 @@ struct hardsigmoid_backward_functor {
REGISTER_UNARY_OP(hardsigmoid, float, float);
REGISTER_UNARY_OP(hardsigmoid, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_OP(hardsigmoid, bfloat, bfloat);
#endif
REGISTER_BINARY_OP(hardsigmoid_backward, float, float);
REGISTER_BINARY_OP(hardsigmoid_backward, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(hardsigmoid_backward, bfloat, bfloat);
#endif
struct hardswish_functor {
template <typename T>
@ -103,15 +93,11 @@ struct hardswish_backward_functor {
REGISTER_UNARY_OP(hardswish, float, float);
REGISTER_UNARY_OP(hardswish, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_OP(hardswish, bfloat, bfloat);
#endif
REGISTER_BINARY_OP(hardswish_backward, float, float);
REGISTER_BINARY_OP(hardswish_backward, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);
#endif
struct leaky_relu_functor {
template <typename T>
@ -135,12 +121,8 @@ struct leaky_relu_backward_functor {
REGISTER_UNARY_ALPHA_OP(leaky_relu, float, float, float);
REGISTER_UNARY_ALPHA_OP(leaky_relu, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(leaky_relu, bfloat, bfloat, bfloat);
#endif
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, float, float, float);
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, bfloat, bfloat, bfloat);
#endif

View File

@ -113,18 +113,12 @@ kernel void ampUpdateScale(
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(float);
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(bfloat);
#endif
INSTANTIATE_AMP_UPDATE_SCALE(float);
INSTANTIATE_AMP_UPDATE_SCALE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_UPDATE_SCALE(bfloat);
#endif
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(float);
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(bfloat);
#endif

View File

@ -590,9 +590,7 @@ kernel void attention(
INSTANTIATE_SDPA_VECTOR_HEADS(float);
INSTANTIATE_SDPA_VECTOR_HEADS(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);
#endif
#define INSTANTIATE_ATTN(DTYPE, bq, bk, bd, wm, wn) \
template [[host_name("attention_" #DTYPE "_bq" #bq "_bk" #bk "_bd" #bd \
@ -621,6 +619,4 @@ INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);
INSTANTIATE_ATTN_SHAPES_HELPER(float);
INSTANTIATE_ATTN_SHAPES_HELPER(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_ATTN_SHAPES_HELPER(bfloat);
#endif

View File

@ -209,38 +209,9 @@ struct hermite_polynomial_he_functor {
};
struct nextafter_functor {
#if __METAL_VERSION__ < 310
template <typename U>
struct bit_type {};
template <>
struct bit_type<float> {
using type = int;
};
template <>
struct bit_type<half> {
using type = short;
};
#endif
template <typename T>
inline T operator()(const T a, const T b) {
#if __METAL_VERSION__ >= 310
return static_cast<T>(::metal::nextafter(a, b));
#else
using U = typename bit_type<T>::type;
if (a == b) {
return a;
}
if (::metal::isunordered(a, b)) {
return NAN;
}
if (a == 0) {
constexpr auto eps = as_type<T>(static_cast<U>(1));
return b > 0 ? eps : -eps;
}
auto bits = as_type<U>(a);
(a > 0) ^ (a > b) ? bits++ : bits--;
return as_type<T>(bits);
#endif
}
};
@ -344,13 +315,6 @@ struct fmod_functor {
}
};
// Some helper defines
#if __METAL_VERSION__ >= 310
#define _METAL_310_PLUS(x) x
#else
#define _METAL_310_PLUS(x)
#endif
#define REGISTER_INTEGER_BINARY_OP(NAME) \
REGISTER_BINARY_OP(NAME, long, long); \
REGISTER_BINARY_OP(NAME, int, int); \
@ -370,12 +334,12 @@ struct fmod_functor {
#define REGISTER_FLOAT_BINARY_OP(NAME) \
REGISTER_BINARY_OP(NAME, float, float); \
REGISTER_BINARY_OP(NAME, half, half); \
_METAL_310_PLUS(REGISTER_BINARY_OP(NAME, bfloat, bfloat))
REGISTER_BINARY_OP(NAME, bfloat, bfloat)
#define REGISTER_OPMATH_FLOAT_BINARY_OP(NAME) \
REGISTER_OPMATH_BINARY_OP(NAME, float, float); \
REGISTER_OPMATH_BINARY_OP(NAME, half, half); \
_METAL_310_PLUS(REGISTER_OPMATH_BINARY_OP(NAME, bfloat, bfloat))
REGISTER_OPMATH_BINARY_OP(NAME, bfloat, bfloat)
REGISTER_FLOAT_BINARY_OP(copysign);
REGISTER_INT2FLOAT_BINARY_OP(copysign);
@ -447,11 +411,9 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat);
REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat);
#endif
// Complex binary functions
REGISTER_BINARY_OP(polar, float, float2);

View File

@ -180,10 +180,8 @@ REGISTER_SEARCHSORTED_OP(float, int);
REGISTER_SEARCHSORTED_OP(float, long);
REGISTER_SEARCHSORTED_OP(half, int);
REGISTER_SEARCHSORTED_OP(half, long);
#if __METAL_VERSION__ >= 310
REGISTER_SEARCHSORTED_OP(bfloat, int);
REGISTER_SEARCHSORTED_OP(bfloat, long);
#endif
REGISTER_SEARCHSORTED_OP(char, int);
REGISTER_SEARCHSORTED_OP(char, long);
REGISTER_SEARCHSORTED_OP(uchar, int);

View File

@ -96,6 +96,4 @@ kernel void col2im_kernel(
INSTANTIATE_COL2IM(bool);
INSTANTIATE_COL2IM(float);
INSTANTIATE_COL2IM(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_COL2IM(bfloat);
#endif

View File

@ -20,9 +20,7 @@ REGISTER_CROSS_FUNC(short);
REGISTER_CROSS_FUNC(char);
REGISTER_CROSS_FUNC(uchar);
REGISTER_CROSS_FUNC(bool);
#if __METAL_VERSION__ >= 310
REGISTER_CROSS_FUNC(bfloat);
#endif
template <typename T, typename U>
kernel void cross(
@ -68,6 +66,4 @@ REGISTER_CROSS_OP(short);
REGISTER_CROSS_OP(char);
REGISTER_CROSS_OP(uchar);
REGISTER_CROSS_OP(bool);
#if __METAL_VERSION__ >= 310
REGISTER_CROSS_OP(bfloat);
#endif

View File

@ -1,11 +1,9 @@
#include <metal_stdlib>
using metal::max;
#if __METAL_VERSION__ >= 310
bfloat max(bfloat a, bfloat b) {
return a > b ? a : b;
}
#endif
#define kmaxThreadGroups 32
#define kmaxTensors 32
@ -306,11 +304,9 @@ REGISTER_ADAM_OPS_QUART(float, float);
REGISTER_ADAM_OPS_QUART(float, half);
REGISTER_ADAM_OPS_QUART(half, float);
REGISTER_ADAM_OPS_QUART(half, half);
#if __METAL_VERSION__ >= 310
REGISTER_ADAM_OPS_QUART(float, bfloat);
REGISTER_ADAM_OPS_QUART(bfloat, bfloat);
REGISTER_ADAM_OPS_QUART(bfloat, float);
#endif
template <typename T>
inline void sgd_momentum_math(
@ -460,7 +456,5 @@ REGISTER_FUSED_SGD_OP(float);
REGISTER_FUSED_SGD_OP(half);
REGISTER_FUSED_SGD_MOMENTUM_OP(float);
REGISTER_FUSED_SGD_MOMENTUM_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_FUSED_SGD_OP(bfloat);
REGISTER_FUSED_SGD_MOMENTUM_OP(bfloat);
#endif

View File

@ -106,9 +106,7 @@ kernel void polygamma(
constant int64_t& order [[buffer(2)]], \
uint id [[thread_position_in_grid]]);
#if __METAL_VERSION__ >= 310
INSTANTIATE_GAMMA_KERNELS(bfloat, bfloat);
#endif
INSTANTIATE_GAMMA_KERNELS(half, half);
INSTANTIATE_GAMMA_KERNELS(float, float);
INSTANTIATE_GAMMA_KERNELS(bool, float);

View File

@ -76,6 +76,4 @@ INSTANTIATE_IM2COL(float);
INSTANTIATE_IM2COL(float2);
INSTANTIATE_IM2COL(half);
INSTANTIATE_IM2COL(half2);
#if __METAL_VERSION__ >= 310
INSTANTIATE_IM2COL(bfloat);
#endif

View File

@ -240,9 +240,7 @@ REGISTER_INDEX_OP(put_accumulate, short, short);
REGISTER_INDEX_OP(put_accumulate, char, char);
REGISTER_INDEX_OP(put_accumulate, uchar, uchar);
REGISTER_INDEX_OP(put_accumulate, bool, bool);
#if __METAL_VERSION__ >= 310
REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat);
#endif
template <typename StridesT, typename DataT>
kernel void kernel_index_offsets(
@ -477,10 +475,8 @@ INSTANTIATE_INDEX_COPY(char, long);
INSTANTIATE_INDEX_COPY(uchar, int);
INSTANTIATE_INDEX_COPY(uchar, long);
#if __METAL_VERSION__ >= 310
INSTANTIATE_INDEX_COPY(bfloat, int);
INSTANTIATE_INDEX_COPY(bfloat, long);
#endif
INSTANTIATE_INDEX_COPY(float2, int);
INSTANTIATE_INDEX_COPY(float2, long);
INSTANTIATE_INDEX_COPY(half2, int);

View File

@ -288,7 +288,6 @@ kernel void layer_norm_looped(
#define instantiate_layer_norm(DTYPE) \
instantiate_layer_norm_single_row(DTYPE) instantiate_layer_norm_looped(DTYPE)
instantiate_layer_norm(float) instantiate_layer_norm(half)
#if __METAL_VERSION__ >= 310
instantiate_layer_norm(bfloat)
#endif
instantiate_layer_norm(float);
instantiate_layer_norm(half);
instantiate_layer_norm(bfloat);

View File

@ -635,9 +635,7 @@ kernel void applyPivots(
INSTANTIATE_NAIVE_MM(float);
INSTANTIATE_NAIVE_MM(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_NAIVE_MM(bfloat);
#endif
// Integral MM
INSTANTIATE_NAIVE_MM(short);

View File

@ -48,3 +48,14 @@ struct PoolingBackwardParams {
::c10::metal::array<idx_type_t, N> grad_output_strides;
::c10::metal::array<idx_type_t, N> indices_strides;
};
// Parameter block for the max-unpooling kernel (bound there as
// `constant MaxUnpoolingParams<5>&`). N is the capacity of every per-dimension
// array; idx_type_t is the element type used for sizes and strides.
template <unsigned N = 5, typename idx_type_t = int32_t>
struct MaxUnpoolingParams {
// Total number of tensor dimensions described by the arrays below.
int32_t dims;
// Number of pooling dimensions; the kernel treats the remaining
// `dims - pooling_dims` dimensions as leading dimensions.
int32_t pooling_dims;
// Per-dimension sizes and strides of the unpooling input.
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
// Per-dimension sizes and strides of the unpooling output.
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
// Per-dimension strides of the indices tensor.
::c10::metal::array<idx_type_t, N> indices_strides;
};

View File

@ -168,6 +168,16 @@ PoolOffsets find_pool_offsets(
leading_dims,
return_indices,
tid);
case 3:
return find_pool_offsets_dim_specific<3>(
output_sizes,
output_strides,
indices_strides,
input_strides,
pooling_dim_indices,
leading_dims,
return_indices,
tid);
}
return PoolOffsets();
}
@ -292,6 +302,68 @@ kernel void max_pool_backward(
pooling_dims);
}
// Writes `input_element` into `output` at the position named by the flat
// index `input_index`. The index is decomposed into one coordinate per
// pooling dimension using `output_sizes`, innermost dimension first, and each
// coordinate is scaled by the matching entry of `output_strides`.
template <typename T>
void max_unpool_impl(
    device T* output,
    T input_element,
    int32_t input_index,
    constant int32_t* output_sizes,
    constant int32_t* output_strides,
    int32_t pooling_dims) {
  // Product of the sizes of all dimensions inner to the current one.
  int32_t inner_extent = 1;
  int32_t dst_offset = 0;

  int32_t dim = pooling_dims;
  while (dim-- > 0) {
    const int32_t outer_extent = output_sizes[dim] * inner_extent;
    // Coordinate of the flat index along `dim`.
    const int32_t coord = (input_index % outer_extent) / inner_extent;
    dst_offset += output_strides[dim] * coord;
    inner_extent = outer_extent;
  }

  output[dst_offset] = input_element;
}
// Each thread handles one input element: it looks up the flat index stored for
// that element in `indices` and writes the element to the corresponding
// position of `output`.
template <typename T>
kernel void max_unpool(
    device T* output [[buffer(0)]],
    constant T* input [[buffer(1)]],
    constant int64_t* indices [[buffer(2)]],
    constant MaxUnpoolingParams<5>& params [[buffer(3)]],
    uint tid [[thread_position_in_grid]]) {
  const auto total_dims = params.dims;
  const auto num_pooling_dims = params.pooling_dims;
  const auto num_leading_dims = total_dims - num_pooling_dims;

  auto out_sizes = params.output_sizes.data();
  auto out_strides = params.output_strides.data();

  // NOTE: This is unpooling, so "input" and "output" play the opposite roles
  // from the pooling kernels. `find_pool_offsets` is written in pooling terms,
  // hence our input is passed as its "output" and our output as its "input".
  PoolOffsets offsets = find_pool_offsets(
      /*output_sizes=*/params.input_sizes.data(),
      /*output_strides=*/params.input_strides.data(),
      params.indices_strides.data(),
      /*input_strides=*/out_strides,
      /*pooling_dim_indices=*/nullptr,
      total_dims,
      num_leading_dims,
      /*return_indices=*/true,
      tid);

  // Skip past the leading dimensions; the stored index only addresses the
  // pooling dimensions of the output.
  max_unpool_impl<T>(
      output + offsets.input_leading,
      input[offsets.output],
      indices[offsets.indices],
      out_sizes + num_leading_dims,
      out_strides + num_leading_dims,
      num_pooling_dims);
}
template <typename T>
struct AvgPoolIterBounds {
T start;
@ -428,18 +500,25 @@ kernel void avg_pool(
params.divisor_override);
}
#define REGISTER_POOL_OP(DTYPE) \
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant PoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("avg_pool_" #DTYPE)]] kernel void avg_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant AvgPoolingParams<5> & params [[buffer(2)]], \
#define REGISTER_POOL_OP(DTYPE) \
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant PoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("max_unpool_" #DTYPE)]] kernel void max_unpool<DTYPE>( \
device DTYPE * output [[buffer(0)]], \
constant DTYPE * input [[buffer(1)]], \
constant int64_t* indices [[buffer(2)]], \
constant MaxUnpoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("avg_pool_" #DTYPE)]] kernel void avg_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant AvgPoolingParams<5> & params [[buffer(2)]], \
uint tid [[thread_position_in_grid]]);
#define REGISTER_MAX_POOL_BACKWARD_OP(DTYPE) \
@ -453,6 +532,7 @@ kernel void avg_pool(
REGISTER_POOL_OP(float);
REGISTER_POOL_OP(half);
REGISTER_POOL_OP(bfloat);
REGISTER_POOL_OP(int);
REGISTER_POOL_OP(long);
REGISTER_POOL_OP(short);
@ -462,8 +542,4 @@ REGISTER_POOL_OP(bool);
REGISTER_MAX_POOL_BACKWARD_OP(float);
REGISTER_MAX_POOL_BACKWARD_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_POOL_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(bfloat);
#endif

View File

@ -197,12 +197,10 @@ INSTANTIATE_INT4MV(float, 128);
INSTANTIATE_INT4MV(half, 128);
INSTANTIATE_INT4MV(float, 256);
INSTANTIATE_INT4MV(half, 256);
#if __METAL_VERSION__ >= 310
INSTANTIATE_INT4MV(bfloat, 32);
INSTANTIATE_INT4MV(bfloat, 64);
INSTANTIATE_INT4MV(bfloat, 128);
INSTANTIATE_INT4MV(bfloat, 256);
#endif
// ------------------------------ int8 MM For M >= 12 ------------------------------------
/**
@ -234,12 +232,10 @@ template <> struct BlockType<half> {
using simdgroup_type8x8 = simdgroup_half8x8;
using type4 = half4;
};
#if __METAL_VERSION__ >= 310
template <> struct BlockType<bfloat> {
using simdgroup_type8x8 = simdgroup_bfloat8x8;
using type4 = bfloat4;
};
#endif
template<typename T>
float2 get_scale_zero_q8(constant T * scalesAndZeros, uint2 index) {
@ -490,9 +486,7 @@ kernel void kernel_mul_mm<DTYPE, WDTYPE, DEQUANT_FUNC>( \
INSTANTIATE_MM(float, char, get_scale_zero_q8);
INSTANTIATE_MM(half, char, get_scale_zero_q8);
#if __METAL_VERSION__ >= 310
INSTANTIATE_MM(bfloat, char, get_scale_zero_q8);
#endif
// ------------------------------ int8 MM For M < 12 ------------------------------------
/* Matrix vector multiplication, used for small M size for matrix multiplication as well.
@ -646,6 +640,4 @@ kernel void kernel_mul_mv<DTYPE>(
INSTANTIATE_MV(float);
INSTANTIATE_MV(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_MV(bfloat);
#endif

View File

@ -192,6 +192,4 @@ template <typename T>
instantiate_rms(float)
instantiate_rms(half)
#if __METAL_VERSION__ >= 310
instantiate_rms(bfloat)
#endif // clang-format on

View File

@ -23,6 +23,4 @@ kernel void renorm(
REGISTER_RENORM_OP(float);
REGISTER_RENORM_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_RENORM_OP(bfloat);
#endif

View File

@ -25,379 +25,6 @@ struct LogAddExp {
};
};
#if __METAL_VERSION__ < 310
template <typename T, typename acc_t = accum_t<T>>
struct CumMinOp {
static acc_t apply(acc_t a, acc_t b) {
return metal::min(a, b);
}
static acc_t identity() {
return static_cast<acc_t>(
metal::is_floating_point_v<T> ? metal::numeric_limits<T>::infinity()
: metal::numeric_limits<T>::max());
}
};
template <typename T, typename acc_t = accum_t<T>>
struct CumMaxOp {
static acc_t apply(acc_t a, acc_t b) {
return metal::max(a, b);
}
static acc_t identity() {
return static_cast<acc_t>(
metal::is_floating_point_v<T> ? -metal::numeric_limits<T>::infinity()
: metal::numeric_limits<T>::lowest());
}
};
template <typename T, typename acc_t = accum_t<T>>
struct LogCumSumExpOp {
static acc_t apply(acc_t x, acc_t y) {
return LogAddExp{}(x, y);
}
static acc_t identity() {
return -metal::numeric_limits<acc_t>::infinity();
}
};
// Inclusive scan along innermost dimension for contiguous tensors
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_contiguous_innermost_dim(
constant T* input [[buffer(0)]],
device T* output [[buffer(1)]],
constant uint& num_rows [[buffer(2)]],
constant uint& row_size [[buffer(3)]],
uint row [[thread_position_in_grid]]) {
if (row >= num_rows)
return;
const uint offset = row * row_size;
acc_t accumulator = Op::identity();
for (uint col = 0; col < row_size; col++) {
T val = input[offset + col];
acc_t accum_val = static_cast<acc_t>(val);
accumulator = Op::apply(accumulator, accum_val);
output[offset + col] = static_cast<T>(accumulator);
}
}
// Inclusive scan along outer dimension for contiguous tensors.
// The flat thread index is split into (orow, irow); the thread then scans
// `row_size` elements that are `num_irows` apart in memory.
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_contiguous_outer_dim(
    constant T* input [[buffer(0)]],
    device T* output [[buffer(1)]],
    constant uint& num_orows [[buffer(2)]],
    constant uint& num_irows [[buffer(3)]],
    constant uint& row_size [[buffer(4)]],
    uint thread_index [[thread_position_in_grid]]) {
  const uint orow = thread_index / num_irows;
  const uint irow = thread_index % num_irows;
  if (orow < num_orows) {
    acc_t running = Op::identity();
    // First element of this thread's lane; later ones follow every num_irows.
    uint idx = orow * row_size * num_irows + irow;
    for (uint step = 0; step < row_size; ++step) {
      running = Op::apply(running, static_cast<acc_t>(input[idx]));
      output[idx] = static_cast<T>(running);
      idx += num_irows;
    }
  }
}
// Inclusive scan with indices along innermost dimension for contiguous tensors.
// Each thread owns one row; for every column it stores the running result in
// `values` and the column at which that result was taken in `indices`.
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_with_indices_contiguous_innermost_dim(
    constant T* input [[buffer(0)]],
    device T* values [[buffer(1)]],
    device int64_t* indices [[buffer(2)]],
    constant uint& num_rows [[buffer(3)]],
    constant uint& row_size [[buffer(4)]],
    uint row [[thread_position_in_grid]]) {
  if (row >= num_rows) {
    return;
  }
  const uint row_start = row * row_size;
  acc_t running = Op::identity();
  int64_t running_idx = 0;
  for (uint col = 0; col < row_size; ++col) {
    const uint pos = row_start + col;
    const acc_t candidate = static_cast<acc_t>(input[pos]);
    // The first element always seeds the scan. Afterwards the candidate
    // replaces the running value whenever Op::apply selects it, so a tie
    // moves the recorded index forward to the later column.
    const bool take =
        (col == 0) || (Op::apply(candidate, running) == candidate);
    if (take) {
      running = candidate;
      running_idx = col;
    }
    values[pos] = static_cast<T>(running);
    indices[pos] = running_idx;
  }
}
// Inclusive scan with indices along outer dimension for contiguous tensors.
// The flat thread index is split into (orow, irow); the thread scans
// `row_size` elements spaced `num_irows` apart and records, per element, the
// running result and the scan position it came from.
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_with_indices_contiguous_outer_dim(
    constant T* input [[buffer(0)]],
    device T* values [[buffer(1)]],
    device int64_t* indices [[buffer(2)]],
    constant uint& num_orows [[buffer(3)]],
    constant uint& num_irows [[buffer(4)]],
    constant uint& row_size [[buffer(5)]],
    uint thread_index [[thread_position_in_grid]]) {
  const uint orow = thread_index / num_irows;
  const uint irow = thread_index % num_irows;
  if (orow < num_orows) {
    acc_t running = Op::identity();
    int64_t running_idx = 0;
    uint idx = orow * row_size * num_irows + irow;
    for (uint col = 0; col < row_size; ++col) {
      const acc_t candidate = static_cast<acc_t>(input[idx]);
      // First element seeds the scan; later elements win when Op::apply
      // selects them (ties therefore advance the recorded index).
      const bool take =
          (col == 0) || (Op::apply(candidate, running) == candidate);
      if (take) {
        running = candidate;
        running_idx = col;
      }
      values[idx] = static_cast<T>(running);
      indices[idx] = running_idx;
      idx += num_irows;
    }
  }
}
// Shared utility functions for strided kernels

// Returns the product of all entries of `sizes` except the one at `scan_dim`,
// i.e. the number of independent scan lanes.
inline long calculate_non_scan_elements(
    constant long* sizes,
    uint ndim,
    uint scan_dim) {
  long count = 1;
  for (uint d = 0; d < ndim; ++d) {
    if (d == scan_dim) {
      continue;
    }
    count *= sizes[d];
  }
  return count;
}
// Decomposes a flat lane index into per-dimension coordinates, treating
// dimension 0 as the fastest-varying one and skipping `scan_dim`, whose
// coordinate is set to 0. Results are written to pos[0..ndim).
inline void thread_index_to_coordinates(
    uint index,
    int pos[c10::metal::max_ndim],
    constant long* sizes,
    uint ndim,
    uint scan_dim) {
  long rest = index;
  for (uint d = 0; d < ndim; ++d) {
    if (d == scan_dim) {
      pos[d] = 0;
      continue;
    }
    const long extent = sizes[d];
    pos[d] = rest % extent;
    rest /= extent;
  }
}
// Returns the element offset of coordinate `pos` under `strides`, leaving
// out the contribution of `scan_dim` (the caller adds that per scan step).
inline long calculate_base_offset(
    int pos[c10::metal::max_ndim],
    constant long* strides,
    uint ndim,
    uint scan_dim) {
  long base = 0;
  for (uint d = 0; d < ndim; ++d) {
    if (d == scan_dim) {
      continue;
    }
    base += pos[d] * strides[d];
  }
  return base;
}
// Generic strided scan kernel.
// Each thread handles one lane: a fixed coordinate in every dimension except
// `scan_dim`. It walks that lane using the input/output strides of
// `scan_dim`, writing the running result of `Op` at every step.
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_strided(
    constant T* input [[buffer(0)]],
    device T* output [[buffer(1)]],
    constant long* sizes [[buffer(2)]],
    constant long* input_strides [[buffer(3)]],
    constant long* output_strides [[buffer(4)]],
    constant uint& ndim [[buffer(5)]],
    constant uint& scan_dim [[buffer(6)]],
    uint thread_index [[thread_position_in_grid]]) {
  const long lane_count = calculate_non_scan_elements(sizes, ndim, scan_dim);
  if (thread_index >= lane_count) {
    return;
  }

  int coords[c10::metal::max_ndim];
  thread_index_to_coordinates(thread_index, coords, sizes, ndim, scan_dim);

  // Offsets of the lane's first element; advanced by one stride per step.
  long src = calculate_base_offset(coords, input_strides, ndim, scan_dim);
  long dst = calculate_base_offset(coords, output_strides, ndim, scan_dim);

  const long length = sizes[scan_dim];
  const long src_step = input_strides[scan_dim];
  const long dst_step = output_strides[scan_dim];

  acc_t running = Op::identity();
  for (long k = 0; k < length; ++k, src += src_step, dst += dst_step) {
    running = Op::apply(running, static_cast<acc_t>(input[src]));
    output[dst] = static_cast<T>(running);
  }
}
// Generic strided scan with indices kernel.
// Each thread handles one lane (fixed coordinates outside `scan_dim`) and
// writes, for every step along `scan_dim`, the running result to `values`
// and the step at which it was taken to `indices`. Input, values and indices
// each use their own strides.
template <typename T, typename Op, typename acc_t = accum_t<T>>
kernel void scan_with_indices_strided(
    constant T* input [[buffer(0)]],
    device T* values [[buffer(1)]],
    device int64_t* indices [[buffer(2)]],
    constant long* sizes [[buffer(3)]],
    constant long* input_strides [[buffer(4)]],
    constant long* values_strides [[buffer(5)]],
    constant long* indices_strides [[buffer(6)]],
    constant uint& ndim [[buffer(7)]],
    constant uint& scan_dim [[buffer(8)]],
    uint thread_index [[thread_position_in_grid]]) {
  const long lane_count = calculate_non_scan_elements(sizes, ndim, scan_dim);
  if (thread_index >= lane_count) {
    return;
  }

  int coords[c10::metal::max_ndim];
  thread_index_to_coordinates(thread_index, coords, sizes, ndim, scan_dim);

  // Offsets of the lane's first element in each buffer; advanced per step.
  long src = calculate_base_offset(coords, input_strides, ndim, scan_dim);
  long val_dst = calculate_base_offset(coords, values_strides, ndim, scan_dim);
  long idx_dst = calculate_base_offset(coords, indices_strides, ndim, scan_dim);

  const long length = sizes[scan_dim];
  const long src_step = input_strides[scan_dim];
  const long val_step = values_strides[scan_dim];
  const long idx_step = indices_strides[scan_dim];

  acc_t running = Op::identity();
  int64_t running_idx = 0;
  for (long k = 0; k < length; ++k) {
    const acc_t candidate = static_cast<acc_t>(input[src]);
    // First element seeds the scan; later elements win when Op::apply
    // selects them (ties therefore advance the recorded index).
    const bool take = (k == 0) || (Op::apply(candidate, running) == candidate);
    if (take) {
      running = candidate;
      running_idx = k;
    }
    values[val_dst] = static_cast<T>(running);
    indices[idx_dst] = running_idx;

    src += src_step;
    val_dst += val_step;
    idx_dst += idx_step;
  }
}
// Explicitly instantiates the three value-only scan kernels (contiguous
// innermost, contiguous outer, strided) for one operation/dtype pair.
// The host-visible kernel names are "<OP_NAME>_contiguous_innermost_<DTYPE>",
// "<OP_NAME>_contiguous_outer_<DTYPE>" and "<OP_NAME>_strided_<DTYPE>".
#define REGISTER_SCAN_OP(OP_NAME, OP_CLASS, DTYPE) \
template [[host_name(#OP_NAME "_contiguous_innermost_" #DTYPE)]] kernel void \
scan_contiguous_innermost_dim<DTYPE, OP_CLASS<DTYPE>>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant uint & num_rows [[buffer(2)]], \
constant uint & row_size [[buffer(3)]], \
uint row [[thread_position_in_grid]]); \
\
template [[host_name(#OP_NAME "_contiguous_outer_" #DTYPE)]] kernel void \
scan_contiguous_outer_dim<DTYPE, OP_CLASS<DTYPE>>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant uint & num_orows [[buffer(2)]], \
constant uint & num_irows [[buffer(3)]], \
constant uint & row_size [[buffer(4)]], \
uint thread_index [[thread_position_in_grid]]); \
\
template [[host_name(#OP_NAME "_strided_" #DTYPE)]] kernel void \
scan_strided<DTYPE, OP_CLASS<DTYPE>>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant long* sizes [[buffer(2)]], \
constant long* input_strides [[buffer(3)]], \
constant long* output_strides [[buffer(4)]], \
constant uint& ndim [[buffer(5)]], \
constant uint& scan_dim [[buffer(6)]], \
uint thread_index [[thread_position_in_grid]]);
// Explicitly instantiates the three scan-with-indices kernels (contiguous
// innermost, contiguous outer, strided) for one operation/dtype pair.
// The host-visible kernel names follow the same
// "<OP_NAME>_{contiguous_innermost,contiguous_outer,strided}_<DTYPE>" scheme
// as REGISTER_SCAN_OP, so an OP_NAME must be registered with only one of the
// two macros.
#define REGISTER_SCAN_WITH_INDICES_OP(OP_NAME, OP_CLASS, DTYPE) \
template [[host_name(#OP_NAME "_contiguous_innermost_" #DTYPE)]] kernel void \
scan_with_indices_contiguous_innermost_dim<DTYPE, OP_CLASS<DTYPE>>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * values [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant uint& num_rows [[buffer(3)]], \
constant uint& row_size [[buffer(4)]], \
uint row [[thread_position_in_grid]]); \
\
template [[host_name(#OP_NAME "_contiguous_outer_" #DTYPE)]] kernel void \
scan_with_indices_contiguous_outer_dim<DTYPE, OP_CLASS<DTYPE>>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * values [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant uint& num_orows [[buffer(3)]], \
constant uint& num_irows [[buffer(4)]], \
constant uint& row_size [[buffer(5)]], \
uint thread_index [[thread_position_in_grid]]); \
\
template [[host_name(#OP_NAME "_strided_" #DTYPE)]] kernel void \
scan_with_indices_strided<DTYPE, OP_CLASS<DTYPE>>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * values [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant long* sizes [[buffer(3)]], \
constant long* input_strides [[buffer(4)]], \
constant long* values_strides [[buffer(5)]], \
constant long* indices_strides [[buffer(6)]], \
constant uint& ndim [[buffer(7)]], \
constant uint& scan_dim [[buffer(8)]], \
uint thread_index [[thread_position_in_grid]]);
// Simple scan operations (values only): logcumsumexp is instantiated for the
// floating-point types float and half.
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, float);
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, half);
// Scan operations with indices: cummin and cummax are instantiated for the
// same set of floating-point, integer and bool types.
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, float);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, half);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, long);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, int);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, short);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, char);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, uchar);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, bool);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, float);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, half);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, long);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, int);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, short);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, char);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, uchar);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, bool);
#else // __METAL_VERSION__ >= 310
C10_METAL_CONSTEXPR auto simd_size = c10::metal::simdgroup_size;
// The remainder of this file contains cummin and cummax implementations adapted
@ -1159,5 +786,3 @@ REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, short, 4);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, char, 4);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, uchar, 4);
REGISTER_SCAN_WITH_INDICES_OP(cummax, CumMaxOp, bool, 4);
#endif

View File

@ -89,6 +89,4 @@ REGISTER_SPECIAL(short, float);
REGISTER_SPECIAL(int, float);
REGISTER_SPECIAL(long, float);
REGISTER_SPECIAL(half, half);
#if __METAL_VERSION__ >= 310
REGISTER_SPECIAL(bfloat, bfloat);
#endif

View File

@ -100,9 +100,7 @@ kernel void triul(
INSTANTIATE_TRIUL_KERNELS(float, int);
INSTANTIATE_TRIUL_KERNELS(half, int);
#if __METAL_VERSION__ >= 310
INSTANTIATE_TRIUL_KERNELS(bfloat, int);
#endif
INSTANTIATE_TRIUL_KERNELS(float2, int);
INSTANTIATE_TRIUL_KERNELS(half2, int);

View File

@ -556,11 +556,9 @@ REGISTER_UNARY_OP(abs, half, half);
REGISTER_UNARY_OP(acos, DTYPE1, DTYPE0); \
REGISTER_UNARY_OP(atan, DTYPE1, DTYPE0)
#if __METAL_VERSION__ >= 310
INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat);
REGISTER_UNARY_OP(neg, bfloat, bfloat);
REGISTER_UNARY_OP(abs, bfloat, bfloat);
#endif
INSTANTIATE_UNARY_KERNELS2(half, half);
INSTANTIATE_UNARY_KERNELS2(float, float);
INSTANTIATE_UNARY_KERNELS2(float, bool);
@ -600,6 +598,4 @@ INSTANTIATE_UNARY_KERNELS_VEC2(float);
REGISTER_UNARY_ALPHA_OP(round_decimals, float, long, float);
REGISTER_UNARY_ALPHA_OP(round_decimals, half, long, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(round_decimals, bfloat, long, bfloat);
#endif

View File

@ -70,6 +70,4 @@ kernel void unfold_backward(
INSTANTIATE_UNFOLD_BACKWARD(float);
INSTANTIATE_UNFOLD_BACKWARD(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_UNFOLD_BACKWARD(bfloat);
#endif

View File

@ -852,6 +852,4 @@ INSTANTIATE_UPSAMPLE_2D(bilinear2d, uchar);
INSTANTIATE_UPSAMPLE_3D(uchar);
INSTANTIATE_UPSAMPLE_ALL(float);
INSTANTIATE_UPSAMPLE_ALL(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_UPSAMPLE_ALL(bfloat);
#endif

View File

@ -21,6 +21,8 @@
#include <ATen/ops/max_pool2d_with_indices_native.h>
#include <ATen/ops/max_pool3d_with_indices_backward_native.h>
#include <ATen/ops/max_pool3d_with_indices_native.h>
#include <ATen/ops/max_unpool2d_native.h>
#include <ATen/ops/max_unpool3d_native.h>
#endif
namespace at::native {
@ -492,6 +494,60 @@ static void max_pool_with_indices_backward_out_mps_template(Tensor& grad_input,
});
}
// Shared implementation of max_unpool2d / max_unpool3d on MPS.
//
// Resizes `output` to the leading (non-pooled) sizes of `input` followed by
// `output_size_`, zero-fills it, and launches the "max_unpool_<dtype>" Metal
// kernel with one thread per input element.
//
//   input        values to scatter
//   indices      per-element destination indices; must match `input` in shape
//   output_size_ sizes of the `pooling_dims` trailing output dimensions
//   stride       not read by this implementation (kept for signature parity)
//   padding      not read by this implementation (kept for signature parity)
//   output       destination tensor, resized and overwritten
//   pooling_dims number of trailing spatial dimensions (2 or 3 at the callers)
//   op_name      label used for profiling and error messages
static void max_unpool_out_mps_template(const Tensor& input,
                                        const Tensor& indices,
                                        IntArrayRef output_size_,
                                        IntArrayRef stride,
                                        IntArrayRef padding,
                                        Tensor& output,
                                        const int32_t pooling_dims,
                                        const std::string& op_name) {
  // NOTE(review): assumes the template argument of MaxUnpoolingParams is the
  // capacity of its per-dimension arrays - confirm against its definition.
  constexpr int32_t max_dims = 5;

  const auto dims = input.dim();
  const auto leading_dims = dims - pooling_dims;

  // Without these checks a too-small rank indexes `output_size` with a
  // negative offset and a too-large rank overruns the parameter arrays.
  TORCH_CHECK(leading_dims >= 0 && dims <= max_dims,
              op_name,
              ": expected input with between ",
              pooling_dims,
              " and ",
              max_dims,
              " dimensions, but got ",
              dims);
  TORCH_CHECK(static_cast<int64_t>(output_size_.size()) == pooling_dims,
              op_name,
              ": expected output_size to have ",
              pooling_dims,
              " elements, but got ",
              output_size_.size());
  // The pipeline is selected from the input dtype alone, so the kernel cannot
  // adapt to other index types; int64 matches the CPU/CUDA implementations.
  // NOTE(review): confirm against the Metal kernel signature.
  TORCH_CHECK(indices.scalar_type() == at::ScalarType::Long,
              op_name,
              ": expected indices to be an int64 tensor, but got ",
              indices.scalar_type());
  // Input sizes are paired with the indices strides below, so both tensors
  // must have the same shape.
  TORCH_CHECK(input.sizes() == indices.sizes(),
              op_name,
              ": expected input and indices to have the same shape, but got ",
              input.sizes(),
              " and ",
              indices.sizes());

  const auto memory_format = input.suggest_memory_format();
  std::vector<int64_t> output_size(dims);
  for (const auto dim : c10::irange(leading_dims)) {
    output_size[dim] = input.sizes()[dim];
  }
  for (const auto dim : c10::irange(pooling_dims)) {
    output_size[leading_dims + dim] = output_size_[dim];
  }

  output.resize_(output_size, memory_format);
  // Positions that no index points at must read as zero.
  output.fill_(0);

  const auto numThreads = input.numel();
  if (numThreads == 0) {
    // Nothing to scatter; avoid dispatching an empty grid.
    return;
  }

  MPSStream* mpsStream = getCurrentMPSStream();
  MaxUnpoolingParams<max_dims> params;

  params.dims = dims;
  params.pooling_dims = pooling_dims;

  for (const auto dim : c10::irange(dims)) {
    params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim));
    params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim));
    params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim));
    params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim));
    params.indices_strides[dim] = safe_downcast<int32_t, int64_t>(indices.stride(dim));
  }

  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
      auto PSO = lib.getPipelineStateForFunc("max_unpool_" + scalarToMetalTypeString(input));

      getMPSProfiler().beginProfileKernel(PSO, op_name, {input});
      [computeEncoder setComputePipelineState:PSO];
      mtl_setArgs(computeEncoder, output, input, indices, params);

      mtl_dispatch1DJob(computeEncoder, PSO, numThreads);
      getMPSProfiler().endProfileKernel(PSO);
    }
  });
}
static void avg_pool2d_template(const Tensor& input,
const Tensor& output,
const std::optional<Tensor>& grad_output_opt,
@ -896,6 +952,68 @@ Tensor max_pool3d_with_indices_backward_mps(const Tensor& grad_output,
return grad_input;
}
// Out-variant of max_unpool2d on MPS. Forwards to the shared template with
// two pooling dimensions and empty stride/padding, then returns `output`.
Tensor& max_unpooling2d_forward_out_mps(const Tensor& self,
const Tensor& indices,
IntArrayRef output_size,
Tensor& output) {
mps::max_unpool_out_mps_template(self,
indices,
output_size,
/*stride=*/{},
/*padding=*/{},
output,
/*pooling_dims=*/2,
"max_unpool2d");
return output;
}
// Functional variant of max_unpool2d on MPS. Allocates an empty placeholder
// with the options of `self`; the shared template resizes and fills it.
Tensor max_unpooling2d_forward_mps(const Tensor& self, const Tensor& indices, IntArrayRef output_size) {
  Tensor result = at::empty({0}, self.options());
  mps::max_unpool_out_mps_template(
      self, indices, output_size, /*stride=*/{}, /*padding=*/{}, result, /*pooling_dims=*/2, "max_unpool2d");
  return result;
}
// Out-variant of max_unpool3d on MPS. Forwards all arguments to the shared
// template with three pooling dimensions, then returns `output`.
Tensor& max_unpooling3d_forward_out_mps(const Tensor& self,
const Tensor& indices,
IntArrayRef output_size,
IntArrayRef stride,
IntArrayRef padding,
Tensor& output) {
mps::max_unpool_out_mps_template(self,
indices,
output_size,
stride,
padding,
output,
/*pooling_dims=*/3,
"max_unpool3d");
return output;
}
// Functional variant of max_unpool3d on MPS. Allocates an empty placeholder
// with the options of `self`; the shared template resizes and fills it.
Tensor max_unpooling3d_forward_mps(const Tensor& self,
                                   const Tensor& indices,
                                   IntArrayRef output_size,
                                   IntArrayRef stride,
                                   IntArrayRef padding) {
  Tensor result = at::empty({0}, self.options());
  mps::max_unpool_out_mps_template(
      self, indices, output_size, stride, padding, result, /*pooling_dims=*/3, "max_unpool3d");
  return result;
}
TORCH_IMPL_FUNC(avg_pool2d_out_mps)
(const Tensor& input,
int64_t kH,

View File

@ -719,6 +719,7 @@
dispatch:
CPU, CUDA: all_out
MPS: all_out_mps
MTIA: all_out_mtia
- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -808,6 +809,7 @@
CPU, Meta: arange_out
CUDA: arange_cuda_out
MPS: arange_mps_out
MTIA: arange_mtia_out
cpp_no_default_args: ['step']
# This function is a temporary hack to allow tracing of arange like constructs with dynamic
@ -1889,7 +1891,10 @@
- func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor)
dispatch:
CUDA: cudnn_batch_norm
autogen: cudnn_batch_norm.out
- func: cudnn_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
dispatch:
CUDA: cudnn_batch_norm_out
# NB: You can only use this if you used cudnn_batch_norm training=True
- func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)
@ -4182,11 +4187,13 @@
dispatch:
CPU: _int_mm_cpu
CUDA: _int_mm_cuda
XPU: _int_mm_xpu
- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU: _int_mm_out_cpu
CUDA: _int_mm_out_cuda
XPU: _int_mm_out_xpu
- func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
dispatch:
@ -7124,18 +7131,21 @@
dispatch:
CPU: _scaled_mm_cpu
CUDA: _scaled_mm_cuda
tags: needs_exact_strides
- func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CPU: _scaled_mm_out_cpu
CUDA: _scaled_mm_out_cuda
tags: needs_exact_strides
- func: _scaled_grouped_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? offs=None, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
variants: function
dispatch:
CUDA: _scaled_grouped_mm_cuda
tags: needs_exact_strides
- func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
variants: function
@ -10487,6 +10497,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_
CUDA: foreach_tensor_add_scalar_kernel_cuda_
MTIA: foreach_tensor_add_scalar_kernel_mtia_
autogen: _foreach_add.Scalar_out
- func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
@ -10495,6 +10506,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow
CUDA: foreach_tensor_add_list_kernel_cuda
MTIA: foreach_tensor_add_list_kernel_mtia
- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10502,6 +10514,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_
CUDA: foreach_tensor_add_list_kernel_cuda_
MTIA: foreach_tensor_add_list_kernel_mtia_
autogen: _foreach_add.List_out
- func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10532,6 +10545,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_
CUDA: foreach_tensor_add_tensor_kernel_cuda_
MTIA: foreach_tensor_add_tensor_kernel_mtia_
autogen: _foreach_add.Tensor_out
- func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
@ -10592,6 +10606,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_
CUDA: foreach_tensor_mul_scalar_kernel_cuda_
MTIA: foreach_tensor_mul_scalar_kernel_mtia_
autogen: _foreach_mul.Scalar_out
- func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
@ -10600,6 +10615,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow
CUDA: foreach_tensor_mul_list_kernel_cuda
MTIA: foreach_tensor_mul_list_kernel_mtia
- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10607,6 +10623,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_
CUDA: foreach_tensor_mul_list_kernel_cuda_
MTIA: foreach_tensor_mul_list_kernel_mtia_
autogen: _foreach_mul.List_out
- func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10630,6 +10647,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow
CUDA: foreach_tensor_mul_tensor_kernel_cuda
MTIA: foreach_tensor_mul_tensor_kernel_mtia
- func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10637,6 +10655,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_
CUDA: foreach_tensor_mul_tensor_kernel_cuda_
MTIA: foreach_tensor_mul_tensor_kernel_mtia_
autogen: _foreach_mul.Tensor_out
- func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
@ -10933,6 +10952,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow
CUDA: foreach_tensor_addcmul_scalar_cuda
MTIA: foreach_tensor_addcmul_scalar_mtia
- func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10954,6 +10974,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_
CUDA: foreach_tensor_addcmul_scalar_cuda_
MTIA: foreach_tensor_addcmul_scalar_mtia_
autogen: _foreach_addcmul.Scalar_out
- func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
@ -10978,6 +10999,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_abs_slow
CUDA: foreach_tensor_abs_cuda
MTIA: foreach_tensor_abs_mtia
- func: _foreach_abs_(Tensor(a!)[] self) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10985,6 +11007,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_abs_slow_
CUDA: foreach_tensor_abs_cuda_
MTIA: foreach_tensor_abs_mtia_
autogen: _foreach_abs.out
- func: _foreach_acos(Tensor[] self) -> Tensor[]
@ -11319,6 +11342,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_norm_slow
CUDA: foreach_tensor_norm_cuda
MTIA: foreach_tensor_norm_mtia
autogen: _foreach_norm.Scalar_out
- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
@ -11491,6 +11515,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_sqrt_slow_
CUDA: foreach_tensor_sqrt_cuda_
MTIA: foreach_tensor_sqrt_mtia_
autogen: _foreach_sqrt.out
- func: _foreach_tan(Tensor[] self) -> Tensor[]
@ -11552,6 +11577,7 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
CUDA: foreach_tensor_copy_list_kernel_cuda_
MTIA: foreach_tensor_copy_list_kernel_mtia_
autogen: _foreach_copy.out
- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
@ -11559,6 +11585,7 @@
variants: function
dispatch:
CompositeExplicitAutograd: _foreach_copy
MTIA: foreach_tensor_copy_list_kernel_mtia
- func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
dispatch:
@ -12476,24 +12503,28 @@
dispatch:
CPU: max_unpooling2d_forward_out_cpu
CUDA: max_unpooling2d_forward_out_cuda
MPS: max_unpooling2d_forward_out_mps
- func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
python_module: nn
dispatch:
CPU: max_unpooling2d_forward_cpu
CUDA: max_unpooling2d_forward_cuda
MPS: max_unpooling2d_forward_mps
- func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
dispatch:
CPU: max_unpooling3d_forward_out_cpu
CUDA: max_unpooling3d_forward_out_cuda
MPS: max_unpooling3d_forward_out_mps
- func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
python_module: nn
dispatch:
CPU: max_unpooling3d_forward_cpu
CUDA: max_unpooling3d_forward_cuda
MPS: max_unpooling3d_forward_mps
- func: reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn

View File

@ -1,7 +1,7 @@
# generate a list of kernels, but not actually emit files at config stage
execute_process(
COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
--api fwd --receipt 600 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_blob_list.txt
--api fwd --receipt 4 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_blob_list.txt
RESULT_VARIABLE ret
)
@ -11,7 +11,27 @@ endif()
execute_process(
COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
--api bwd --receipt 600 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt
--api fwd_splitkv --receipt 4 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_splitkv_blob_list.txt
RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD_SPLITKV kernels via Python.")
endif()
execute_process(
COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
--api fwd_appendkv --receipt 4 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/fwd_appendkv_blob_list.txt
RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD_APPENDKV kernels via Python.")
endif()
execute_process(
COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
--api bwd --receipt 4 --list_blobs ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt
RESULT_VARIABLE ret
)
@ -19,15 +39,29 @@ if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of BWD kernels via Python.")
endif()
# Generate the files for both fwd and bwd
execute_process(COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api fwd --receipt 600 --output_dir ${CMAKE_CURRENT_LIST_DIR}
# Generate the files for fwd, fwd_splitkv, fwd_appendkv, and bwd
# Generate the FWD kernel sources. RESULT_VARIABLE must be set here so that
# the check below tests this command instead of a stale result left in `ret`
# by an earlier execute_process call.
execute_process(
  COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
  --api fwd --receipt 4 --output_dir ${CMAKE_CURRENT_LIST_DIR}
  RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
  message( FATAL_ERROR "CK Tile FMHA FAILED to generate FWD kernels.")
endif()
execute_process(COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api bwd --receipt 600 --output_dir ${CMAKE_CURRENT_LIST_DIR}
# Generate the FWD_SPLITKV kernel sources. RESULT_VARIABLE must be set here so
# that the check below tests this command instead of a stale result left in
# `ret` by an earlier execute_process call.
execute_process(
  COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
  --api fwd_splitkv --receipt 4 --output_dir ${CMAKE_CURRENT_LIST_DIR}
  RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
  message( FATAL_ERROR "CK Tile FMHA FAILED to generate FWD_SPLITKV kernels.")
endif()
# Generate the FWD_APPENDKV kernel sources. RESULT_VARIABLE must be set here
# so that the check below tests this command instead of a stale result left
# in `ret` by an earlier execute_process call.
execute_process(
  COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py
  --api fwd_appendkv --receipt 4 --output_dir ${CMAKE_CURRENT_LIST_DIR}
  RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
  message( FATAL_ERROR "CK Tile FMHA FAILED to generate FWD_APPENDKV kernels.")
endif()
execute_process(COMMAND python3 ${CMAKE_SOURCE_DIR}/third_party/composable_kernel/example/ck_tile/01_fmha/generate.py --api bwd --receipt 4 --output_dir ${CMAKE_CURRENT_LIST_DIR}
RESULT_VARIABLE ret
)
@ -44,6 +78,22 @@ if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to change make_kernel to make_kernel_pt for the fwd pass")
endif()
# Change make_kernel to make_kernel_pt for fwd_splitkv: run
# add_make_kernel_pt.sh on the fwd_splitkv blob list and abort configuration
# if the script reports a non-zero result.
execute_process(
COMMAND bash -c "${CMAKE_CURRENT_LIST_DIR}/add_make_kernel_pt.sh ${CMAKE_CURRENT_LIST_DIR}/fwd_splitkv_blob_list.txt"
RESULT_VARIABLE ret)
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to change make_kernel to make_kernel_pt for the fwd_splitkv pass")
endif()
# Change make_kernel to make_kernel_pt for fwd_appendkv: run
# add_make_kernel_pt.sh on the fwd_appendkv blob list and abort configuration
# if the script reports a non-zero result.
execute_process(
COMMAND bash -c "${CMAKE_CURRENT_LIST_DIR}/add_make_kernel_pt.sh ${CMAKE_CURRENT_LIST_DIR}/fwd_appendkv_blob_list.txt"
RESULT_VARIABLE ret)
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to change make_kernel to make_kernel_pt for the fwd appendkv pass")
endif()
# Change make_kernel to make_kernel_pt for bwd
execute_process(
COMMAND bash -c "${CMAKE_CURRENT_LIST_DIR}/add_make_kernel_pt.sh ${CMAKE_CURRENT_LIST_DIR}/bwd_blob_list.txt"

View File

@ -21,6 +21,8 @@ while IFS= read -r file; do
if [ -f "$file" ]; then
# Use sed to replace "make_kernel" with "make_kernel_pt" in place
sed -i 's/make_kernel/make_kernel_pt/g' "$file"
sed -i 's/\#include \"fmha_fwd.hpp\"/\#include \"fmha_fwd.hpp\"\n\#include \"launch_kernel_pt.hpp\"/g' "$file"
sed -i 's/\#include \"fmha_bwd.hpp\"/\#include \"fmha_bwd.hpp\"\n\#include \"launch_kernel_pt.hpp\"/g' "$file"
echo "Updated: $file"
else
echo "Skipping: $file (not found)"

View File

@ -1,100 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ostream>
#include <string>
#include <ck_tile/core.hpp>
#include <ck_tile/ops/fmha.hpp>
// keep sync with BlockAttentionBiasEnum
enum class bias_enum
{
    no_bias          = 0,
    elementwise_bias = 1,
    alibi            = 2,
};

// Describes which attention bias is in use and, through rank_info, how the
// bias tensor is laid out.
struct bias_info
{
    bias_enum type;
    /*
     * simple dispatch logic
     *
     * if type == elementwise_bias:
     *      if rank_info == 0:
     *           bias is 1*1*s*s
     *      elif rank_info == 1:
     *           bias is 1*h*s*s
     *      elif rank_info == 2:
     *           bias is b*h*s*s
     *
     * elif type == alibi:
     *       if rank_info == 0:
     *           alibi in 1*h
     *       elif rank_info == 1:
     *           alibi in b*h
     */
    int rank_info;

    // Writes a short tag for the bias type ("n", "e" or "alibi"); for the two
    // biased types a non-zero rank_info is appended as "[<rank_info>]".
    // NOTE: this bracket form is not the ":<rank>" form that decode() reads.
    void serialize(std::ostream& os) const
    {
        const char* tag = nullptr;
        switch(type)
        {
        case bias_enum::no_bias: os << "n"; return;
        case bias_enum::elementwise_bias: tag = "e"; break;
        case bias_enum::alibi: tag = "alibi"; break;
        default: return; // unknown enumerator: emit nothing
        }
        os << tag;
        if(rank_info != 0)
        {
            os << "[" << rank_info << "]";
        }
    }

    // Parses a textual bias specification.
    //   "0" or "n" (exact match)           -> no_bias
    //   anything starting with '1' or 'e'  -> elementwise_bias
    //   anything starting with '2' or 'a'  -> alibi
    // For the two biased types, text after the first ':' is converted with
    // atoi and stored in rank_info. Any other input yields {no_bias, 0}.
    static bias_info decode(std::string str)
    {
        bias_info result{bias_enum::no_bias, 0};
        if(str == "0" || str == "n")
        {
            return result;
        }

        // Only the first character decides the type; the longer spellings
        // "elementwise" and "alibi" are covered by their first letters.
        const char head = str.empty() ? '\0' : str.front();
        if(head == '1' || head == 'e')
        {
            result.type = bias_enum::elementwise_bias;
        }
        else if(head == '2' || head == 'a')
        {
            result.type = bias_enum::alibi;
        }
        else
        {
            return result;
        }

        const auto colon = str.find(':');
        if(colon != std::string::npos)
        {
            const std::string rank_text = str.substr(colon + 1);
            result.rank_info            = atoi(rank_text.c_str());
        }
        return result;
    }

    // Stream insertion; equivalent to bi.serialize(os).
    friend std::ostream& operator<<(std::ostream& os, const bias_info& bi)
    {
        bi.serialize(os);
        return os;
    }
};

View File

@ -1,457 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ck_tile/core.hpp>
#include <ck_tile/host/kernel_launch.hpp>
#include <ck_tile/ops/fmha.hpp>
#include <ck_tile/ops/epilogue.hpp>
#include <mask.hpp>
#include <bias.hpp>
#include <launch_kernel_pt.hpp>
#include <type_traits>
#include <utility>
#include <variant>
// Empty tag types used as the template argument of FmhaBwdTypeConfig to
// select the fp16 or bf16 set of data types.
struct FmhaBwdFp16
{
};
struct FmhaBwdBf16
{
};
// Maps a tag type to the concrete element types used by the FMHA backward
// pass. Only the explicit specializations are defined.
template <typename DataType>
struct FmhaBwdTypeConfig;
// fp16 configuration: Q/K/V, gemm, bias, O and all gradient types are
// ck_tile::half_t; LSE, accumulation and D use float; the random-value
// output is uint8_t.
template <>
struct FmhaBwdTypeConfig<FmhaBwdFp16>
{
using QDataType = ck_tile::half_t;
using KDataType = ck_tile::half_t;
using VDataType = ck_tile::half_t;
using GemmDataType = ck_tile::half_t;
using BiasDataType = ck_tile::half_t;
using LSEDataType = float;
using AccDataType = float; // data type for gemm accumulation
using DDataType = float;
using RandValOutputDataType = uint8_t;
using ODataType = ck_tile::half_t;
using OGradDataType = ck_tile::half_t;
using QGradDataType = ck_tile::half_t;
using KGradDataType = ck_tile::half_t;
using VGradDataType = ck_tile::half_t;
using BiasGradDataType = ck_tile::half_t;
};
// Backward-pass type set for the bf16 tag: ck_tile::bf16_t for tensors and
// gradients, float for LSE / D / accumulation, uint8_t for random values.
template <>
struct FmhaBwdTypeConfig<FmhaBwdBf16>
{
    // Q/K/V/bias/O element types
    using QDataType    = ck_tile::bf16_t;
    using KDataType    = ck_tile::bf16_t;
    using VDataType    = ck_tile::bf16_t;
    using BiasDataType = ck_tile::bf16_t;
    using ODataType    = ck_tile::bf16_t;

    // gemm operand and accumulator
    using GemmDataType = ck_tile::bf16_t;
    using AccDataType  = float; // data type for gemm accumulation

    // float / byte side buffers
    using LSEDataType           = float;
    using DDataType             = float;
    using RandValOutputDataType = uint8_t;

    // gradient element types
    using OGradDataType    = ck_tile::bf16_t;
    using QGradDataType    = ck_tile::bf16_t;
    using KGradDataType    = ck_tile::bf16_t;
    using VGradDataType    = ck_tile::bf16_t;
    using BiasGradDataType = ck_tile::bf16_t;
};
// Named aliases for the three ck_tile::GenericAttentionMask instantiations
// referenced through this header.
struct FmhaMasks
{
    using NoMask      = ck_tile::GenericAttentionMask<false>;
    using CausalMask  = ck_tile::GenericAttentionMask<true, false>;
    using GenericMask = ck_tile::GenericAttentionMask<true, true>;
};
// runtime args: some are passed through to the kernel args (kargs), others are used to compute grids/blocks
// Plain aggregate of everything the backward helpers in this header need.
// The three *_create_kargs_and_grids functions each forward a different subset
// of these fields; member order matters for aggregate initialization.
struct fmha_bwd_args
{
// Device pointers. q/k/v/bias/lse/do/d/rand_val/dk/dv/dbias/dq_acc are forwarded
// to the dq_dk_dv kernel; o/do/d to the dot_do_o kernel; dq_acc/dq to convert_dq.
const void* q_ptr;
const void* k_ptr;
const void* v_ptr;
const void* bias_ptr; // bias or alibi_slope pointer
const void* o_ptr;
const void* lse_ptr;
const void* do_ptr;
void* d_ptr;
void* rand_val_ptr;
void* dq_ptr;
void* dk_ptr;
void* dv_ptr;
void* dbias_ptr;
void* dq_acc_ptr;
// Forwarded only when the kernel's kIsGroupMode is true.
const void* seqstart_q_ptr;
const void* seqstart_k_ptr;
const void* seqlen_k_ptr;
// seqlen_q / seqlen_k are forwarded only in batch mode.
ck_tile::index_t seqlen_q;
ck_tile::index_t seqlen_k;
// batch, nhead_q and max_seqlen_q / max_seqlen_k feed the GridSize() calls.
ck_tile::index_t batch;
ck_tile::index_t max_seqlen_q;
ck_tile::index_t max_seqlen_k;
ck_tile::index_t hdim_q;
ck_tile::index_t hdim_v;
// The helpers pass nhead_q / nhead_k to the kernel and assert it divides evenly.
ck_tile::index_t nhead_q;
ck_tile::index_t nhead_k;
float scale;
// Per-tensor strides (presumably the row stride of each tensor — TODO confirm).
ck_tile::index_t stride_q;
ck_tile::index_t stride_k;
ck_tile::index_t stride_v;
ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
ck_tile::index_t stride_o;
ck_tile::index_t stride_randval;
ck_tile::index_t stride_do;
ck_tile::index_t stride_dq_acc;
ck_tile::index_t stride_dq;
ck_tile::index_t stride_dk;
ck_tile::index_t stride_dv;
ck_tile::index_t stride_dbias;
// Per-head strides; forwarded in both group and batch mode.
ck_tile::index_t nhead_stride_q;
ck_tile::index_t nhead_stride_k;
ck_tile::index_t nhead_stride_v;
ck_tile::index_t nhead_stride_bias;
ck_tile::index_t nhead_stride_o;
ck_tile::index_t nhead_stride_randval;
ck_tile::index_t nhead_stride_do;
ck_tile::index_t nhead_stride_lsed;
ck_tile::index_t nhead_stride_dq_acc;
ck_tile::index_t nhead_stride_dq;
ck_tile::index_t nhead_stride_dk;
ck_tile::index_t nhead_stride_dv;
ck_tile::index_t nhead_stride_dbias;
// Per-batch strides; forwarded only in batch mode.
ck_tile::index_t batch_stride_q;
ck_tile::index_t batch_stride_k;
ck_tile::index_t batch_stride_v;
ck_tile::index_t batch_stride_bias;
ck_tile::index_t batch_stride_o;
ck_tile::index_t batch_stride_randval;
ck_tile::index_t batch_stride_do;
ck_tile::index_t batch_stride_lsed;
ck_tile::index_t batch_stride_dq_acc;
ck_tile::index_t batch_stride_dq;
ck_tile::index_t batch_stride_dk;
ck_tile::index_t batch_stride_dv;
ck_tile::index_t batch_stride_dbias;
ck_tile::index_t split_stride_dq_acc;
// Mask parameters, forwarded unchanged to the dq_dk_dv kernel.
ck_tile::index_t window_size_left;
ck_tile::index_t window_size_right;
ck_tile::index_t mask_type;
// p_drop goes to the dq_dk_dv kernel, p_undrop to the dot_do_o kernel.
float p_drop;
float p_undrop;
// Either a pair of 64-bit values or a pair of pointers
// (presumably seed and offset, per the member name — TODO confirm).
std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
drop_seed_offset;
};
// Builds the kernel-argument object and launch grid for the dq/dk/dv kernel.
// Group-mode kernels receive the seqstart/seqlen pointers; batch-mode kernels
// receive the scalar sequence lengths plus the per-batch strides.
// Returns ck_tile::make_tuple(kargs, grids).
template <typename FmhaBwdDQDKDVKernel>
auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
{
    using Kernel = FmhaBwdDQDKDVKernel;

    assert(args.nhead_q % args.nhead_k == 0);

    if constexpr(Kernel::kIsGroupMode)
    {
        // group mode kernel arguments
        auto kernel_args = Kernel::MakeKargsImpl(args.q_ptr,
                                                 args.k_ptr,
                                                 args.v_ptr,
                                                 args.bias_ptr,
                                                 args.lse_ptr,
                                                 args.do_ptr,
                                                 args.d_ptr,
                                                 args.rand_val_ptr,
                                                 args.dk_ptr,
                                                 args.dv_ptr,
                                                 args.dbias_ptr,
                                                 args.dq_acc_ptr,
                                                 args.seqstart_q_ptr,
                                                 args.seqstart_k_ptr,
                                                 args.seqlen_k_ptr,
                                                 args.hdim_q,
                                                 args.hdim_v,
                                                 args.nhead_q,
                                                 args.nhead_q / args.nhead_k,
                                                 args.scale,
                                                 args.stride_q,
                                                 args.stride_k,
                                                 args.stride_v,
                                                 args.stride_bias,
                                                 args.stride_randval,
                                                 args.stride_do,
                                                 args.stride_dq_acc,
                                                 args.stride_dk,
                                                 args.stride_dv,
                                                 args.stride_dbias,
                                                 args.nhead_stride_q,
                                                 args.nhead_stride_k,
                                                 args.nhead_stride_v,
                                                 args.nhead_stride_bias,
                                                 args.nhead_stride_randval,
                                                 args.nhead_stride_do,
                                                 args.nhead_stride_lsed,
                                                 args.nhead_stride_dq_acc,
                                                 args.nhead_stride_dk,
                                                 args.nhead_stride_dv,
                                                 args.nhead_stride_dbias,
                                                 args.split_stride_dq_acc,
                                                 args.window_size_left,
                                                 args.window_size_right,
                                                 args.mask_type,
                                                 args.p_drop,
                                                 args.drop_seed_offset);
        dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_k);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
    else
    {
        // batch mode kernel arguments
        auto kernel_args = Kernel::MakeKargsImpl(args.q_ptr,
                                                 args.k_ptr,
                                                 args.v_ptr,
                                                 args.bias_ptr,
                                                 args.lse_ptr,
                                                 args.do_ptr,
                                                 args.d_ptr,
                                                 args.rand_val_ptr,
                                                 args.dk_ptr,
                                                 args.dv_ptr,
                                                 args.dbias_ptr,
                                                 args.dq_acc_ptr,
                                                 args.seqlen_q,
                                                 args.seqlen_k,
                                                 args.hdim_q,
                                                 args.hdim_v,
                                                 args.nhead_q,
                                                 args.nhead_q / args.nhead_k,
                                                 args.scale,
                                                 args.stride_q,
                                                 args.stride_k,
                                                 args.stride_v,
                                                 args.stride_bias,
                                                 args.stride_randval,
                                                 args.stride_do,
                                                 args.stride_dq_acc,
                                                 args.stride_dk,
                                                 args.stride_dv,
                                                 args.stride_dbias,
                                                 args.nhead_stride_q,
                                                 args.nhead_stride_k,
                                                 args.nhead_stride_v,
                                                 args.nhead_stride_bias,
                                                 args.nhead_stride_randval,
                                                 args.nhead_stride_do,
                                                 args.nhead_stride_lsed,
                                                 args.nhead_stride_dq_acc,
                                                 args.nhead_stride_dk,
                                                 args.nhead_stride_dv,
                                                 args.nhead_stride_dbias,
                                                 args.batch_stride_q,
                                                 args.batch_stride_k,
                                                 args.batch_stride_v,
                                                 args.batch_stride_bias,
                                                 args.batch_stride_randval,
                                                 args.batch_stride_do,
                                                 args.batch_stride_lsed,
                                                 args.batch_stride_dq_acc,
                                                 args.batch_stride_dk,
                                                 args.batch_stride_dv,
                                                 args.batch_stride_dbias,
                                                 args.split_stride_dq_acc,
                                                 args.window_size_left,
                                                 args.window_size_right,
                                                 args.mask_type,
                                                 args.p_drop,
                                                 args.drop_seed_offset);
        dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_k);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
}
// Builds the kernel-argument object and launch grid for the dO-dot-O kernel.
// Group mode passes seqstart_q_ptr; batch mode passes seqlen_q and the
// per-batch strides. Returns ck_tile::make_tuple(kargs, grids).
template <typename FmhaBwdOGradDotOKernel>
auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args)
{
    using Kernel = FmhaBwdOGradDotOKernel;

    if constexpr(Kernel::kIsGroupMode)
    {
        // group mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.o_ptr,
                                             args.do_ptr,
                                             args.d_ptr,
                                             args.p_undrop,
                                             args.seqstart_q_ptr,
                                             args.hdim_v,
                                             args.stride_do,
                                             args.stride_o,
                                             args.nhead_stride_do,
                                             args.nhead_stride_o,
                                             args.nhead_stride_lsed);
        dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
    else
    {
        // batch mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.o_ptr,
                                             args.do_ptr,
                                             args.d_ptr,
                                             args.p_undrop,
                                             args.seqlen_q,
                                             args.hdim_v,
                                             args.stride_do,
                                             args.stride_o,
                                             args.nhead_stride_do,
                                             args.nhead_stride_o,
                                             args.nhead_stride_lsed,
                                             args.batch_stride_do,
                                             args.batch_stride_o,
                                             args.batch_stride_lsed);
        dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
}
// Builds the kernel-argument object and launch grid for the kernel that
// converts dq_acc into dq. Group mode passes the seqstart pointers; batch mode
// passes the scalar lengths and per-batch strides.
// Returns ck_tile::make_tuple(kargs, grids).
template <typename FmhaBwdConvertQGradKernel>
auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
{
    using Kernel = FmhaBwdConvertQGradKernel;

    if constexpr(Kernel::kIsGroupMode)
    {
        // group mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.dq_acc_ptr,
                                             args.dq_ptr,
                                             args.seqstart_q_ptr,
                                             args.seqstart_k_ptr,
                                             args.hdim_q,
                                             args.stride_dq,
                                             args.stride_dq_acc,
                                             args.nhead_stride_dq,
                                             args.nhead_stride_dq_acc,
                                             args.split_stride_dq_acc);
        dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
    else
    {
        // batch mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.dq_acc_ptr,
                                             args.dq_ptr,
                                             args.seqlen_q,
                                             args.seqlen_k,
                                             args.hdim_q,
                                             args.stride_dq,
                                             args.stride_dq_acc,
                                             args.nhead_stride_dq,
                                             args.nhead_stride_dq_acc,
                                             args.batch_stride_dq,
                                             args.batch_stride_dq_acc,
                                             args.split_stride_dq_acc);
        dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
}
// this is used to pattern-match the internal kernel implementation, not to instantiate a kernel
// Compile-time description of one dq/dk/dv kernel variant. Each template
// argument is re-exported as a member of the same name without the trailing
// underscore; type arguments are stripped of cv/ref qualifiers.
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
          typename FmhaMask_,
          typename FmhaDropout_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kHasBiasGrad_,
          bool kPadS_,
          bool kPadSK_,
          bool kPadD_,
          bool kPadDv_,
          bool kIsDeterministic_>
struct fmha_bwd_dq_dk_dv_traits_
{
    // type members
    using DataType    = ck_tile::remove_cvref_t<DataType_>;
    using FmhaMask    = ck_tile::remove_cvref_t<FmhaMask_>;
    using FmhaDropout = ck_tile::remove_cvref_t<FmhaDropout_>;

    // value members
    static constexpr ck_tile::index_t HDim = HDim_;
    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
    static constexpr auto BiasEnum            = BiasEnum_;

    // boolean switches
    static constexpr bool kIsGroupMode     = kIsGroupMode_;
    static constexpr bool kHasBiasGrad     = kHasBiasGrad_;
    static constexpr bool kIsDeterministic = kIsDeterministic_;
    static constexpr bool kPadS            = kPadS_;
    static constexpr bool kPadSK           = kPadSK_;
    static constexpr bool kPadD            = kPadD_;
    static constexpr bool kPadDv           = kPadDv_;
};
// Entry points for the dq/dk/dv kernel, selected by a traits type.
// These are declarations only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
// Returns a float (presumably elapsed time, as with other ck_tile launchers — TODO confirm).
template <typename Traits_>
float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config&, fmha_bwd_args);
// Variant with no return value for the same traits type.
template <typename Traits_>
void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
// Returns a string name for the kernel instance matching Traits_.
template <typename Traits_>
std::string fmha_bwd_dq_dk_dv_get_name_();
// Compile-time description of one dO-dot-O kernel variant; every template
// argument is re-exported as a member named without the trailing underscore.
template <ck_tile::index_t HDim_, typename DataType_, bool kIsGroupMode_, bool kPadS_, bool kPadDv_>
struct fmha_bwd_dot_do_o_traits_
{
    using DataType = ck_tile::remove_cvref_t<DataType_>;

    static constexpr ck_tile::index_t HDim = HDim_;

    static constexpr bool kIsGroupMode = kIsGroupMode_;
    static constexpr bool kPadS        = kPadS_;
    static constexpr bool kPadDv       = kPadDv_;
};
// Entry points for the dO-dot-O kernel, selected by a traits type.
// Declarations only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
template <typename Traits_>
float fmha_bwd_dot_do_o_(const ck_tile::stream_config&, fmha_bwd_args);
// Variant with no return value for the same traits type.
template <typename Traits_>
void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
// Returns a string name for the kernel instance matching Traits_.
template <typename Traits_>
std::string fmha_bwd_dot_do_o_get_name_();
// Compile-time description of one convert-dq kernel variant; every template
// argument is re-exported as a member named without the trailing underscore.
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          bool kPadS_,
          bool kPadD_,
          bool kIsDeterministic_>
struct fmha_bwd_convert_dq_traits_
{
    using DataType = ck_tile::remove_cvref_t<DataType_>;

    static constexpr ck_tile::index_t HDim = HDim_;

    static constexpr bool kIsGroupMode     = kIsGroupMode_;
    static constexpr bool kIsDeterministic = kIsDeterministic_;
    static constexpr bool kPadS            = kPadS_;
    static constexpr bool kPadD            = kPadD_;
};
// Entry points for the convert-dq kernel, selected by a traits type.
// Declarations only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
template <typename Traits_>
float fmha_bwd_convert_dq_(const ck_tile::stream_config&, fmha_bwd_args);
// Variant with no return value for the same traits type.
template <typename Traits_>
void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
// Returns a string name for the kernel instance matching Traits_.
template <typename Traits_>
std::string fmha_bwd_convert_dq_get_name_();
// This is the public API, will be generated by script
// Runtime description of the requested backward kernel, passed to fmha_bwd().
// Plain aggregate: member order matters for aggregate initialization.
struct fmha_bwd_traits
{
int hdim_q;
int hdim_v;
// Data type given as a string (accepted spellings are not visible here).
std::string data_type;
bool is_group_mode;
mask_enum mask_type;
bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
bool has_dbias;
bool has_dropout;
bool is_store_randval;
bool is_deterministic;
// TODO: padding check is inside this api
};
// Backward entry point taking runtime traits, runtime args and a stream config.
// Declaration only; Version defaults to 2 (its meaning is not visible here).
template <int Version = 2>
float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);

View File

@ -1,824 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ck_tile/core.hpp>
#include <ck_tile/host/kernel_launch.hpp>
#include <ck_tile/ops/epilogue.hpp>
#include <ck_tile/ops/fmha.hpp>
#include <bias.hpp>
#include <mask.hpp>
#include <rotary.hpp>
#include <launch_kernel_pt.hpp>
#include <type_traits>
#include <utility>
#include <variant>
// Empty tag types naming the forward-pass data-type configurations.
// They carry no data and exist only to select a FmhaFwdTypeConfig specialization.
struct FmhaFwdFp16 {};
struct FmhaFwdBf16 {};
struct FmhaFwdFp8 {};
struct FmhaFwdBf8 {};
struct FmhaFwdFp8Fp16 {};
struct FmhaFwdFp8Bf16 {};

// Primary template is declared but not defined; only explicit specializations
// for the tag types are usable.
template <typename DataTypeTag>
struct FmhaFwdTypeConfig;
// Forward-pass type set for the fp16 tag.
template <>
struct FmhaFwdTypeConfig<FmhaFwdFp16>
{
    // tensor element types
    using QDataType    = ck_tile::half_t;
    using KDataType    = ck_tile::half_t;
    using VDataType    = ck_tile::half_t;
    using BiasDataType = ck_tile::half_t;
    using ODataType    = ck_tile::half_t;
    using PDataType    = ck_tile::half_t; // A matrix of the second gemm

    // float intermediates
    using SaccDataType        = float; // first gemm accumulation
    using SMPLComputeDataType = float; // reduction, softmax
    using OaccDataType        = float; // second gemm accumulation
    using LSEDataType         = float; // lse (logsumexp L_j = max_j + log(l_j))

    using RandValOutputDataType = uint8_t;
};
// Forward-pass type set for the bf16 tag.
template <>
struct FmhaFwdTypeConfig<FmhaFwdBf16>
{
    // tensor element types
    using QDataType    = ck_tile::bf16_t;
    using KDataType    = ck_tile::bf16_t;
    using VDataType    = ck_tile::bf16_t;
    using BiasDataType = ck_tile::bf16_t;
    using ODataType    = ck_tile::bf16_t;
    using PDataType    = ck_tile::bf16_t; // A matrix of the second gemm

    // float intermediates
    using SaccDataType        = float; // first gemm accumulation
    using SMPLComputeDataType = float; // reduction, softmax
    using OaccDataType        = float; // second gemm accumulation
    using LSEDataType         = float; // lse (logsumexp L_j = max_j + log(l_j))

    using RandValOutputDataType = uint8_t;
};
// Forward-pass type set for the fp8 tag. Note that the bias type is float
// here, unlike the other specializations in this header.
template <>
struct FmhaFwdTypeConfig<FmhaFwdFp8>
{
    // tensor element types
    using QDataType    = ck_tile::fp8_t;
    using KDataType    = ck_tile::fp8_t;
    using VDataType    = ck_tile::fp8_t;
    using BiasDataType = float;
    using ODataType    = ck_tile::fp8_t;
    using PDataType    = ck_tile::fp8_t; // A matrix of the second gemm

    // float intermediates
    using SaccDataType        = float; // first gemm accumulation
    using SMPLComputeDataType = float; // reduction, softmax
    using OaccDataType        = float; // second gemm accumulation
    using LSEDataType         = float; // lse (logsumexp L_j = max_j + log(l_j))

    using RandValOutputDataType = uint8_t;
};
// Forward-pass type set for the bf8 tag.
template <>
struct FmhaFwdTypeConfig<FmhaFwdBf8>
{
    // tensor element types
    using QDataType    = ck_tile::bf8_t;
    using KDataType    = ck_tile::bf8_t;
    using VDataType    = ck_tile::bf8_t;
    using BiasDataType = ck_tile::bf8_t;
    using ODataType    = ck_tile::bf8_t;
    using PDataType    = ck_tile::bf8_t; // A matrix of the second gemm

    // float intermediates
    using SaccDataType        = float; // first gemm accumulation
    using SMPLComputeDataType = float; // reduction, softmax
    using OaccDataType        = float; // second gemm accumulation
    using LSEDataType         = float; // lse (logsumexp L_j = max_j + log(l_j))

    using RandValOutputDataType = uint8_t;
};
// Named aliases for the three ck_tile::GenericAttentionMask instantiations
// referenced through this header.
struct FmhaMasks
{
    using NoMask      = ck_tile::GenericAttentionMask<false>;
    using CausalMask  = ck_tile::GenericAttentionMask<true, false>;
    using GenericMask = ck_tile::GenericAttentionMask<true, true>;
};
// runtime args: some are passed through to the kernel args (kargs), others are used to compute grids/blocks
// Plain aggregate of the runtime arguments read by fmha_fwd_create_kargs_and_grids.
// Member order matters for aggregate initialization.
struct fmha_fwd_args
{
// Pointers forwarded to the kernel in both group and batch mode.
const void* q_ptr;
const void* k_ptr;
const void* v_ptr;
const void* bias_ptr; // bias or alibi_slope pointer
void* rand_val_ptr;
void* lse_ptr;
void* o_ptr;
// Forwarded only when the kernel's kIsGroupMode is true.
const void* seqstart_q_ptr;
const void* seqstart_k_ptr;
const void*
seqlen_k_ptr; // only used if both 'seqstart_q_ptr' & 'seqstart_k_ptr' are not nullptr
// seqlen_q / seqlen_k are forwarded only in batch mode.
ck_tile::index_t seqlen_q;
ck_tile::index_t seqlen_k;
// batch, nhead_q, max_seqlen_q and hdim_v feed the GridSize() call.
ck_tile::index_t batch;
ck_tile::index_t max_seqlen_q;
ck_tile::index_t hdim_q;
ck_tile::index_t hdim_v;
// The helper passes nhead_q / nhead_k to the kernel and asserts it divides evenly.
ck_tile::index_t nhead_q;
ck_tile::index_t nhead_k;
float scale_s;
float scale_p;
float scale_o;
// Per-tensor strides (presumably the row stride of each tensor — TODO confirm).
ck_tile::index_t stride_q;
ck_tile::index_t stride_k;
ck_tile::index_t stride_v;
ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
ck_tile::index_t stride_randval;
ck_tile::index_t stride_o;
// Per-head strides; forwarded in both modes.
ck_tile::index_t nhead_stride_q;
ck_tile::index_t nhead_stride_k;
ck_tile::index_t nhead_stride_v;
ck_tile::index_t nhead_stride_bias;
ck_tile::index_t nhead_stride_randval;
ck_tile::index_t nhead_stride_lse;
ck_tile::index_t nhead_stride_o;
// Per-batch strides; forwarded only in batch mode.
ck_tile::index_t batch_stride_q;
ck_tile::index_t batch_stride_k;
ck_tile::index_t batch_stride_v;
ck_tile::index_t batch_stride_bias;
ck_tile::index_t batch_stride_randval;
ck_tile::index_t batch_stride_lse;
ck_tile::index_t batch_stride_o;
// Mask parameters, forwarded unchanged.
ck_tile::index_t window_size_left;
ck_tile::index_t window_size_right;
ck_tile::index_t mask_type;
float p_drop;
bool s_randval;
// Either a pair of 64-bit values or a pair of pointers
// (presumably seed and offset, per the member name — TODO confirm).
std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
drop_seed_offset;
};
// Plain aggregate of the runtime arguments shared by the split-KV helper and
// the split-KV combine helper. Member order matters for aggregate initialization.
struct fmha_fwd_splitkv_args
{
const void* q_ptr;
const void* k_ptr;
const void* v_ptr;
const void* bias_ptr; // bias or alibi_slope pointer
// lse_acc / o_acc are passed to both helpers; lse / o only to the combine helper.
void* lse_acc_ptr;
void* o_acc_ptr;
void* lse_ptr;
void* o_ptr;
void* block_table_ptr;
ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr
bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not
// nullptr.
// Forwarded only by the batch-mode branch of the split-KV helper.
const void* cache_batch_idx;
// the real seqlen_q & seqlen_k are decided by following:
// batch mode: seqlen_q = kargs.seqlen_q
// seqlen_k = kargs.seqlen_k
// group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
// seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
// or kargs.seqlen_k_ptr[b]
//
// batch mode (kvcache):
// seqlen_q = kargs.seqlen_q
// seqlen_k = kargs.seqlen_k_ptr[b]
// group mode (kvcache):
// seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
//
// when is_gappy=true:
// seqlen_k = kargs.seqlen_k_ptr[b]
// seqstart_k_ptr[b] now store local offset of each batch
//
// when is_gappy=false:
// seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
// or kargs.seqlen_k_ptr[b]
const void* seqstart_q_ptr;
const void* seqstart_k_ptr;
const void* seqlen_k_ptr;
ck_tile::index_t seqlen_q;
ck_tile::index_t seqlen_k;
// batch, nhead_q, nhead_k, max_seqlen_q, hdim_v and num_splits feed GridSize().
ck_tile::index_t batch;
ck_tile::index_t max_seqlen_q;
ck_tile::index_t hdim_q;
ck_tile::index_t hdim_v;
ck_tile::index_t nhead_q;
ck_tile::index_t nhead_k;
ck_tile::index_t num_splits;
// scale_s / scale_p go to the split-KV kernel, scale_o to the combine kernel.
float scale_s;
float scale_p;
float scale_o;
ck_tile::index_t stride_q;
ck_tile::index_t stride_k;
ck_tile::index_t stride_v;
ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
ck_tile::index_t stride_o_acc;
ck_tile::index_t stride_o;
ck_tile::index_t nhead_stride_q;
ck_tile::index_t nhead_stride_k;
ck_tile::index_t nhead_stride_v;
ck_tile::index_t nhead_stride_bias;
ck_tile::index_t nhead_stride_lse;
ck_tile::index_t nhead_stride_lse_acc;
ck_tile::index_t nhead_stride_o_acc;
ck_tile::index_t nhead_stride_o;
ck_tile::index_t batch_stride_q;
ck_tile::index_t batch_stride_k;
ck_tile::index_t batch_stride_v;
ck_tile::index_t batch_stride_bias;
ck_tile::index_t batch_stride_lse;
ck_tile::index_t batch_stride_lse_acc;
ck_tile::index_t batch_stride_o_acc;
ck_tile::index_t batch_stride_o;
ck_tile::index_t split_stride_lse_acc;
ck_tile::index_t split_stride_o_acc;
// Mask parameters, forwarded unchanged to the split-KV kernel.
ck_tile::index_t window_size_left;
ck_tile::index_t window_size_right;
ck_tile::index_t mask_type;
};
// Plain aggregate of the runtime arguments read by
// fmha_fwd_appendkv_create_kargs_and_grids, which forwards every field except
// batch to Kernel::MakeKargs. Member order matters for aggregate initialization.
struct fmha_fwd_appendkv_args
{
// q/k/v are non-const here while knew/vnew are const
// (presumably q/k/v are updated in place — TODO confirm).
void* q_ptr;
void* k_ptr;
const void* knew_ptr;
void* v_ptr;
const void* vnew_ptr;
const void* seqlen_k_ptr;
// batch, nhead_q, seqlen_q and seqlen_knew feed the GridSize() call.
ck_tile::index_t seqlen_q;
ck_tile::index_t seqlen_knew;
ck_tile::index_t batch;
ck_tile::index_t hdim_q;
ck_tile::index_t hdim_v;
// The helper passes nhead_q / nhead_k to the kernel and asserts it divides evenly.
ck_tile::index_t nhead_q;
ck_tile::index_t nhead_k;
const void* rotary_cos_ptr; // only used if 'rotary_dim' > 0
const void* rotary_sin_ptr; // only used if 'rotary_dim' > 0
ck_tile::index_t rotary_dim;
bool has_mask;
void* block_table_ptr;
ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
ck_tile::index_t page_block_size; // only used if 'block_table_ptr' is not nullptr
const void* cache_batch_idx; // only used if block_table_ptr is nullptr -> batch mode (kvcache)
// Per-tensor, per-head and per-batch strides, all forwarded to the kernel.
ck_tile::index_t stride_q;
ck_tile::index_t stride_k;
ck_tile::index_t stride_knew;
ck_tile::index_t stride_v;
ck_tile::index_t stride_vnew;
ck_tile::index_t nhead_stride_q;
ck_tile::index_t nhead_stride_k;
ck_tile::index_t nhead_stride_knew;
ck_tile::index_t nhead_stride_v;
ck_tile::index_t nhead_stride_vnew;
ck_tile::index_t batch_stride_q;
ck_tile::index_t batch_stride_k;
ck_tile::index_t batch_stride_knew;
ck_tile::index_t batch_stride_v;
ck_tile::index_t batch_stride_vnew;
};
// Builds the kernel-argument object and launch grid for the forward kernel.
// Group-mode kernels receive the seqstart/seqlen pointers and a grid flag that
// reflects whether seqlen_k_ptr is set; batch-mode kernels receive the scalar
// sequence lengths, the per-batch strides and a grid flag of false.
// Returns ck_tile::make_tuple(kargs, grids).
template <typename FmhaKernel>
auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
{
    assert(args.nhead_q % args.nhead_k == 0);

    if constexpr(FmhaKernel::kIsGroupMode)
    {
        // group mode kernel arguments
        auto kernel_args = FmhaKernel::MakeKargsImpl(args.q_ptr,
                                                     args.k_ptr,
                                                     args.v_ptr,
                                                     args.bias_ptr,
                                                     args.rand_val_ptr,
                                                     args.lse_ptr,
                                                     args.o_ptr,
                                                     args.seqstart_q_ptr,
                                                     args.seqstart_k_ptr,
                                                     args.seqlen_k_ptr,
                                                     args.hdim_q,
                                                     args.hdim_v,
                                                     args.nhead_q,
                                                     args.nhead_q / args.nhead_k,
                                                     args.scale_s,
                                                     args.scale_p,
                                                     args.scale_o,
                                                     args.stride_q,
                                                     args.stride_k,
                                                     args.stride_v,
                                                     args.stride_bias,
                                                     args.stride_randval,
                                                     args.stride_o,
                                                     args.nhead_stride_q,
                                                     args.nhead_stride_k,
                                                     args.nhead_stride_v,
                                                     args.nhead_stride_bias,
                                                     args.nhead_stride_randval,
                                                     args.nhead_stride_lse,
                                                     args.nhead_stride_o,
                                                     args.window_size_left,
                                                     args.window_size_right,
                                                     args.mask_type,
                                                     args.p_drop,
                                                     args.s_randval,
                                                     args.drop_seed_offset);
        dim3 grid_dims = FmhaKernel::GridSize(
            args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.seqlen_k_ptr != nullptr);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
    else
    {
        // batch mode kernel arguments
        auto kernel_args = FmhaKernel::MakeKargsImpl(args.q_ptr,
                                                     args.k_ptr,
                                                     args.v_ptr,
                                                     args.bias_ptr,
                                                     args.rand_val_ptr,
                                                     args.lse_ptr,
                                                     args.o_ptr,
                                                     args.seqlen_q,
                                                     args.seqlen_k,
                                                     args.hdim_q,
                                                     args.hdim_v,
                                                     args.nhead_q,
                                                     args.nhead_q / args.nhead_k,
                                                     args.scale_s,
                                                     args.scale_p,
                                                     args.scale_o,
                                                     args.stride_q,
                                                     args.stride_k,
                                                     args.stride_v,
                                                     args.stride_bias,
                                                     args.stride_randval,
                                                     args.stride_o,
                                                     args.nhead_stride_q,
                                                     args.nhead_stride_k,
                                                     args.nhead_stride_v,
                                                     args.nhead_stride_bias,
                                                     args.nhead_stride_randval,
                                                     args.nhead_stride_lse,
                                                     args.nhead_stride_o,
                                                     args.batch_stride_q,
                                                     args.batch_stride_k,
                                                     args.batch_stride_v,
                                                     args.batch_stride_bias,
                                                     args.batch_stride_randval,
                                                     args.batch_stride_lse,
                                                     args.batch_stride_o,
                                                     args.window_size_left,
                                                     args.window_size_right,
                                                     args.mask_type,
                                                     args.p_drop,
                                                     args.s_randval,
                                                     args.drop_seed_offset);
        dim3 grid_dims =
            FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, false);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
}
// Builds the kernel-argument object and launch grid for the split-KV forward
// kernel. Group mode passes the seqstart pointers and is_gappy; batch mode
// passes the scalar sequence lengths, cache_batch_idx and all per-batch strides.
// Returns ck_tile::make_tuple(kargs, grids).
template <typename Kernel>
auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
{
    assert(args.nhead_q % args.nhead_k == 0);

    if constexpr(Kernel::kIsGroupMode)
    {
        // group mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.q_ptr,
                                             args.k_ptr,
                                             args.v_ptr,
                                             args.bias_ptr,
                                             args.lse_acc_ptr,
                                             args.o_acc_ptr,
                                             args.batch,
                                             args.seqstart_q_ptr,
                                             args.seqstart_k_ptr,
                                             args.seqlen_k_ptr,
                                             args.hdim_q,
                                             args.hdim_v,
                                             args.nhead_q,
                                             args.nhead_q / args.nhead_k,
                                             args.num_splits,
                                             args.block_table_ptr,
                                             args.batch_stride_block_table,
                                             args.page_block_size,
                                             args.is_gappy,
                                             args.scale_s,
                                             args.scale_p,
                                             args.stride_q,
                                             args.stride_k,
                                             args.stride_v,
                                             args.stride_bias,
                                             args.stride_o_acc,
                                             args.nhead_stride_q,
                                             args.nhead_stride_k,
                                             args.nhead_stride_v,
                                             args.nhead_stride_bias,
                                             args.nhead_stride_lse_acc,
                                             args.nhead_stride_o_acc,
                                             args.batch_stride_k, // only used for paged-kvcache
                                             args.batch_stride_v, // only used for paged-kvcache
                                             args.split_stride_lse_acc,
                                             args.split_stride_o_acc,
                                             args.window_size_left,
                                             args.window_size_right,
                                             args.mask_type);
        dim3 grid_dims = Kernel::GridSize(
            args.batch, args.nhead_q, args.nhead_k, args.max_seqlen_q, args.hdim_v, args.num_splits);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
    else
    {
        // batch mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.q_ptr,
                                             args.k_ptr,
                                             args.v_ptr,
                                             args.bias_ptr,
                                             args.lse_acc_ptr,
                                             args.o_acc_ptr,
                                             args.batch,
                                             args.seqlen_q,
                                             args.seqlen_k,
                                             args.seqlen_k_ptr,
                                             args.hdim_q,
                                             args.hdim_v,
                                             args.nhead_q,
                                             args.nhead_q / args.nhead_k,
                                             args.num_splits,
                                             args.block_table_ptr,
                                             args.batch_stride_block_table,
                                             args.page_block_size,
                                             args.cache_batch_idx,
                                             args.scale_s,
                                             args.scale_p,
                                             args.stride_q,
                                             args.stride_k,
                                             args.stride_v,
                                             args.stride_bias,
                                             args.stride_o_acc,
                                             args.nhead_stride_q,
                                             args.nhead_stride_k,
                                             args.nhead_stride_v,
                                             args.nhead_stride_bias,
                                             args.nhead_stride_lse_acc,
                                             args.nhead_stride_o_acc,
                                             args.batch_stride_q,
                                             args.batch_stride_k,
                                             args.batch_stride_v,
                                             args.batch_stride_bias,
                                             args.batch_stride_lse_acc,
                                             args.batch_stride_o_acc,
                                             args.split_stride_lse_acc,
                                             args.split_stride_o_acc,
                                             args.window_size_left,
                                             args.window_size_right,
                                             args.mask_type);
        dim3 grid_dims = Kernel::GridSize(
            args.batch, args.nhead_q, args.nhead_k, args.max_seqlen_q, args.hdim_v, args.num_splits);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
}
// Builds the kernel-argument object and launch grid for the split-KV combine
// kernel. Group mode passes seqstart_q_ptr; batch mode passes seqlen_q and the
// per-batch strides. Returns ck_tile::make_tuple(kargs, grids).
template <typename Kernel>
auto fmha_fwd_splitkv_combine_create_kargs_and_grids(fmha_fwd_splitkv_args args)
{
    assert(args.nhead_q % args.nhead_k == 0);

    if constexpr(Kernel::kIsGroupMode)
    {
        // group mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.lse_acc_ptr,
                                             args.o_acc_ptr,
                                             args.lse_ptr,
                                             args.o_ptr,
                                             args.batch,
                                             args.seqstart_q_ptr,
                                             args.hdim_v,
                                             args.num_splits,
                                             args.scale_o,
                                             args.stride_o_acc,
                                             args.stride_o,
                                             args.nhead_stride_lse_acc,
                                             args.nhead_stride_o_acc,
                                             args.nhead_stride_lse,
                                             args.nhead_stride_o,
                                             args.split_stride_lse_acc,
                                             args.split_stride_o_acc);
        dim3 grid_dims =
            Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
    else
    {
        // batch mode kernel arguments
        auto kernel_args = Kernel::MakeKargs(args.lse_acc_ptr,
                                             args.o_acc_ptr,
                                             args.lse_ptr,
                                             args.o_ptr,
                                             args.batch,
                                             args.seqlen_q,
                                             args.hdim_v,
                                             args.num_splits,
                                             args.scale_o,
                                             args.stride_o_acc,
                                             args.stride_o,
                                             args.nhead_stride_lse_acc,
                                             args.nhead_stride_o_acc,
                                             args.nhead_stride_lse,
                                             args.nhead_stride_o,
                                             args.batch_stride_lse_acc,
                                             args.batch_stride_o_acc,
                                             args.batch_stride_lse,
                                             args.batch_stride_o,
                                             args.split_stride_lse_acc,
                                             args.split_stride_o_acc);
        dim3 grid_dims =
            Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v);
        return ck_tile::make_tuple(kernel_args, grid_dims);
    }
}
// Builds the kernel-argument object and launch grid for the append-KV kernel.
// There is a single argument layout here (no group/batch split).
// Returns ck_tile::make_tuple(kargs, grids).
template <typename Kernel>
auto fmha_fwd_appendkv_create_kargs_and_grids(fmha_fwd_appendkv_args args)
{
    assert(args.nhead_q % args.nhead_k == 0);

    auto kernel_args = Kernel::MakeKargs(args.q_ptr,
                                         args.k_ptr,
                                         args.knew_ptr,
                                         args.v_ptr,
                                         args.vnew_ptr,
                                         args.seqlen_q,
                                         args.seqlen_k_ptr,
                                         args.seqlen_knew,
                                         args.hdim_q,
                                         args.hdim_v,
                                         args.nhead_q,
                                         args.nhead_q / args.nhead_k,
                                         args.rotary_cos_ptr,
                                         args.rotary_sin_ptr,
                                         args.rotary_dim,
                                         args.has_mask,
                                         args.block_table_ptr,
                                         args.batch_stride_block_table,
                                         args.page_block_size,
                                         args.cache_batch_idx,
                                         args.stride_q,
                                         args.stride_k,
                                         args.stride_knew,
                                         args.stride_v,
                                         args.stride_vnew,
                                         args.nhead_stride_q,
                                         args.nhead_stride_k,
                                         args.nhead_stride_knew,
                                         args.nhead_stride_v,
                                         args.nhead_stride_vnew,
                                         args.batch_stride_q,
                                         args.batch_stride_k,
                                         args.batch_stride_knew,
                                         args.batch_stride_v,
                                         args.batch_stride_vnew);

    dim3 grid_dims = Kernel::GridSize(args.batch, args.nhead_q, args.seqlen_q, args.seqlen_knew);

    return ck_tile::make_tuple(kernel_args, grid_dims);
}
// this is used to pattern-match the internal kernel implementation, not to instantiate a kernel
// Compile-time description of one forward kernel variant. Each template
// argument is re-exported as a member of the same name without the trailing
// underscore; type arguments are stripped of cv/ref qualifiers.
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::index_t kM0_,
          ck_tile::index_t kN0_,
          ck_tile::index_t kK0_,
          ck_tile::index_t kN1_,
          ck_tile::index_t kK1_,
          ck_tile::index_t kK0BlockLength_,
          bool kIsVLayoutRowMajor_,
          ck_tile::BlockFmhaPipelineEnum FmhaPipelineEnum_,
          typename FmhaMask_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kStoreLse_,
          bool kHasDropout_,
          bool kDoFp8StaticQuant_,
          bool kPadS_,
          bool kPadSK_,
          bool kPadD_,
          bool kPadDv_>
struct fmha_fwd_traits_
{
    // type members
    using DataType = ck_tile::remove_cvref_t<DataType_>;
    using FmhaMask = ck_tile::remove_cvref_t<FmhaMask_>;

    // integer members
    static constexpr ck_tile::index_t HDim           = HDim_;
    static constexpr ck_tile::index_t kM0            = kM0_;
    static constexpr ck_tile::index_t kN0            = kN0_;
    static constexpr ck_tile::index_t kK0            = kK0_;
    static constexpr ck_tile::index_t kN1            = kN1_;
    static constexpr ck_tile::index_t kK1            = kK1_;
    static constexpr ck_tile::index_t kK0BlockLength = kK0BlockLength_;

    // enum members
    static constexpr auto FmhaPipelineEnum = FmhaPipelineEnum_;
    static constexpr auto BiasEnum         = BiasEnum_;

    // boolean switches
    static constexpr bool kIsGroupMode       = kIsGroupMode_;
    static constexpr bool kIsVLayoutRowMajor = kIsVLayoutRowMajor_;
    static constexpr bool kStoreLse          = kStoreLse_;
    static constexpr bool kHasDropout        = kHasDropout_;
    static constexpr bool kDoFp8StaticQuant  = kDoFp8StaticQuant_;
    static constexpr bool kPadS              = kPadS_;
    static constexpr bool kPadSK             = kPadSK_;
    static constexpr bool kPadD              = kPadD_;
    static constexpr bool kPadDv             = kPadDv_;
};
// Entry point for the forward kernel, selected by a traits type.
// Declaration only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
template <typename Traits_>
float fmha_fwd_(const ck_tile::stream_config&, fmha_fwd_args);
// Compile-time description of one split-KV forward kernel variant. Each
// template argument is re-exported as a member of the same name without the
// trailing underscore; type arguments are stripped of cv/ref qualifiers.
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::index_t kM0_,
          ck_tile::index_t kN0_,
          ck_tile::index_t kK0_,
          ck_tile::index_t kN1_,
          ck_tile::index_t kK1_,
          ck_tile::index_t kK0BlockLength_,
          bool kIsVLayoutRowMajor_,
          ck_tile::BlockFmhaPipelineEnum FmhaPipelineEnum_,
          typename FmhaMask_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kStoreLse_,
          bool kDoFp8StaticQuant_,
          bool kIsPagedKV_,
          bool kPadS_,
          bool kPadSK_,
          bool kPadD_,
          bool kPadDv_>
struct fmha_fwd_splitkv_traits_
{
    // type members
    using DataType = ck_tile::remove_cvref_t<DataType_>;
    using FmhaMask = ck_tile::remove_cvref_t<FmhaMask_>;

    // integer members
    static constexpr ck_tile::index_t HDim           = HDim_;
    static constexpr ck_tile::index_t kM0            = kM0_;
    static constexpr ck_tile::index_t kN0            = kN0_;
    static constexpr ck_tile::index_t kK0            = kK0_;
    static constexpr ck_tile::index_t kN1            = kN1_;
    static constexpr ck_tile::index_t kK1            = kK1_;
    static constexpr ck_tile::index_t kK0BlockLength = kK0BlockLength_;

    // enum members
    static constexpr auto FmhaPipelineEnum = FmhaPipelineEnum_;
    static constexpr auto BiasEnum         = BiasEnum_;

    // boolean switches
    static constexpr bool kIsGroupMode       = kIsGroupMode_;
    static constexpr bool kIsVLayoutRowMajor = kIsVLayoutRowMajor_;
    static constexpr bool kStoreLse          = kStoreLse_;
    static constexpr bool kDoFp8StaticQuant  = kDoFp8StaticQuant_;
    static constexpr bool kIsPagedKV         = kIsPagedKV_;
    static constexpr bool kPadS              = kPadS_;
    static constexpr bool kPadSK             = kPadSK_;
    static constexpr bool kPadD              = kPadD_;
    static constexpr bool kPadDv             = kPadDv_;
};
// Entry points for the split-KV forward kernel, selected by a traits type.
// Declarations only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
template <typename Traits_>
void fmha_fwd_splitkv_oneshot_(const ck_tile::stream_config&, fmha_fwd_splitkv_args);
// Returns a string name for the kernel instance matching Traits_.
template <typename Traits_>
std::string fmha_fwd_splitkv_get_name_();
// Compile-time description of one split-KV combine kernel variant; every
// template argument is re-exported as a member named without the trailing
// underscore.
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::index_t kN1_,
          bool kStoreLse_,
          bool kDoFp8StaticQuant_,
          bool kPadS_,
          bool kPadDv_>
struct fmha_fwd_splitkv_combine_traits_
{
    using DataType = ck_tile::remove_cvref_t<DataType_>;

    static constexpr ck_tile::index_t HDim = HDim_;
    static constexpr ck_tile::index_t kN1  = kN1_;

    static constexpr bool kIsGroupMode      = kIsGroupMode_;
    static constexpr bool kStoreLse         = kStoreLse_;
    static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_;
    static constexpr bool kPadS             = kPadS_;
    static constexpr bool kPadDv            = kPadDv_;
};
// Entry points for the split-KV combine kernel, selected by a traits type.
// Declarations only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
template <typename Traits_>
void fmha_fwd_splitkv_combine_oneshot_(const ck_tile::stream_config&, fmha_fwd_splitkv_args);
// Returns a string name for the kernel instance matching Traits_.
template <typename Traits_>
std::string fmha_fwd_splitkv_combine_get_name_();
// this is used to pattern-match the internal kernel implementation, not to instantiate a kernel
// Compile-time description of one append-KV kernel variant; every template
// argument is re-exported as a member named without the trailing underscore.
template <ck_tile::index_t HDim_,
          typename DataType_,
          ck_tile::index_t kTileSizeS_,
          ck_tile::index_t kTileSizeSk_,
          ck_tile::index_t kTileSizeD_,
          ck_tile::index_t kTileSizeDv_,
          bool kIsVLayoutRowMajor_,
          bool kPadS_,
          bool kPadSk_,
          bool kPadD_,
          bool kPadDv_,
          ck_tile::RotaryEmbeddingEnum RotaryEnum_,
          bool kIsPagedKV_>
struct fmha_fwd_appendkv_traits_
{
    using DataType = ck_tile::remove_cvref_t<DataType_>;

    // integer members
    static constexpr ck_tile::index_t HDim        = HDim_;
    static constexpr ck_tile::index_t kTileSizeS  = kTileSizeS_;
    static constexpr ck_tile::index_t kTileSizeSk = kTileSizeSk_;
    static constexpr ck_tile::index_t kTileSizeD  = kTileSizeD_;
    static constexpr ck_tile::index_t kTileSizeDv = kTileSizeDv_;

    // enum member
    static constexpr auto RotaryEnum = RotaryEnum_;

    // boolean switches
    static constexpr bool kIsVLayoutRowMajor = kIsVLayoutRowMajor_;
    static constexpr bool kIsPagedKV         = kIsPagedKV_;
    static constexpr bool kPadS              = kPadS_;
    static constexpr bool kPadSk             = kPadSk_;
    static constexpr bool kPadD              = kPadD_;
    static constexpr bool kPadDv             = kPadDv_;
};
// Entry point for the append-KV kernel, selected by a traits type.
// Declaration only; no definition is visible in this header section
// (presumably supplied by generated instance files — TODO confirm).
template <typename Traits_>
float fmha_fwd_appendkv_(const ck_tile::stream_config&, fmha_fwd_appendkv_args);
// This is the public API, will be generated by script
//
// Runtime (non-template) kernel selector passed by value to fmha_fwd().
// Member order is part of the contract: callers shown in this diff build the
// struct with positional aggregate initialization.
// NOTE(review): member meanings below are inferred from their names only --
// confirm against the dispatch code that consumes this struct.
struct fmha_fwd_traits
{
int hdim_q; // presumably the Q/K head dimension -- confirm
int hdim_v; // presumably the V head dimension -- confirm
std::string data_type; // data type selector as a string (accepted spellings not visible here)
bool is_group_mode;
bool is_v_rowmajor;
mask_enum mask_type;
bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
bool has_lse;
bool has_dropout;
bool do_fp8_static_quant;
// TODO: padding check is inside this api
};
// Public forward-attention entry point: takes the runtime traits, the argument
// pack and a stream config. Returns a float (presumably the measured kernel
// time -- confirm against the definition, which is not part of this header).
float fmha_fwd(fmha_fwd_traits, fmha_fwd_args, const ck_tile::stream_config&);
// Runtime (non-template) kernel selector passed by value to fmha_fwd_splitkv().
// Same members as fmha_fwd_traits except that there is no has_dropout flag.
// NOTE(review): member meanings are inferred from their names only -- confirm
// against the dispatch code that consumes this struct.
struct fmha_fwd_splitkv_traits
{
int hdim_q; // presumably the Q/K head dimension -- confirm
int hdim_v; // presumably the V head dimension -- confirm
std::string data_type; // data type selector as a string (accepted spellings not visible here)
bool is_group_mode;
bool is_v_rowmajor;
mask_enum mask_type;
bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
bool has_lse;
bool do_fp8_static_quant;
// TODO: padding check is inside this api
};
// Public split-KV forward entry point: takes the runtime traits, the argument
// pack and a stream config. Returns a float (presumably the measured kernel
// time -- confirm against the definition, which is not part of this header).
float fmha_fwd_splitkv(fmha_fwd_splitkv_traits,
fmha_fwd_splitkv_args,
const ck_tile::stream_config&);
// Runtime (non-template) kernel selector passed by value to fmha_fwd_appendkv().
// NOTE(review): member meanings are inferred from their names only -- confirm
// against the dispatch code that consumes this struct.
struct fmha_fwd_appendkv_traits
{
int hdim_q; // presumably the Q/K head dimension -- confirm
int hdim_v; // presumably the V head dimension -- confirm
std::string data_type; // data type selector as a string (accepted spellings not visible here)
bool is_v_rowmajor;
rope_enum rope_type; // rotary-embedding variant, see rope_enum
};
// Public append-KV entry point: takes the runtime traits, the argument pack and
// a stream config. Returns a float (presumably the measured kernel time --
// confirm against the definition, which is not part of this header).
float fmha_fwd_appendkv(fmha_fwd_appendkv_traits,
fmha_fwd_appendkv_args,
const ck_tile::stream_config&);

View File

@ -1,157 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <ostream>
#include <stdexcept>
#include <string>
#include <ck_tile/core.hpp>
#include <ck_tile/ops/fmha.hpp>
// keep this in sync with ck_tile::GenericAttentionMaskEnum
//
// Kind of attention mask. The numeric values matter: they are parsed from
// integers by mask_info::decode and cast to ck_tile::index_t by callers.
enum class mask_enum
{
no_mask = 0,
mask_top_left, // = 1, causal / windowed mask anchored at the top-left corner
mask_bottom_right, // = 2, causal / windowed mask anchored at the bottom-right corner
window_generic, // = 3, generic mask described directly by its y/x coordinates
};
struct mask_info
{
mask_enum type;
ck_tile::index_t y, x;
ck_tile::index_t left, right; // FA style SWA left/right
void serialize(std::ostream& os) const
{
if(type == mask_enum::no_mask)
os << "n";
else if(type == mask_enum::mask_top_left)
os << "t(" << left << ":" << right << ")";
else if(type == mask_enum::mask_bottom_right)
os << "b(" << left << ":" << right << ")";
else
{
os << "g(" << y << ":" << x << ")";
}
}
static mask_info decode(std::string str, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k)
{
ck_tile::index_t x_total = seqlen_k;
ck_tile::index_t y_total = seqlen_q;
mask_info tmp;
auto found_0 = str.find(':');
if(found_0 != std::string::npos)
{
std::string t = str.substr(0, found_0);
std::string v = str.substr(found_0 + 1);
if(t == "xt" || t == "xb")
{
// xformer style sliding window attn from top-left
ck_tile::index_t window_size = atoi(v.c_str());
ck_tile::index_t left_size = -1;
ck_tile::index_t right_size = 0;
if(window_size > 0)
{
left_size = window_size / 2;
right_size = window_size - 1 - left_size;
}
auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
left_size, right_size, y_total, x_total, t == "xt");
tmp.type = t == "xt" ? mask_enum::mask_top_left : mask_enum::mask_bottom_right;
tmp.y = r.at(ck_tile::number<0>{});
tmp.x = r.at(ck_tile::number<1>{});
tmp.left = left_size;
tmp.right = right_size;
}
else
{
auto found_1 = v.find(",");
if(found_1 == std::string::npos)
{
printf("not supported value %s, %s\n", v.c_str(), str.c_str());
assert(0);
}
tmp.type = mask_enum::window_generic;
ck_tile::index_t v0 = atoi(v.substr(0, found_1).c_str());
ck_tile::index_t v1 = atoi(v.substr(found_1 + 1).c_str());
// TODO: some validation
if(t == "t")
{
tmp.type = mask_enum::mask_top_left;
auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
v0, v1, y_total, x_total, true);
tmp.y = r.at(ck_tile::number<0>{});
tmp.x = r.at(ck_tile::number<1>{});
tmp.left = v0;
tmp.right = v1;
}
else if(t == "b")
{
tmp.type = mask_enum::mask_bottom_right;
auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
v0, v1, y_total, x_total, false);
tmp.y = r.at(ck_tile::number<0>{});
tmp.x = r.at(ck_tile::number<1>{});
tmp.left = v0;
tmp.right = v1;
}
else if(t == "g")
{
tmp.y = v0;
tmp.x = v1;
tmp.left = v0; // TODO: don't use this?
tmp.right = v1;
}
else
{
printf("not supported type %s, %s\n", t.c_str(), str.c_str());
assert(0);
}
}
}
else
{
auto set_causal_top_left = [&]() {
tmp.type = mask_enum::mask_top_left;
tmp.y = seqlen_q;
tmp.x = 1;
tmp.left = -1;
tmp.right = 0;
};
auto set_causal_bottom_right = [&]() {
tmp.type = mask_enum::mask_bottom_right;
tmp.y = seqlen_q;
tmp.x = seqlen_k - seqlen_q + 1;
tmp.left = -1;
tmp.right = 0;
};
if(str == "t")
set_causal_top_left();
else if(str == "b")
set_causal_bottom_right();
else
{
tmp.type = static_cast<mask_enum>(atoi(str.c_str()));
if(tmp.type == mask_enum::mask_top_left)
{
set_causal_top_left();
}
else if(tmp.type == mask_enum::mask_bottom_right)
{
set_causal_bottom_right();
}
}
}
return tmp;
}
friend std::ostream& operator<<(std::ostream& os, const mask_info& mi)
{
mi.serialize(os);
return os;
}
};

View File

@ -22,6 +22,7 @@ fmha_fwd_traits get_ck_fmha_fwd_traits(const mask_info &mask,
dtype,
false, // is_group_mode
true, // is_v_rowmajor
false, // has_logits_soft_cap
mask.type,
enable_bias ? bias_enum::elementwise_bias : bias_enum::no_bias,
has_lse,
@ -85,6 +86,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse,
ck_tile::index_t stride_attn_bias = 0;
ck_tile::index_t batch_stride_bias = 0;
ck_tile::index_t nhead_stride_bias = 0;
if (attn_bias_.has_value()) {
auto a_b = attn_bias_.value();
CHECK_DEVICE(a_b);
@ -94,7 +96,6 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse,
nhead_stride_bias = a_b.stride(1);
batch_stride_bias = a_b.stride(0);
}
return fmha_fwd_args{q.data_ptr(),
k.data_ptr(),
v.data_ptr(),
@ -116,6 +117,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse,
softmax_scale, // scale_s
1, // scale_p
1, // scale_o
0.0f, // logits_soft_cap
stride_q,
stride_k,
stride_v,
@ -139,6 +141,7 @@ fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse,
mask.left,
mask.right,
static_cast<ck_tile::index_t>(mask.type),
-1, // min_seqlen_q
p_dropout,
has_dropout_randval,
drop_seed_offset};

View File

@ -20,6 +20,7 @@ fmha_fwd_traits get_ck_fmha_varlen_fwd_traits(const mask_info &mask,
dtype,
true, // is_group_mode
true, // is_v_rowmajor
false, // has_logits_soft_cap
mask.type,
enable_bias ? bias_enum::elementwise_bias : bias_enum::no_bias,
has_lse,
@ -117,6 +118,7 @@ fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse,
softmax_scale, // scale_s
1, // scale_p
1, // scale_o
0.0f, // logits_soft_cap
stride_q,
stride_k,
stride_v,
@ -140,6 +142,7 @@ fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse,
mask.left,
mask.right,
static_cast<ck_tile::index_t>(mask.type),
-1, // min_seqlen_q
p_dropout,
has_dropout_randval,
drop_seed_offset};

View File

@ -1,84 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ck_tile/core.hpp>
#include <ck_tile/host/host_tensor.hpp>
#include <cassert>
#include <cmath>
#include <functional>
#include <iterator>
#include <optional>
#include <random>
#include <tuple>
// keep sync with RotaryEmbeddingEnum
//
// Rotary position embedding variant; used as the type of
// fmha_fwd_appendkv_traits::rope_type. The numeric values are explicit so they
// can be matched against the enum named above.
enum class rope_enum
{
none = 0,
interleaved = 1,
half_rotated = 2,
};
// Build random rotary-embedding lookup tables on the host.
//
// One angle is drawn per element of a (seqlen * 2) x (rotary_dim / 2) table as
// a uniform float sample from std::uniform_real_distribution(0, 1) scaled by
// 2*pi. The function returns the element-wise cosine and sine of those angles,
// converted to DataType, as a (cos, sin) tuple.
//
// seqlen:     sequence length; the tables get twice as many rows
// rotary_dim: rotary dimension; the tables get rotary_dim / 2 columns.
//             When <= 0, two 1x1 placeholder tensors are returned instead.
// seed:       optional RNG seed; without it the engine is seeded from
//             std::random_device, so results are not reproducible.
template <typename DataType>
std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
generate_rotary_cos_sin(ck_tile::index_t seqlen,
                        ck_tile::index_t rotary_dim,
                        std::optional<unsigned> seed = std::nullopt)
{
    // return dummy tensors if we won't apply RoPE at all
    if(rotary_dim <= 0)
    {
        ck_tile::HostTensor<DataType> dummy({1, 1});
        return std::make_tuple(dummy, dummy);
    }

    // 2*pi spelled out as a double: M_PI is a POSIX extension that <cmath> is
    // not required to provide. Same value as the former `2 * M_PI`.
    constexpr double two_pi = 2.0 * 3.14159265358979323846;

    std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());
    std::uniform_real_distribution<float> generator(0.0f, 1.0f);

    const ck_tile::index_t num_rows = seqlen * 2;
    const ck_tile::index_t num_cols = rotary_dim / 2;

    using std::begin, std::end;

    ck_tile::HostTensor<float> angle({num_rows, num_cols});
    std::generate(begin(angle), end(angle), [&] { return generator(random_engine) * two_pi; });

    // locals are named *_table so they do not shadow the cos/sin math functions
    ck_tile::HostTensor<DataType> cos_table({num_rows, num_cols});
    std::transform(begin(angle), end(angle), begin(cos_table), [](float origin_value) {
        return ck_tile::type_convert<DataType>(std::cos(origin_value));
    });

    ck_tile::HostTensor<DataType> sin_table({num_rows, num_cols});
    std::transform(begin(angle), end(angle), begin(sin_table), [](float origin_value) {
        return ck_tile::type_convert<DataType>(std::sin(origin_value));
    });

    return std::make_tuple(cos_table, sin_table);
}
// Copy a window of rows out of a pair of 2-D rotary tables.
//
// Returns (cos_slice, sin_slice), each of shape seqlen x cos.get_length(1),
// holding rows [seqlen_offset, seqlen_offset + seqlen) of the inputs.
// Preconditions are checked with assert only: both inputs are 2-D, have equal
// shapes, and contain at least seqlen_offset + seqlen rows.
template <typename DataType>
std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
slice_rotary_cos_sin(const ck_tile::HostTensor<DataType>& cos,
                     const ck_tile::HostTensor<DataType>& sin,
                     ck_tile::index_t seqlen_offset,
                     ck_tile::index_t seqlen)
{
    assert(cos.get_num_of_dimension() == 2 && sin.get_num_of_dimension() == 2);
    assert(cos.get_length(0) == sin.get_length(0) && cos.get_length(1) == sin.get_length(1));
    assert(static_cast<std::size_t>(seqlen_offset + seqlen) <= cos.get_length(0));

    const ck_tile::index_t row_count = seqlen;
    const ck_tile::index_t col_count = cos.get_length(1);

    // Both tables are sliced the same way, so share one row-window copier.
    auto copy_row_window = [&](const ck_tile::HostTensor<DataType>& table) {
        ck_tile::HostTensor<DataType> window({row_count, col_count});
        window.ForEach(
            [&](auto& self, auto idx) { self(idx) = table(idx[0] + seqlen_offset, idx[1]); });
        return window;
    };

    auto cos_window = copy_row_window(cos);
    auto sin_window = copy_row_window(sin);
    return std::make_tuple(cos_window, sin_window);
}

View File

@ -5,6 +5,12 @@ import os
import sys
# Optional allow-list of model names, read once at import time from the
# comma-separated TORCHBENCH_ONLY_MODELS environment variable. Entries are
# whitespace-trimmed and blanks are dropped; an empty list means run everything.
TORCHBENCH_ONLY_MODELS = [
    name
    for name in map(str.strip, os.getenv("TORCHBENCH_ONLY_MODELS", "").split(","))
    if name
]
# Note - hf and timm have their own version of this, torchbench does not
# TODO(voz): Someday, consolidate all the files into one runner instead of a shim like this...
def model_names(filename: str) -> set[str]:
@ -17,6 +23,8 @@ def model_names(filename: str) -> set[str]:
if len(line_parts) == 1:
line_parts = line.split(",")
model_name = line_parts[0]
if TORCHBENCH_ONLY_MODELS and model_name not in TORCHBENCH_ONLY_MODELS:
continue
names.add(model_name)
return names

View File

@ -9,6 +9,7 @@ import copy
import csv
import dataclasses
import functools
import gc
import importlib
import itertools
import json
@ -2387,6 +2388,7 @@ class BenchmarkRunner:
)
def warmup(fn, model, example_inputs, mode, niters=10):
gc.collect()
peak_mem = 0
start_stats = get_dynamo_stats()
try:
@ -2548,6 +2550,7 @@ class BenchmarkRunner:
return experiment(*self.maybe_cast(model, example_inputs))
def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()
peak_mem = 0
start_stats = get_dynamo_stats()
try:

View File

@ -106,6 +106,11 @@ finally:
# on A100 GPUs - 40 GB.
BATCH_SIZE_KNOWN_MODELS = {}
# Optional allow-list of model names, read once at import time from the
# comma-separated TORCHBENCH_ONLY_MODELS environment variable. Entries are
# whitespace-trimmed and blanks are dropped; an empty list means run everything.
TORCHBENCH_ONLY_MODELS = [
    entry
    for entry in (raw.strip() for raw in os.getenv("TORCHBENCH_ONLY_MODELS", "").split(","))
    if entry
]
# TODO(sdym): use batch-size-file parameter of common.main, like torchbench.py
# Get the list of models and their batch sizes
@ -116,6 +121,8 @@ with open(MODELS_FILENAME) as fh:
lines = [line.rstrip() for line in lines]
for line in lines:
model_name, batch_size = line.split(",")
if TORCHBENCH_ONLY_MODELS and model_name not in TORCHBENCH_ONLY_MODELS:
continue
batch_size = int(batch_size)
BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size
assert len(BATCH_SIZE_KNOWN_MODELS)

Some files were not shown because too many files have changed in this diff Show More