Compare commits

1 Commit

Author SHA1 Message Date
84dea7642c [wip][flex_attention]: adding a heuristic for USE_TMA
Summary:
Adds a USE_TMA heuristic that captures almost all of the available perf.

The best perf improvements are on document_mask with large shapes, which are very important for training.

Mean: +15.8% improvement overall

Only a single shape (see the raw data below) sees a perf degradation of >5%.

Document Mask: 37.8% average improvement
* Strong performance on variable-length sequences
* One case of regression on a short sequence (-5.72%)

Sliding Window: 58.7% average improvement
* 30-97% improvements

Head Bias: 2.5% average improvement
* Consistent 2-3% gains across all sequence lengths

Others:
* Softcap: 0.15% average improvement (minimal impact)
* Prefix_LM: -0.4% average change (slight degradation)
* Full Attention (noop): 1.6% average improvement
* Causal: 0.6% average improvement

Note: the sliding window uplift is due to the test-side change.
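
For illustration only, a shape-based heuristic of this kind might look like the sketch below; the function name, thresholds, and capability check are assumptions for this note, not the actual implementation in the diff:

```
# Hypothetical sketch of a shape-based USE_TMA heuristic (all names and
# thresholds here are illustrative assumptions, not the code in this diff).
def should_use_tma(seq_len_q: int, seq_len_kv: int, head_dim: int,
                   device_supports_tma: bool) -> bool:
    # TMA requires hardware with a TMA engine, so bail out everywhere else.
    if not device_supports_tma:
        return False
    # Prefer TMA on large tiles, where the asynchronous bulk copies can
    # amortize their setup cost; small shapes keep the regular load path.
    return seq_len_q >= 256 and seq_len_kv >= 256 and head_dim >= 64
```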

Test Plan:
Ran the following across the different variations:
```
buck2 run @//mode/opt -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 //pytorch/tritonbench:run -- --op flex_attention --metrics tflops,latency
```

Here is the uplift plot:
 {F1982793932}

Raw Data:
```
Shape	Mask Type	Post-Changes	Pre-Changes	Percent Change
(8, 16, 128, 16, 128, 128)	noop	12.2404	11.672	4.869773818
(8, 16, 128, 16, 128, 128)	causal	19.6761	18.9992	3.562781591
(8, 16, 128, 16, 128, 128)	rel	12.64	12.1127	4.353282092
(8, 16, 128, 16, 128, 128)	head_bias	12.3075	12.0091	2.484782373
(8, 16, 128, 16, 128, 128)	alibi	19.0314	18.2775	4.124743537
(8, 16, 128, 16, 128, 128)	sliding_window	19.1613	18.5902	3.072048714
(1, 16, 1024, 16, 1024, 128)	document_mask	28.8403	30.5914	-5.72415777
(8, 16, 128, 16, 128, 128)	prefix_lm	18.3073	18.8398	-2.826463126
(8, 16, 128, 16, 128, 128)	softcap	18.2974	18.3773	-0.4347755111
(8, 16, 256, 16, 256, 128)	noop	48.6082	47.8148	1.659318872
(8, 16, 256, 16, 256, 128)	causal	57.6149	56.2675	2.394632781
(8, 16, 256, 16, 256, 128)	rel	47.4945	47.2779	0.4581421764
(8, 16, 256, 16, 256, 128)	head_bias	48.3813	47.014	2.908282639
(8, 16, 256, 16, 256, 128)	alibi	56.9332	54.2731	4.901323123
(8, 16, 256, 16, 256, 128)	sliding_window	56.4246	52.6885	7.090921169
(1, 16, 2048, 16, 2048, 128)	document_mask	76.3089	74.3736	2.602133015
(8, 16, 256, 16, 256, 128)	prefix_lm	58.2794	55.5256	4.95951417
(8, 16, 256, 16, 256, 128)	softcap	55.2826	53.8968	2.571210165
(8, 16, 512, 16, 512, 128)	noop	183.26	187.142	-2.074360646
(8, 16, 512, 16, 512, 128)	causal	174.899	172.658	1.297941596
(8, 16, 512, 16, 512, 128)	rel	189.178	185.085	2.211416376
(8, 16, 512, 16, 512, 128)	head_bias	187.142	181.349	3.194393131
(8, 16, 512, 16, 512, 128)	alibi	164.641	164.48	0.09788424125
(8, 16, 512, 16, 512, 128)	sliding_window	169.957	122.94	38.24385879
(1, 16, 4096, 16, 4096, 128)	document_mask	146.112	107.093	36.43468761
(8, 16, 512, 16, 512, 128)	prefix_lm	175.354	175.354	0
(8, 16, 512, 16, 512, 128)	softcap	162.262	162.183	0.04871040738
(8, 16, 1024, 16, 1024, 128)	noop	451.681	446.073	1.257193329
(8, 16, 1024, 16, 1024, 128)	causal	252.694	252.641	0.02097838435
(8, 16, 1024, 16, 1024, 128)	rel	446.166	443.139	0.6830813808
(8, 16, 1024, 16, 1024, 128)	head_bias	454.921	446.258	1.941253714
(8, 16, 1024, 16, 1024, 128)	alibi	236.666	236.712	-0.01943289736
(8, 16, 1024, 16, 1024, 128)	sliding_window	249.215	167.812	48.50844993
(1, 16, 8192, 16, 8192, 128)	document_mask	210.169	149.254	40.81297654
(8, 16, 1024, 16, 1024, 128)	prefix_lm	254.604	254.284	0.125843545
(8, 16, 1024, 16, 1024, 128)	softcap	229.892	229.848	0.01914308587
(8, 16, 2048, 16, 2048, 128)	noop	608.016	597.526	1.755572142
(8, 16, 2048, 16, 2048, 128)	causal	321.017	321.693	-0.2101382374
(8, 16, 2048, 16, 2048, 128)	rel	587.111	586.792	0.054363386
(8, 16, 2048, 16, 2048, 128)	head_bias	598.687	587.111	1.971688488
(8, 16, 2048, 16, 2048, 128)	alibi	295.773	295.773	0
(8, 16, 2048, 16, 2048, 128)	sliding_window	318.097	183.105	73.72381967
(1, 16, 16384, 16, 16384, 128)	document_mask	285.551	195.233	46.26164634
(8, 16, 2048, 16, 2048, 128)	prefix_lm	322.304	321.806	0.1547516205
(8, 16, 2048, 16, 2048, 128)	softcap	286.219	285.88	0.1185812229
(8, 16, 4096, 16, 4096, 128)	noop	623.445	625.253	-0.2891629468
(8, 16, 4096, 16, 4096, 128)	causal	369.057	369.057	0
(8, 16, 4096, 16, 4096, 128)	rel	626.718	630.683	-0.6286835066
(8, 16, 4096, 16, 4096, 128)	head_bias	619.192	617.717	0.2387824845
(8, 16, 4096, 16, 4096, 128)	alibi	336.842	336.849	-0.002078082464
(8, 16, 4096, 16, 4096, 128)	sliding_window	367.357	192.262	91.07103848
(1, 16, 32768, 16, 32768, 128)	document_mask	352.813	236.054	49.46283478
(8, 16, 4096, 16, 4096, 128)	prefix_lm	369.309	369.309	0
(8, 16, 4096, 16, 4096, 128)	softcap	324.088	324.112	-0.007404847707
(8, 16, 8192, 16, 8192, 128)	noop	668.267	644.41	3.702146149
(8, 16, 8192, 16, 8192, 128)	causal	393.945	397.241	-0.8297230145
(8, 16, 8192, 16, 8192, 128)	rel	663.352	649.078	2.199119366
(8, 16, 8192, 16, 8192, 128)	head_bias	653.654	654.78	-0.1719661566
(8, 16, 8192, 16, 8192, 128)	alibi	363.893	360.898	0.8298743689
(8, 16, 8192, 16, 8192, 128)	sliding_window	385.163	197.756	94.76678331
(1, 16, 65536, 16, 65536, 128)	document_mask	405.668	264.619	53.3026729
(8, 16, 8192, 16, 8192, 128)	prefix_lm	393.872	397.458	-0.9022336951
(8, 16, 8192, 16, 8192, 128)	softcap	345.572	346.346	-0.223475946
(8, 16, 16384, 16, 16384, 128)	noop	681.404	642.487	6.057243182
(8, 16, 16384, 16, 16384, 128)	causal	416.768	413.811	0.7145774279
(8, 16, 16384, 16, 16384, 128)	rel	676.936	660.778	2.445299329
(8, 16, 16384, 16, 16384, 128)	head_bias	676.907	657.428	2.962910007
(8, 16, 16384, 16, 16384, 128)	alibi	382.453	373.863	2.29763309
(8, 16, 16384, 16, 16384, 128)	sliding_window	395.434	200.448	97.27510377
(1, 16, 131072, 16, 131072, 128)	document_mask	439.601	282.557	55.57958217
(8, 16, 16384, 16, 16384, 128)	prefix_lm	416.927	412.476	1.079093087
(8, 16, 16384, 16, 16384, 128)	softcap	363.194	359.052	1.153593351
```
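
As a sanity check on the table, Percent Change is the relative delta between the two metric columns (presumably the tritonbench TFLOPS numbers); a minimal reproduction of the first row:

```
# Recompute "Percent Change" for the first row of the raw data above.
post, pre = 12.2404, 11.672
print((post - pre) / pre * 100)  # ~4.8698, matching 4.869773818
```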

Differential Revision: D84373821
2025-10-17 15:31:36 -07:00
546 changed files with 3297 additions and 6408 deletions

View File

@@ -83,6 +83,10 @@ function build_cpython {
py_suffix=${py_ver::-1}
py_folder=$py_suffix
fi
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0rc2"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix

View File

@@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
logger.info("Successfully cloned %s", target)
return r, commit
except GitCommandError:
logger.exception("Git operation failed")
except GitCommandError as e:
logger.error("Git operation failed: %s", e)
raise
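
For context on this change: `logger.exception` always logs at ERROR level and appends the full traceback of the active exception, while the new form logs only the formatted message. A minimal sketch of the difference:

```
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    raise RuntimeError("clone failed")
except RuntimeError as e:
    # Old form: message plus the full traceback of the active exception.
    logger.exception("Git operation failed")
    # New form: only the formatted message, no traceback.
    logger.error("Git operation failed: %s", e)
```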

View File

@@ -7,12 +7,14 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# SIM104 is already covered by pyupgrade ruff
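
The newly ignored E721 check flags direct type comparisons. A minimal example of code it would report:

```
# E721: do not compare types; for exact checks use `is`, otherwise isinstance().
x = 1.0

if type(x) == float:      # flagged by E721
    pass

if isinstance(x, float):  # preferred spelling
    pass
```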

View File

@@ -1 +1 @@
69bbe7363897764f9e758d851cd0340147d27f94
1b013f5b5a87a1882eb143c26d79d091150d6a37

.github/labeler.yml vendored (29 changes)
View File

@@ -133,32 +133,3 @@
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
"ciflow/b200":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/h100":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/rocm":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

View File

@@ -33,7 +33,6 @@ ciflow_push_tags:
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@@ -79,21 +79,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "

View File

@@ -26,8 +26,9 @@ name: !{{ build_environment }}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}
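
The fix above changes the expression's grouping: the old version appended '.0' for 3.14 (producing '3.14.0', presumably not yet resolvable before the final release), while the new one pins the release candidate explicitly. A quick check of both expressions in plain Python, standing in for the template engine:

```
# Plain-Python stand-ins for the old and new template expressions.
def old_expr(py_ver: str) -> str:
    return py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0')

def new_expr(py_ver: str) -> str:
    return (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2'

for v in ("3.10", "3.13t", "3.14", "3.14t"):
    print(v, old_expr(v), new_expr(v))
# 3.10  -> 3.10.4  3.10.4
# 3.13t -> 3.13.4  3.13.4
# 3.14  -> 3.14.0  3.14.0-rc.2
# 3.14t -> 3.14.0  3.14.0-rc.2
```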

View File

@@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -473,7 +473,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -722,7 +722,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +971,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1220,7 +1220,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1469,7 +1469,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1718,7 +1718,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -925,7 +925,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@@ -1591,7 +1591,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@@ -2257,7 +2257,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@@ -2923,7 +2923,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@@ -3589,7 +3589,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@@ -4255,7 +4255,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing

View File

@@ -63,6 +63,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false

View File

@@ -59,6 +59,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
@@ -168,6 +169,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
@@ -277,6 +279,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
@@ -386,6 +389,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
@@ -495,6 +499,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
@@ -604,8 +609,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
@@ -713,8 +719,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@@ -1,63 +0,0 @@
name: rocm-navi31
on:
push:
tags:
- ciflow/rocm-navi31/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 */2 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
secrets: inherit

View File

@@ -59,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

View File

@@ -190,40 +190,6 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml

.gitignore vendored (1 change)
View File

@@ -374,7 +374,6 @@ third_party/ruy/
third_party/glog/
# Virtualenv
.venv/
venv/
# Log files

View File

@@ -201,17 +201,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99
# FlexAttention
/torch/nn/attention/flex_attention.py @drisspg
/torch/_higher_order_ops/flex_attention.py @drisspg
/torch/_inductor/kernel/flex/ @drisspg
/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg
# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

View File

@@ -313,14 +313,13 @@ IF(USE_FBGEMM_GENAI)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
endif()
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(

View File

@ -183,6 +183,11 @@ struct CUDACachingHostAllocatorImpl
return true;
}
bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}
EventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
static auto* event_pool = new EventPool();

View File

@ -177,6 +177,7 @@ inline void segmented_sort_pairs(
}
}
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT>
inline void unique_by_key(
KeysInputIteratorT keys_in, ValuesInputIteratorT values_in,
@ -192,6 +193,7 @@ inline void unique_by_key(
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey,
keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream());
}
#endif
namespace impl {
@ -577,6 +579,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
#endif
}
#if CUB_SUPPORTS_SCAN_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) {
@ -604,6 +607,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT
#endif
}
#endif
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
void unique(InputIteratorT input, OutputIteratorT output,

View File

@ -28,6 +28,22 @@
#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
#endif
// cub support for UniqueByKey is added to cub 1.16 in:
// https://github.com/NVIDIA/cub/pull/405
#if CUB_VERSION >= 101600
#define CUB_SUPPORTS_UNIQUE_BY_KEY() true
#else
#define CUB_SUPPORTS_UNIQUE_BY_KEY() false
#endif
// cub support for scan by key is added to cub 1.15
// in https://github.com/NVIDIA/cub/pull/376
#if CUB_VERSION >= 101500
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
#else
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
#endif
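// Note: CUB_VERSION packs major*100000 + minor*100 + patch, so the 101500 and
// 101600 thresholds used here correspond to cub 1.15.0 and 1.16.0 respectively.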
// cub support for cub::FutureValue is added to cub 1.15 in:
// https://github.com/NVIDIA/cub/pull/305
#if CUB_VERSION >= 101500

View File

@ -160,10 +160,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
DispatchKey::CUDA,
DispatchKey::CPU,
DispatchKey::PrivateUse1,
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
});
inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) {

View File

@ -141,8 +141,6 @@ void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
return;
}
checkTrilTriuMemoryOverlap(result, self);
bool inplace_op = self.is_same(result);
bool inplace_update = false;

View File

@ -1,4 +1,3 @@
#include <ATen/MemoryOverlap.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/LinearAlgebraUtils.h>
@ -55,13 +54,4 @@ static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor
return std::make_tuple(true, tensor);
}
static inline void checkTrilTriuMemoryOverlap(const Tensor& result, const Tensor& self) {
if (result.is_same(self)) {
at::assert_no_internal_overlap(result);
} else {
at::assert_no_internal_overlap(result);
at::assert_no_overlap(result, self);
}
}
} // namespace at::native

View File

@ -120,7 +120,7 @@ static void pow_tensor_scalar_kernel(
} else if (dtype == ScalarType::Half) {
[&]() {
using scalar_t =
decltype(c10::impl::ScalarTypeToCPPType<ScalarType::Half>::t);
c10::impl::ScalarTypeToCPPTypeT<ScalarType::Half>;
const auto exp = exp_scalar.to<scalar_t>();
using Vec = Vectorized<scalar_t>;
cpu_kernel_vec(iter,

View File

@ -2322,23 +2322,12 @@ _scaled_nvfp4_nvfp4(
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
Tensor& out,
const std::optional<Tensor>& global_scale_a = std::nullopt,
const std::optional<Tensor>& global_scale_b = std::nullopt) {
const bool single_scale,
Tensor& out) {
#ifdef USE_ROCM
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
#endif
std::optional<Tensor> alpha = std::nullopt;
// Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise,
// if this is "And" we would silently do nothing in the case where one global scale is
// passed and not the other.
if (global_scale_a.has_value() || global_scale_b.has_value()) {
TORCH_CHECK_VALUE(global_scale_a.has_value(),
"For two-level-scaled NVFP4, global_scale_a must have a value");
TORCH_CHECK_VALUE(global_scale_b.has_value(),
"For two-level-scaled NVFP4, global_scale_b must have a value");
alpha = global_scale_a.value().mul(global_scale_b.value());
}
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
// Restrictions:
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
// Scales must be swizzled
@ -2360,7 +2349,7 @@ _scaled_nvfp4_nvfp4(
auto scaling_choice_a = ScalingType::BlockWise1x16;
auto scaling_choice_b = ScalingType::BlockWise1x16;
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha);
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
@ -2566,10 +2555,9 @@ _scaled_mm_cuda_v2_out(
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out,
scale_a[1], scale_b[1]);
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
} else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) {
return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else {

View File

@ -856,9 +856,13 @@ struct type_specialized_kernel_launcher {
out_calc_t output_offset_calculator,
loader_t loader,
storer_t storer) {
if (ret_t == rt_binary_specializations[arg_index][0] &&
arg0_t == rt_binary_specializations[arg_index][1] &&
arg1_t == rt_binary_specializations[arg_index][2])
constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0];
constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1];
constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2];
if (ret_t == sret_t && arg0_t == sarg0_t && arg1_t == sarg1_t) {
using cret_t = c10::impl::ScalarTypeToCPPTypeT<sret_t>;
using carg0_t = c10::impl::ScalarTypeToCPPTypeT<sarg0_t>;
using carg1_t = c10::impl::ScalarTypeToCPPTypeT<sarg1_t>;
launch_vectorized_templated_kernel<
func_t,
array_t,
@ -866,12 +870,9 @@ struct type_specialized_kernel_launcher {
out_calc_t,
loader_t,
storer_t,
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][0]>::t),
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][1]>::t),
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][2]>::t)>(
cret_t,
carg0_t,
carg1_t>(
numel,
f,
data,
@ -879,6 +880,7 @@ struct type_specialized_kernel_launcher {
output_offset_calculator,
loader,
storer);
}
}
};
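The rewrite above hoists each specialization triple into constexpr ScalarType values so the corresponding C++ types can be named once via ScalarTypeToCPPTypeT. A minimal standalone sketch of that runtime-to-compile-time dispatch pattern (hypothetical helper names; assumes only the c10 header already used here):

```
#include <c10/core/ScalarType.h>

// Run a callable with the C++ type corresponding to a compile-time ScalarType.
template <c10::ScalarType S, typename F>
void with_ctype(F&& f) {
  using T = c10::impl::ScalarTypeToCPPTypeT<S>;  // e.g. at::Half for ScalarType::Half
  f(T{});
}

// Compare the runtime dtype against constexpr candidates, as the launcher does.
void dispatch_example(c10::ScalarType runtime_dtype) {
  if (runtime_dtype == c10::ScalarType::Half) {
    with_ctype<c10::ScalarType::Half>([](auto v) { /* body specialized on decltype(v) */ });
  } else if (runtime_dtype == c10::ScalarType::Float) {
    with_ctype<c10::ScalarType::Float>([](auto v) { /* ... */ });
  }
}
```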

View File

@ -15,7 +15,9 @@
#include <ATen/native/cuda/block_reduce.cuh>
#include <ATen/native/cuda/thread_constants.h>
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -238,6 +240,10 @@ __global__ void renorm_kernel(
} // anonymous namespace
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif
Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
int64_t num_weights, int64_t padding_idx,
@ -300,6 +306,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
if (scale_grad_by_freq) {
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -326,6 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
num_indices
);
});
#else
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
});
#endif
}
return embedding_backward_cuda_kernel(grad, orig_indices,

View File

@ -10,7 +10,9 @@
#include <c10/macros/Macros.h>
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
#include <thrust/iterator/counting_iterator.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -194,9 +196,18 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
partials_per_segment_offset[num_of_segments-1];
}
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
*num_of_segments_ptr = num_of_segments;
}
#endif
} // anon namespace
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
#endif
Tensor embedding_backward_cuda_kernel(
const Tensor &grad,
@ -223,12 +234,20 @@ Tensor embedding_backward_cuda_kernel(
auto segment_offsets = at::empty({numel}, orig_indices.options());
auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
#else
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
cuda::cub::unique_by_key(
sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
segment_offsets.mutable_data_ptr<index_t>(),
num_of_segments_ptr, sorted_indices.numel());
});
#endif
int64_t max_segments = std::min<int64_t>(numel, num_weights);
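For intuition, unique_by_key with a counting iterator reduces to recording the starting offset of each run of equal indices. A host-side analogue in plain C++ (illustrative only, not the CUDA path; sample data, not from this diff):

```
#include <cstdio>
#include <vector>

int main() {
  // sorted indices with repeats, e.g. the embedding rows touched in backward
  std::vector<long> sorted = {2, 5, 5, 5, 7, 7, 8, 9, 9};
  std::vector<long> segment_offsets;  // first position of each run of equal keys
  for (size_t i = 0; i < sorted.size(); ++i)
    if (i == 0 || sorted[i] != sorted[i - 1]) segment_offsets.push_back(i);
  for (long o : segment_offsets) std::printf("%ld ", o);              // 0 1 4 6 7
  std::printf("\nnum_of_segments = %zu\n", segment_offsets.size());   // 5
}
```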

View File

@ -31,10 +31,16 @@
#include <c10/macros/Macros.h>
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif
namespace {
@ -193,6 +199,7 @@ Tensor embedding_bag_backward_cuda_sum_avg(
if (scale_grad_by_freq) {
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -219,6 +226,11 @@ Tensor embedding_bag_backward_cuda_sum_avg(
num_indices
);
});
#else
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
});
#endif
}
return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices,
count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,

View File

@ -0,0 +1,90 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/cuda/SortingCommon.cuh>
#include <ATen/cuda/cub_definitions.cuh>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty_like.h>
#endif
#include <ATen/cuda/ThrustAllocator.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
auto num_indices = count.numel();
// Compute an increasing sequence per unique item in sortedIndices:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 1 2 3 1 2 1 1 2
auto sorted_data = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto count_data = thrust::device_ptr<index_t>(count.mutable_data_ptr<index_t>());
thrust::inclusive_scan_by_key(
policy,
sorted_data,
sorted_data + num_indices,
thrust::make_constant_iterator(1),
count_data
);
// Take the maximum of each count per unique key in reverse:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 3 3 3 2 2 1 2 2
thrust::inclusive_scan_by_key(
policy,
thrust::make_reverse_iterator(sorted_data + num_indices),
thrust::make_reverse_iterator(sorted_data),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::equal_to<index_t>(),
thrust::maximum<index_t>()
);
}
template
void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
template
void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
#endif
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) {
auto stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
const ptrdiff_t numel = sorted_indices.numel();
auto sorted_indices_dev = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto dummy_dev = thrust::device_ptr<index_t>(dummy.mutable_data_ptr<index_t>());
auto ends = thrust::unique_by_key_copy(
policy,
sorted_indices_dev,
sorted_indices_dev + numel,
thrust::make_counting_iterator(0),
dummy_dev,
thrust::device_ptr<index_t>(segment_offsets.mutable_data_ptr<index_t>()));
return thrust::get<0>(ends) - dummy_dev;
}
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int>(const Tensor &sorted_indices, Tensor &segment_offsets);
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int64_t>(const Tensor &sorted_indices, Tensor &segment_offsets);
} // namespace at::native
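The two inclusive_scan_by_key passes above implement a common trick: the forward pass gives each element its running count within its run, and the reverse max-scan broadcasts each run's total back across the run. A host-side analogue in plain C++ (a sketch mirroring the worked example in the comments, not the thrust path):

```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> sorted = {2, 5, 5, 5, 7, 7, 8, 9, 9};
  std::vector<int> count(sorted.size());
  // Forward pass: running count per run -> 1 1 2 3 1 2 1 1 2
  for (size_t i = 0; i < sorted.size(); ++i)
    count[i] = (i > 0 && sorted[i] == sorted[i - 1]) ? count[i - 1] + 1 : 1;
  // Reverse pass: propagate each run's maximum backwards -> 1 3 3 3 2 2 1 2 2
  for (size_t i = sorted.size() - 1; i-- > 0;)
    if (sorted[i] == sorted[i + 1]) count[i] = count[i + 1];
  for (int c : count) std::printf("%d ", c);
  std::printf("\n");
}
```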

View File

@ -146,7 +146,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
int64_t batch_size = target.size(0);
int64_t H = target.size(1);
int64_t W = target.size(2);
int64_t n_classes = grad_input.size(1);
CUDA_KERNEL_LOOP(index, n_threads) {
const int64_t b = index % batch_size;
@ -157,7 +156,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
if (cur_target == ignore_index) {
continue;
}
CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes);
scalar_t value = -(weight != nullptr ? weight[cur_target] : static_cast<scalar_t>(1));
grad_input[b][cur_target][h][w] = value * grad_output[b][h][w];
}

View File

@ -413,12 +413,14 @@ struct ReduceOp {
value = thread_reduce<output_vec_size>(input_slice);
}
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
using out_ptr_vec_t = std::array<out_scalar_t*, output_vec_size>;
using offset_vec_t = std::array<index_t, output_vec_size>;
offset_vec_t base_offsets;
@ -655,8 +657,8 @@ struct ReduceOp {
__syncthreads();
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
// matching Triton, etc.
// TODO(PaulZhang12): AMD and internal
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
// todo for AMD
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
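The two loop headers are the increasing- and decreasing-offset variants of the same shuffle reduction; both sum the warp, but in different pairing orders and hence with different rounding. A minimal CUDA-style sketch (illustrative; assumes a full warp of floats and a power-of-two warpSize):

```
__device__ float warp_sum_increasing(float v) {  // the ROCm/FBCODE ordering above
  for (int offset = 1; offset < warpSize; offset <<= 1)
    v += __shfl_down_sync(0xffffffff, v, offset);
  return v;  // lane 0 holds the total
}

__device__ float warp_sum_decreasing(float v) {  // the default CUDA ordering above
  for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffff, v, offset);
  return v;  // same total, summed in the opposite pairing order
}
```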

View File

@ -19,6 +19,7 @@
namespace at::native {
// TODO: remove this when CUDA <11.6 is no longer supported
void topk_out_with_sort(
const Tensor& self,
int64_t k, int64_t dim, bool largest,
@ -30,12 +31,21 @@ void topk_out_with_sort(
indices.copy_(sorted_indices.narrow(dim, 0, k));
}
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk();
bool should_use_sort(const Tensor& self, int64_t dim) {
#if defined(USE_ROCM)
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
#else
return false;
if (disable_sort_for_topk()) return false;
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
if (self.dim() == 0) return false;
if (self.dtype() == kBool) return false; // Bool is not supported by topk
int64_t slice_size = self.size(dim);
if (slice_size == 0) return false;
int64_t num_slices = self.numel() / slice_size;
return num_slices <= 10 && slice_size >= 100000;
#endif
}

View File

@ -21,6 +21,11 @@ using namespace at::native;
namespace at::native {
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk() {
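// When the CUB scan-by-key multiblock path is available (cub >= 1.15), the
// sort-based fallback is unnecessary, so it is disabled.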
return CUB_SUPPORTS_SCAN_BY_KEY();
}
namespace sbtopk { // single_block_topk
template <typename T>
@ -413,6 +418,10 @@ __global__ void computeBlockwiseWithinKCounts(
}
__syncthreads();
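// When the scan-by-key path is compiled out, the counts computed below are never
// consumed (launch() leaves withinKCounts null), so the kernel can stop here.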
#if !CUB_SUPPORTS_SCAN_BY_KEY()
return;
#endif
Bitwise desired_digit = at::cuda::Bitfield<Bitwise>::getBitfield(desired, current_bit, RADIX_BITS);
// if largest, then only threads that have tidx > desired_digit are active
@ -468,6 +477,7 @@ __global__ void computeBlockwiseWithinKCounts(
}
}
#if CUB_SUPPORTS_SCAN_BY_KEY()
// Assumption: slice_size can not be larger than UINT32_MAX
template <typename Bitwise>
__global__ void computeBlockwiseKthCounts(
@ -599,6 +609,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu
}
}
}
#endif
int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
// occupancy of this kernel is limited by registers per threads
@ -676,12 +687,16 @@ void launch(
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
#if CUB_SUPPORTS_SCAN_BY_KEY()
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream));
auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* kthCounts = reinterpret_cast<uint32_t*>(kthCounts_buffer.get());
#else
uint32_t* withinKCounts = nullptr;
#endif
Bitwise desiredMask = 0;
dim3 grid;
@ -728,6 +743,7 @@ void launch(
}
desired = desired_in;
#if CUB_SUPPORTS_SCAN_BY_KEY()
computeBlockwiseKthCounts<Bitwise><<<std::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824), 256, 0, stream>>>(
desired, counts, num_blocks, blocks_per_slice, kthCounts);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -743,6 +759,28 @@ void launch(
topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread,
blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
#else
// Find topk values based on kth values
{
dim3 grid;
TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk");
int warp_size = at::cuda::warp_size();
dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024));
sbtopk::gatherTopK<T, IndexType, Dim, /* WithKthValues= */true><<<grid, block, 0, stream>>>(
input,
inputSliceSize,
outputSliceSize,
largest,
numInputSlices,
inputWithinSliceStride,
topK,
topKWithinSliceStride,
indices,
indicesWithinSliceStride,
kthValues);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
#endif
}
} // namespace mbtopk
@ -750,6 +788,7 @@ void launch(
bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
if (num_slices > std::numeric_limits<uint32_t>::max() ||
slice_size > std::numeric_limits<uint32_t>::max()) return false;
#if CUB_SUPPORTS_SCAN_BY_KEY()
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/74267
return (num_slices <= 20 && slice_size >= 20000) ||
(num_slices > 20 && num_slices <= 40 && slice_size >= 10000) ||
@ -758,6 +797,12 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
(num_slices >= 200 && num_slices < 800 && slice_size >= 3000) ||
(num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) ||
(num_slices > 4000 && slice_size >= 400);
#else
// This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/71081
return (num_slices <= 400 && slice_size >= 5000) ||
(num_slices > 400 && num_slices < 4000 && slice_size >= 1000) ||
(num_slices >= 4000 && slice_size >= 300);
#endif
}
void launch_gather_topk_kernel(

View File

@ -5,7 +5,6 @@
#include <ATen/Dispatch.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TriangularOpsUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -45,7 +44,7 @@ __global__ void triu_tril_kernel(
const int64_t k,
const int64_t N_padded,
const IndexType last_dim_padded) {
int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread;
int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread;
if (linear_idx >= N_padded) {
return;
}
@ -111,8 +110,6 @@ __global__ void triu_tril_kernel(
template <bool upper>
void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
checkTrilTriuMemoryOverlap(result, self);
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
at::ScalarType::ComplexHalf,
at::ScalarType::Half,

View File

@ -466,7 +466,7 @@ struct ReduceJitOp {
__syncthreads();
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {

View File

@ -487,7 +487,9 @@ std::unique_ptr<fe::graph::Graph> build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_generate_stats(return_softmaxstats)
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -705,7 +707,9 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_generate_stats(return_softmaxstats)
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)

View File

@ -441,7 +441,7 @@ kernel void applySYRK(
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]],
uint warp_id [[simdgroup_index_in_threadgroup]]) {
uint sgitg [[simdgroup_index_in_threadgroup]]) {
const uint tx = tid.x;
const uint ty = tid.y;
const uint simdGroupsPerThreadgroup = (tpg.x * tpg.y + 31) / 32;
@ -474,8 +474,11 @@ kernel void applySYRK(
(actSize_j % 8 == 0) && (actSize_h % 8 == 0) && (actSize_k % 8 == 0);
if (use_simdgroup) {
uint warp_id = sgitg;
simdgroup_matrix<float, 8, 8> negative_identity =
simdgroup_matrix<float, 8, 8>(-1.0);
simdgroup_matrix<float, 8, 8> identity = simdgroup_matrix<float, 8, 8>(1.0);
simdgroup_matrix<float, 8, 8> Prod;
simdgroup_matrix<float, 8, 8> Afrag;
simdgroup_matrix<float, 8, 8> Bfrag;
@ -518,7 +521,8 @@ kernel void applySYRK(
/* transpose = */ upper);
simdgroup_multiply(Prod, Afrag, Bfrag);
simdgroup_multiply_accumulate(Cfrag, Prod, negative_identity, Cfrag);
simdgroup_multiply(Prod, Prod, negative_identity);
simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod);
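// Both orderings compute the same mathematical update C <- C - A*B: previously it
// was fused as Cfrag = Prod*(-I) + Cfrag; now the product is negated first
// (Prod = Prod*(-I)) and then accumulated as Cfrag = Cfrag*I + Prod.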
}
simdgroup_store(

View File

@ -706,7 +706,6 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
tags: reduction
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@ -716,7 +715,6 @@
cpp_no_default_args: ['dim']
dispatch:
CompositeExplicitAutograd: all_dims_default
tags: reduction
- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -725,7 +723,6 @@
CPU, CUDA: all_out
MPS: all_out_mps
MTIA: all_out_mtia
tags: reduction
- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -734,16 +731,13 @@
CPU, CUDA: all_dims_out
CompositeExplicitAutograd: all_dims_out_default
cpp_no_default_args: ['dim']
tags: reduction
- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
variants: function, method
@ -755,14 +749,14 @@
device_check: NoCheck # TensorIterator
structured_delegate: any.out
variants: function, method
tags: [core, reduction]
tags: core
- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: any.dims_out
variants: function, method
cpp_no_default_args: ['dim']
tags: [core, reduction]
tags: core
dispatch:
CompositeExplicitAutograd: any_dims_default
@ -772,7 +766,6 @@
dispatch:
CPU, CUDA: any_out
MPS: any_out_mps
tags: reduction
- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -781,16 +774,13 @@
CPU, CUDA: any_dims_out
CompositeExplicitAutograd: any_dims_out_default
cpp_no_default_args: ['dim']
tags: reduction
- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
@ -836,27 +826,25 @@
structured_delegate: argmax.out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: argmax_out
MPS: argmax_out_mps
tags: reduction
- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
structured_delegate: argmin.out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: argmin_out
MPS: argmin_out_mps
tags: reduction
- func: acosh(Tensor self) -> Tensor
variants: function, method
@ -1382,7 +1370,6 @@
dispatch:
SparseCPU: bmm_sparse_cpu
SparseCUDA: bmm_sparse_cuda
SparseMPS: bmm_sparse_mps
NestedTensorCPU: bmm_nested
NestedTensorCUDA: bmm_nested_cuda
tags: core
@ -1398,7 +1385,6 @@
MTIA: bmm_out_mtia
SparseCPU: bmm_out_sparse_cpu
SparseCUDA: bmm_out_sparse_cuda
SparseMPS: bmm_out_sparse_mps
SparseCsrCUDA: bmm_out_sparse_csr_cuda
- func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
@ -1881,14 +1867,12 @@
CUDA: count_nonzero_cuda
MPS: count_nonzero_mps
autogen: count_nonzero.dim_IntList_out
tags: reduction
- func: count_nonzero(Tensor self, int? dim=None) -> Tensor
variants: function, method
dispatch:
CompositeExplicitAutograd: count_nonzero
autogen: count_nonzero.out
tags: reduction
- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
variants: function, method
@ -3809,23 +3793,19 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: logsumexp
tags: reduction
- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
# calls squeeze
CompositeExplicitAutogradNonFunctional: logsumexp_out
tags: reduction
- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
@ -3875,7 +3855,6 @@
device_check: NoCheck # TensorIterator
structured_delegate: aminmax.out
variants: function, method
tags: reduction
- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
device_check: NoCheck # TensorIterator
@ -3883,7 +3862,6 @@
dispatch:
CPU, CUDA, MTIA: aminmax_out
MPS: aminmax_out_mps
tags: reduction
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
dispatch:
@ -3899,7 +3877,7 @@
variants: function, method
dispatch:
QuantizedCPU, QuantizedCUDA: qmax
tags: [core, reduction]
tags: core
- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
@ -3909,16 +3887,13 @@
dispatch:
CPU, CUDA, MTIA: max_out
MPS: max_out_mps
tags: reduction
- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
tags: reduction
- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
variants: function
@ -3931,14 +3906,13 @@
- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
variants: function, method
structured_delegate: amax.out
tags: [core, reduction]
tags: core
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amax_out
MPS: amax_out_mps
tags: reduction
# Return: (Tensor output, Tensor indices)
- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
@ -4000,14 +3974,13 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: mean
tags: [core, reduction]
tags: core
# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CompositeExplicitAutograd: mean_dtype_out
tags: reduction
- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: mean.out
@ -4015,7 +3988,7 @@
variants: function, method
dispatch:
QuantizedCPU: mean_quantized_cpu
tags: [core, reduction]
tags: core
- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -4024,16 +3997,13 @@
CPU, CUDA: mean_out
MPS: mean_out_mps
QuantizedCPU: mean_out_quantized_cpu
tags: reduction
- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # Composite
@ -4096,7 +4066,7 @@
variants: function, method
dispatch:
QuantizedCPU, QuantizedCUDA: qmin
tags: [core, reduction]
tags: core
- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
@ -4106,28 +4076,24 @@
dispatch:
CPU, CUDA, MTIA: min_out
MPS: min_out_mps
tags: reduction
- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
tags: reduction
- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
variants: function, method
structured_delegate: amin.out
tags: [core, reduction]
tags: core
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amin_out
MPS: amin_out_mps
tags: reduction
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
# native_functions.yaml
@ -4207,7 +4173,7 @@
structured_delegate: mm.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm
SparseCPU, SparseCUDA: _sparse_mm
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm
tags: core
@ -5892,7 +5858,6 @@
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
tags: reduction
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
# TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
@ -5903,12 +5868,11 @@
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
tags: [core, reduction]
tags: core
- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -5916,11 +5880,9 @@
dispatch:
CPU, CUDA: sum_out
MPS: sum_out_mps
tags: reduction
- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
# TODO: this function will be replaced once nested expand semantics have been settled on
- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
@ -5932,13 +5894,11 @@
dispatch:
CPU, CUDA: nansum
MPS: nansum_mps
tags: reduction
- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: nansum_out
MPS: nansum_out_mps
tags: reduction
- func: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor
variants: function, method
@ -6002,13 +5962,11 @@
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
@ -6017,19 +5975,16 @@
CPU, CUDA: std
MPS: std_mps
QuantizedCPU: std_quantized_cpu
tags: reduction
- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@ -6038,51 +5993,42 @@
CPU, CUDA: std_mean
MPS: std_mean_mps
autogen: std_mean.correction_out
tags: reduction
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: std_out
QuantizedCPU: std_out_quantized_cpu
tags: reduction
- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
@ -6091,13 +6037,13 @@
CPU, CUDA: prod
MPS: prod_mps
autogen: prod.out
tags: [core, reduction]
tags: core
- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: prod.int_out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6105,16 +6051,13 @@
dispatch:
CPU, CUDA: prod_out
MPS: prod_out_mps
tags: reduction
- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: t(Tensor(a) self) -> Tensor(a)
device_check: NoCheck
@ -6575,12 +6518,11 @@
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
cpp_no_default_args: ["unbiased"]
- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
@ -6590,51 +6532,43 @@
CPU, CUDA: var
MPS: var_mps
MTIA: var_mtia
tags: [core, reduction]
tags: core
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: var_out
tags: reduction
- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@ -6643,18 +6577,15 @@
CPU, CUDA: var_mean
MPS: var_mean_mps
autogen: var_mean.correction_out
tags: reduction
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
variants: method
@ -6914,7 +6845,6 @@
dispatch:
CompositeExplicitAutograd: norm
autogen: norm.ScalarOpt_dtype_out
tags: reduction
- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
device_check: NoCheck # TensorIterator
@ -6922,7 +6852,6 @@
dispatch:
CompositeExplicitAutograd: norm
autogen: norm.Scalar_out
tags: reduction
- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
structured_delegate: norm.dtype_out
@ -6930,7 +6859,6 @@
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
tags: reduction
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
structured_delegate: norm.out
@ -6938,7 +6866,6 @@
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
tags: reduction
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6946,7 +6873,6 @@
dispatch:
CPU, CUDA: norm_dtype_out
MPS: norm_dtype_out_mps
tags: reduction
- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6954,26 +6880,21 @@
dispatch:
CPU, CUDA: norm_out
MPS: norm_out_mps
tags: reduction
# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
variants: method, function
@ -7191,7 +7112,6 @@
MTIA: addmm_out_mtia
SparseCPU: addmm_out_sparse_dense_cpu
SparseCUDA: addmm_out_sparse_dense_cuda
SparseMPS: addmm_out_sparse_dense_mps
SparseCsrCPU: addmm_out_sparse_compressed_cpu
SparseCsrCUDA: addmm_out_sparse_compressed_cuda
@ -7201,7 +7121,6 @@
dispatch:
SparseCPU: addmm_sparse_dense_cpu
SparseCUDA: addmm_sparse_dense_cuda
SparseMPS: addmm_sparse_dense_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
tags: core
@ -10159,14 +10078,12 @@
CPU, CUDA: min
MPS: min_mps
QuantizedCPU: min_quantized_cpu
tags: [reduction]
- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: min_unary_out
QuantizedCPU: min_quantized_unary_out
tags: [reduction]
- func: fmin(Tensor self, Tensor other) -> Tensor
structured_delegate: fmin.out
@ -10189,7 +10106,6 @@
CPU, CUDA: max
MPS: max_mps
QuantizedCPU: max_quantized_cpu
tags: [reduction]
- func: fmax(Tensor self, Tensor other) -> Tensor
structured_delegate: fmax.out
@ -10236,7 +10152,6 @@
dispatch:
CPU, CUDA: max_unary_out
QuantizedCPU: max_quantized_unary_out
tags: [reduction]
- func: minimum(Tensor self, Tensor other) -> Tensor
structured_delegate: minimum.out
@ -10356,7 +10271,6 @@
device_check: NoCheck # TensorIterator
structured_delegate: all.all_out
variants: method, function
tags: reduction
- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
@ -10365,7 +10279,6 @@
CPU, CUDA: all_all_out
MTIA: all_all_out_mtia
MPS: all_all_out_mps
tags: reduction
- func: any(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
@ -10373,7 +10286,7 @@
variants: method, function
dispatch:
SparseCPU, SparseCUDA, SparseMPS: any_sparse
tags: [core, reduction]
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
@ -10381,7 +10294,6 @@
dispatch:
CPU, CUDA: any_all_out
MPS: any_all_out_mps
tags: reduction
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -14433,7 +14345,6 @@
python_module: linalg
variants: function
structured_delegate: linalg_vector_norm.out
tags: reduction
- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
python_module: linalg
@ -14441,7 +14352,6 @@
dispatch:
CPU, CUDA: linalg_vector_norm_out
MPS: linalg_vector_norm_out_mps
tags: reduction
- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
python_module: linalg

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
@ -19,8 +18,6 @@
#include <ATen/ops/ones_like.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/copy_sparse_to_sparse.h>
#include <ATen/ops/mul.h>
#endif
@ -36,305 +33,6 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/Mul_metallib.h>
#endif
static Tensor& s_addmm_out_sparse_dense_mps(
Tensor& r,
const Tensor& t,
const SparseTensor& sparse_,
const Tensor& dense,
const Scalar& beta,
const Scalar& alpha) {
TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim());
TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim());
TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim());
TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim());
const int64_t I = sparse_.size(0);
const int64_t J = sparse_.size(1);
const int64_t K = dense.size(1);
TORCH_CHECK(dense.size(0) == J,
"addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0));
TORCH_CHECK(t.size(0) == I && t.size(1) == K,
"addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")");
r.resize_({I, K});
auto sparse = sparse_.coalesce();
const int64_t nnz = sparse._nnz();
if (nnz == 0 || I == 0 || K == 0) {
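// No stored values (or an empty output): the product term vanishes and
// addmm reduces to r = beta * t.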
at::mul_out(r, t, beta);
return r;
}
const auto v_dtype = sparse._values().scalar_type();
const auto d_dtype = dense.scalar_type();
const auto t_dtype = t.scalar_type();
auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype);
TORCH_CHECK(canCast(compute_dtype, r.scalar_type()),
"Can't convert computed type ", compute_dtype, " to output ", r.scalar_type());
auto indices2d = sparse._indices().contiguous();
auto values = sparse._values().to(compute_dtype);
auto dense_c = dense.to(compute_dtype).contiguous();
auto t_c = t.to(compute_dtype).contiguous();
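// Accumulate in compute_dtype; if the output tensor differs in dtype or layout,
// stage the result in a temporary buffer and copy it back at the end.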
const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous();
Tensor out_buf = out_needs_cast
? at::empty({I, K}, r.options().dtype(compute_dtype))
: r;
auto out_contig = out_buf.contiguous();
auto device = r.device();
auto stream = getCurrentMPSStream();
const float alpha_f = alpha.to<float>();
const float beta_f = beta.to<float>();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values);
auto pso = lib.getPipelineStateForFunc(func);
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t gridX = static_cast<uint32_t>(K);
const uint32_t gridZ = static_cast<uint32_t>(I);
const uint32_t tgW = std::min<uint32_t>(gridX, tew);
MTLSize grid = MTLSizeMake(gridX, 1, gridZ);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
indices2d,
values,
dense_c,
t_c,
out_contig,
std::array<uint32_t, 3>{static_cast<uint32_t>(I),
static_cast<uint32_t>(J),
static_cast<uint32_t>(K)},
std::array<float, 2>{alpha_f, beta_f},
static_cast<uint32_t>(nnz));
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (out_needs_cast) {
r.copy_(out_contig.to(r.scalar_type()));
}
return r;
}
static void build_batch_ptr_mps(
const Tensor& indices_dim0,
int64_t B,
Tensor& batch_ptr
) {
// Builds an array of pointers which point to each batch's elements. Example:
// idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2] // 9 non-zero elements
// └─────┘ └──┘ └─────────┘
// batch 0 batch 1 batch 2
// batch_ptr = [0, 3, 5, 9]
// │ │ │ └─ end of batch 2 (total nnz)
// │ │ └──── batch 2 starts at index 5
// │ └─────── batch 1 starts at index 3
// └────────── batch 0 starts at index 0
TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected");
auto device = indices_dim0.device();
auto stream = getCurrentMPSStream();
const int64_t nnz = indices_dim0.numel();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t Q = static_cast<uint32_t>(B + 1);
const uint32_t tgW = std::min<uint32_t>(Q, tew);
MTLSize grid = MTLSizeMake(Q, 1, 1);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
indices_dim0,
batch_ptr,
std::array<uint32_t, 2>{static_cast<uint32_t>(nnz),
static_cast<uint32_t>(B)});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
}
static void build_row_ptr_per_batch_mps(
const Tensor& rows,
const Tensor& batch_ptr,
int64_t B,
int64_t I,
Tensor& row_ptr
) {
// Build per-batch CSR-style row pointer arrays from row indices sorted by batch
// Given:
// rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch
// batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b
// Produces:
// - row_ptr: shape [B, I+1]
//
// Example (B = 2, I = 4):
// rows = [0, 0, 1, 3, 0, 2, 2] // 7 non-zero elements
// └─── batch 0 ──┘ └─ batch 1 ─┘
// batch_ptr = [0, 4, 7]
// │ │ └─ end of batch 1 (total nnz)
// │ └──── end of batch 0/start of batch 1
// └─────── start of batch 0
//
// per-batch row pointers (I+1 entries each):
// row_ptr[0] = [0, 2, 3, 3, 4]
// row_ptr[1] = [0, 1, 1, 3, 3]
// laid out in memory: [0, 2, 3, 3, 4, 0, 1, 1, 3, 3]
TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected");
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t Qx = static_cast<uint32_t>(I + 1);
const uint32_t Qy = static_cast<uint32_t>(B);
const uint32_t tgW = std::min<uint32_t>(Qx, tew);
MTLSize grid = MTLSizeMake(Qx, Qy, 1);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
rows,
batch_ptr,
row_ptr,
std::array<uint32_t, 2>{static_cast<uint32_t>(I),
static_cast<uint32_t>(B)});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
}
Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) {
TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device());
TORCH_CHECK(self_.is_mps(), "bmm_sparse: expected 'self' to be MPS, got ", self_.device());
TORCH_CHECK(mat2_.is_mps(), "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device());
TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim());
TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim());
TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim());
TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match");
TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match");
const int64_t B = self_.size(0);
const int64_t I = self_.size(1);
const int64_t J = self_.size(2);
const int64_t K = mat2_.size(2);
auto self = self_.coalesce();
const int64_t nnz = self._nnz();
if (nnz == 0) {
return result_.zero_();
}
const auto computeDtype = at::kFloat;
auto indices = self._indices();
auto values = self._values();
auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype);
auto mat2_c = mat2_.scalar_type() == computeDtype ? mat2_ : mat2_.to(computeDtype);
auto mat2_contig = mat2_c.contiguous();
auto idx_b = indices.select(0, 0).contiguous();
auto idx_i = indices.select(0, 1).contiguous();
auto idx_j = indices.select(0, 2).contiguous();
// build batch_ptr (length B + 1): [batch_ptr[b], batch_ptr[b + 1]) is the
// subrange of nnz entries belonging to batch b; see build_batch_ptr_mps
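// e.g. (hypothetical) idx_b = [0, 0, 0, 0, 1, 1, 1] with B = 2 gives
// batch_ptr = [0, 4, 7]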
auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong));
build_batch_ptr_mps(idx_b, B, batch_ptr);
// build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals
auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong));
build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr);
const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous();
Tensor out_buf = out_needs_cast
? at::empty({B, I, K}, result_.options().dtype(computeDtype))
: result_;
auto out_contig = out_buf.contiguous();
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values));
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t tgW = std::min<uint32_t>((uint32_t)K, tew);
// One threadgroup per (row i, batch b), lanes cover K
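// e.g. (hypothetical) B = 2, I = 4, K = 96, tew = 32 gives tgW = 32 and
// grid = (32, 4, 2); each lane then strides over k in steps of tgW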
MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
idx_i,
idx_j,
values_c,
mat2_contig,
out_contig,
row_ptr,
std::array<uint32_t, 4>{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (out_needs_cast) {
result_.copy_(out_contig.to(result_.scalar_type()));
}
return result_;
}
Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) {
Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options());
return bmm_out_sparse_mps(self, mat2, result);
}
Tensor& addmm_out_sparse_dense_mps(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
Tensor& result) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
}
Tensor addmm_sparse_dense_mps(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha
) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
Tensor result = at::empty({0}, self.options());
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
}
static SparseTensor& mul_out_dense_sparse_mps(
const Tensor& dense,
const Tensor& sparse,

View File

@ -1,105 +1,10 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
#include <c10/metal/utils.h>
using namespace c10::metal;
using namespace metal;
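// Binary-search helpers over an int64 array sorted on [lo, hi):
// lower_bound_i64 returns the first index whose value is >= key and
// upper_bound_i64 the first index whose value is > key; both return hi when
// no such index exists. e.g. arr = [0, 0, 2], lo = 0, hi = 3, key = 0
// yields lower_bound 0 and upper_bound 2.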
inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) {
uint l = lo, r = hi;
while (l < r) {
uint m = (l + r) >> 1;
long v = arr[m];
if (v < key) {
l = m + 1;
} else {
r = m;
}
}
return l;
}
inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) {
uint l = lo, r = hi;
while (l < r) {
uint m = (l + r) >> 1;
long v = arr[m];
if (v <= key) {
l = m + 1;
} else {
r = m;
}
}
return l;
}
kernel void build_row_ptr_from_sorted_rows_by_batch(
device const long* rows [[buffer(0)]],
device const long* batch_ptr [[buffer(1)]],
device long* row_ptr [[buffer(2)]],
constant uint2& dims [[buffer(3)]],
uint3 tid [[thread_position_in_grid]])
{
const uint I = dims.x;
const uint B = dims.y;
const uint i = tid.x;
const uint b = tid.y;
if (b >= B || i > I) return;
const uint base = (uint)batch_ptr[b];
const uint lim = (uint)batch_ptr[b + 1];
const ulong out_base = (ulong)b * (ulong)(I + 1);
if (i == I) {
row_ptr[out_base + (ulong)I] = (long)lim;
} else {
const long key = (long)i;
const uint pos = lower_bound_i64(rows, base, lim, key);
row_ptr[out_base + (ulong)i] = (long)pos;
}
}
template <typename T>
kernel void spmm_bmm_coo_rows_grouped(
device const long* rows [[buffer(0)]],
device const long* cols [[buffer(1)]],
device const T* vals [[buffer(2)]],
device const T* dense [[buffer(3)]],
device T* out [[buffer(4)]],
device const long* row_ptr [[buffer(5)]],
constant uint4& dims [[buffer(6)]],
uint3 tid [[thread_position_in_grid]],
uint3 ltid [[thread_position_in_threadgroup]],
uint3 tptg [[threads_per_threadgroup]])
{
const uint B = dims.x;
const uint I = dims.y;
const uint J = dims.z;
const uint K = dims.w;
const uint b = tid.z;
const uint i = tid.y;
const uint lane = ltid.x;
const uint tgW = tptg.x;
const ulong rp_base = (ulong)b * (ulong)(I + 1);
const uint start = (uint)row_ptr[rp_base + (ulong)i];
const uint end = (uint)row_ptr[rp_base + (ulong)i + 1];
for (uint k = lane; k < K; k += tgW) {
auto acc = static_cast<accum_t<T>>(T(0));
for (uint p = start; p < end; ++p) {
const uint c = (uint)cols[p];
const auto v = static_cast<accum_t<T>>(vals[p]);
const uint d_off = ((b * J) + c) * K + k;
const auto d = static_cast<accum_t<T>>(dense[d_off]);
acc += mul(v, d);
}
const uint y_off = ((b * I) + i) * K + k;
out[y_off] = static_cast<T>(acc);
}
}
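// Per (b, i, k) the kernel above computes
//   out[b][i][k] = sum over p in [row_ptr[b][i], row_ptr[b][i + 1]) of
//                  vals[p] * dense[b][cols[p]][k]
// i.e. one row of a batched sparse-dense matmul, with the lanes of a
// threadgroup covering the K dimension.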
template <typename T> struct MulAccum { using type = float; };
template <> struct MulAccum<float2> { using type = float2; };
template <typename T>
kernel void dense_sparse_mul_kernel(
@ -127,9 +32,10 @@ kernel void dense_sparse_mul_kernel(
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
const auto a = static_cast<accum_t<T>>(values[val_idx]);
const auto b = static_cast<accum_t<T>>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(mul(a, b));
using accum_t = typename MulAccum<T>::type;
const accum_t a = static_cast<accum_t>(values[val_idx]);
const accum_t b = static_cast<accum_t>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(a * b);
}
kernel void intersect_binary_search(
@ -214,76 +120,6 @@ kernel void fused_gather_mul_kernel(
}
}
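// One thread per batch_ptr entry: thread b below lower-bounds the first
// occurrence of batch id b in the sorted `batches`, while thread B writes
// the total nnz. e.g. (hypothetical) batches = [0, 0, 2, 2, 2] with B = 3
// gives batch_ptr = [0, 2, 2, 5] (batch 1 is empty).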
kernel void build_batch_ptr_from_sorted_batches(
device const long* batches [[buffer(0)]],
device long* batch_ptr [[buffer(1)]],
constant uint2& nnz_B [[buffer(2)]],
uint3 tid [[thread_position_in_grid]])
{
uint b = tid.x;
uint nnz = nnz_B.x;
uint batch = nnz_B.y;
if (b == batch) {
batch_ptr[b] = (long)nnz;
return;
}
uint lo = 0;
uint hi = nnz;
long key = (long)b;
while (lo < hi) {
uint mid = (lo + hi) >> 1;
long v = batches[mid];
if (v < key) lo = mid + 1;
else hi = mid;
}
batch_ptr[b] = (long)lo;
}
template <typename T>
kernel void spmm_addmm_coo(
device const long* indices2d [[buffer(0)]],
device const T* vals [[buffer(1)]],
device const T* dense [[buffer(2)]],
device const T* t_in [[buffer(3)]],
device T* out [[buffer(4)]],
constant uint3& dims [[buffer(5)]],
constant float2& alpha_beta [[buffer(6)]],
constant uint& nnz [[buffer(7)]],
uint3 tid [[thread_position_in_grid]])
{
const uint K = dims.z;
const uint k = tid.x;
const uint i = tid.z;
const float alpha = alpha_beta.x;
const float beta = alpha_beta.y;
device const long* rows = indices2d;
device const long* cols = indices2d + nnz;
const uint start = lower_bound_i64(rows, 0u, nnz, (long)i);
const uint end = upper_bound_i64(rows, 0u, nnz, (long)i);
// accumulator is float for scalar/half/bfloat and float2 for float2
auto acc = static_cast<accum_t<T>>(T(0));
for (uint p = start; p < end; ++p) {
const uint c = (uint)cols[p];
const auto v = static_cast<accum_t<T>>(vals[p]);
const uint dense_off = c * K + k;
const auto d = static_cast<accum_t<T>>(dense[dense_off]);
acc += mul(v, d);
}
const uint off = i * K + k;
const auto base = (beta != 0.0f) ? (static_cast<accum_t<T>>(t_in[off]) * beta) : static_cast<accum_t<T>>(T(0));
const auto y = base + alpha * acc;
out[off] = static_cast<T>(y);
}
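// Per (i, k) the kernel above computes
//   out[i][k] = beta * t_in[i][k]
//             + alpha * sum over {p : rows[p] == i} of
//                       vals[p] * dense[cols[p]][k]
// with row i's nnz range located by binary search over the sorted rows.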
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
dense_sparse_mul_kernel<DTYPE>( \
@ -315,36 +151,6 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2);
constant uint2& dims_output [[buffer(8)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL);
#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \
template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \
spmm_bmm_coo_rows_grouped<DTYPE>( \
device const long* rows [[buffer(0)]], \
device const long* cols [[buffer(1)]], \
device const DTYPE* vals [[buffer(2)]], \
device const DTYPE* dense [[buffer(3)]], \
device DTYPE* out [[buffer(4)]], \
device const long* row_ptr [[buffer(5)]], \
constant uint4& dims [[buffer(6)]], \
uint3 tid [[thread_position_in_grid]], \
uint3 ltid [[thread_position_in_threadgroup]], \
uint3 tptg [[threads_per_threadgroup]]);
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED);
#define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \
template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void \
spmm_addmm_coo<DTYPE>( \
device const long* indices2d [[buffer(0)]], \
device const DTYPE* vals [[buffer(1)]], \
device const DTYPE* dense [[buffer(2)]], \
device const DTYPE* t_in [[buffer(3)]], \
device DTYPE* out [[buffer(4)]], \
constant uint3& dims [[buffer(5)]], \
constant float2& alpha_beta [[buffer(6)]], \
constant uint& nnz [[buffer(7)]], \
uint3 tid [[thread_position_in_grid]]);
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO);
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);

View File

@ -93,7 +93,3 @@
This operator does not support cudagraphs. The presence of this tag on an operator will cause
Inductor to split the graph around this operator. Note that operators without this tag may still
not support CUDAGraphs. Inductor may have other hardcoded lists around that.
- tag: reduction
desc: |
This tag indicates that an operator performs a reduction operation, computing aggregate values
(sum, mean, max, min, etc.) across one or more dimensions of the input tensor(s).

View File

@ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix):
f"{output_filename.rstrip('.csv')}_{suffix}.pickle",
)
)
except Exception:
log.exception("Failed to save memory snapshot")
except Exception as e:
log.error("Failed to save memory snapshot, %s", e)
torch.cuda.memory._record_memory_history(enabled=None)

View File

@ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh:
continue
batch_size = int(batch_size)
BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size
assert BATCH_SIZE_KNOWN_MODELS
assert len(BATCH_SIZE_KNOWN_MODELS)
try:

View File

@ -296,8 +296,8 @@ class OperatorInputsLoader:
for key in self.operator_db.keys():
try:
op = eval(key)
except AttributeError:
log.warning("Evaluating an op name into an OpOverload", exc_info=True)
except AttributeError as ae:
log.warning("Evaluating an op name into an OpOverload: %s", ae)
continue
yield op

View File

@ -3,7 +3,6 @@ import sys
from benchmark_base import BenchmarkBase
import torch
from torch._dynamo.utils import CompileTimeInstructionCounter
class Benchmark(BenchmarkBase):
@ -33,11 +32,7 @@ class Benchmark(BenchmarkBase):
def _work(self):
# enable_cpp_symbolic_shape_guards has impact on this benchmark
# Keep using False value for consistency.
with (
torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False),
torch._export.config.patch(use_new_tracer_experimental=True),
CompileTimeInstructionCounter.record(),
):
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False):
torch.export.export(self.m, (self.input,), strict=True)

View File

@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1719000000,0.1
sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1
sum_floordiv_regression,compile_time_instruction_count,966100000,0.1


View File

@ -127,7 +127,7 @@ def trainbench(
bwd_time = bwd_start_event.elapsed_time(bwd_end_event)
return fwd_time, bwd_time
creator_args = {
creator_args = creator_args = {
"seqLength": seqLength,
"numLayers": numLayers,
"inputSize": inputSize,

View File

@ -12,7 +12,7 @@ def modeldef(request, net_name, executor, fuser):
# Given a 'net_name' provided by generate_tests, build the thing
name, rnn_creator, context = get_nn_runners(net_name)[0]
creator_args = {
creator_args = creator_args = {
"seqLength": 100,
"numLayers": 1,
"inputSize": 512,

View File

@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler:
cur_state_dict[f"{fqn}.weight"] = int8_weight
cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
elif isinstance(mod, ConditionalFeedForward):
for weight_idx in range(3):
for weight_idx in range(0, 3):
weight_name = f"w{weight_idx + 1}"
scales_name = f"scales{weight_idx + 1}"
weight = getattr(mod, weight_name)

View File

@ -1729,8 +1729,10 @@ def define_buck_targets(
"torch/csrc/jit/backends/backend_debug_info.cpp",
"torch/csrc/jit/backends/backend_interface.cpp",
],
compiler_flags = get_pt_compiler_flags(),
fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags,
compiler_flags = get_pt_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags
}),
# @lint-ignore BUCKLINT link_whole
link_whole = True,
linker_flags = get_no_as_needed_linker_flag(),
@ -2023,6 +2025,9 @@ def define_buck_targets(
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}) + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags,
}),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [

View File

@ -1,4 +1,5 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
namespace c10::CachingAllocator {
@ -46,7 +47,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK_VALUE(
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
@ -65,7 +66,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -84,7 +85,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -99,7 +100,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
@ -120,7 +121,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
@ -128,13 +129,12 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
std::fill(
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(
last_index + 1)),
static_cast<std::vector<size_t>::difference_type>(last_index)),
roundup_power2_divisions_.end(),
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
@ -164,7 +164,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(
@ -224,7 +224,7 @@ void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
// If a device-specific configuration parser hook is registered, it will
// check if the key is unrecognized.
if (device_config_parser_hook_) {
TORCH_CHECK_VALUE(
TORCH_CHECK(
getKeys().find(key) != getKeys().end(),
"Unrecognized key '",
key,

View File

@ -76,7 +76,7 @@ class ConfigTokenizer {
} else if (token == "False") {
return false;
} else {
TORCH_CHECK_VALUE(
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,
@ -253,7 +253,7 @@ class C10_API AcceleratorAllocatorConfig {
device_config_parser_hook_ = std::move(hook);
auto& mutable_keys = getMutableKeys();
for (auto& key : keys) {
TORCH_CHECK_VALUE(
TORCH_CHECK(
mutable_keys.insert(key).second,
"Duplicated key '",
key,

View File

@ -102,7 +102,7 @@ uint64_t getNonDeterministicRandom(bool is_cuda) {
} else {
std::random_device rd;
// limit to 53 bits to ensure unique representation in double
s = (((static_cast<uint64_t>(rd())) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
}
return s;
}

View File

@ -20,8 +20,7 @@ void maybeApplyRefcountedDeleter(const c10::Storage& storage) {
std::lock_guard<std::mutex> guard(replace_data_ptr_mutex);
c10::DataPtr& data_ptr = storage.mutable_data_ptr();
if (reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
reinterpret_cast<const void*>(&c10::refcounted_deleter)) {
if ((void*)data_ptr.get_deleter() == (void*)&c10::refcounted_deleter) {
// Data pointer is already shared
return;
}

View File

@ -52,19 +52,6 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT)
#undef DEFINE_CONSTANT
inline const char* toString(ScalarType t) {
#define DEFINE_CASE(_, name) \
case ScalarType::name: \
return #name;
switch (t) {
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE)
default:
return "UNKNOWN_SCALAR";
}
#undef DEFINE_CASE
}
inline size_t elementSize(ScalarType t) {
#define CASE_ELEMENTSIZE_CASE(ctype, name) \
case ScalarType::name: \
@ -308,12 +295,6 @@ inline bool canCast(const ScalarType from, const ScalarType to) {
C10_API ScalarType promoteTypes(ScalarType a, ScalarType b);
inline std::ostream& operator<<(
std::ostream& stream,
at::ScalarType scalar_type) {
return stream << toString(scalar_type);
}
// Returns a pair of strings representing the names for each dtype.
// The returned pair is (name, legacy_name_if_applicable)
C10_API std::pair<std::string, std::string> getDtypeNames(

View File

@ -4,6 +4,7 @@
#include <c10/core/SymNodeImpl.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/util/safe_numerics.h>
#include <functional>
namespace c10 {
@ -83,7 +84,7 @@ DEFINE_BINARY(max_slow_path, sym_max, SymInt)
SymInt::operator SymFloat() const {
if (auto ma = maybe_as_int()) {
return SymFloat(static_cast<double>(*ma));
return SymFloat(double(*ma));
} else {
return SymFloat(toSymNodeImplUnowned()->sym_float());
}

View File

@ -9,6 +9,7 @@
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Logging.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <optional>
#include <utility>

View File

@ -1,5 +1,9 @@
#include <c10/core/TensorOptions.h>
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/util/Optional.h>
#include <iostream>
namespace c10 {

View File

@ -2,6 +2,7 @@
#include <c10/core/Allocator.h>
#include <c10/core/StorageImpl.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/COWDeleter.h>
#include <c10/util/Exception.h>
#include <c10/util/ParallelGuard.h>
@ -44,8 +45,7 @@ bool has_simple_data_ptr(const c10::StorageImpl& storage) {
}
bool is_cow_data_ptr(const c10::DataPtr& data_ptr) {
return reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
reinterpret_cast<const void*>(&cow::cow_deleter);
return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter;
}
c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {

View File

@ -1,4 +1,5 @@
#include <c10/core/DispatchKey.h>
#include <c10/core/SafePyObject.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/irange.h>

View File

@ -20,7 +20,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
#ifdef USE_ROCM
TORCH_CHECK_VALUE(
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
(tokenizer[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
@ -36,7 +36,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
" != ",
get()->name());
#else // USE_ROCM
TORCH_CHECK_VALUE(
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
"Unknown allocator backend, "
"options are native and " PYTORCH_TOKEN1);
@ -109,7 +109,7 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK_VALUE(
TORCH_CHECK(
keys.find(key) != keys.end(),
"Unrecognized key '",
key,
@ -151,12 +151,12 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2, got ",
val2);
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK_VALUE(
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to ",
maxThreads,
@ -171,8 +171,7 @@ size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
val2 > 0, "Pinned reserve segment size has to be greater than 0");
TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
m_pinned_reserve_segment_size_mb = val2;
return i;
}

View File

@ -3,7 +3,6 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
@ -18,14 +17,9 @@ enum class Expandable_Segments_Handle_Type : int {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
static size_t max_split_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
static double garbage_collection_threshold() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold();
@ -70,8 +64,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_num_register_threads;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
static bool pinned_use_background_threads() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads();
@ -88,15 +80,11 @@ class C10_CUDA_API CUDAAllocatorConfig {
return 128;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions();
@ -107,8 +95,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
max_non_split_rounding_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
static std::string last_allocator_settings() {
return c10::CachingAllocator::getAllocatorSettings();
}

View File

@ -512,7 +512,7 @@ struct ExpandableSegment {
header.segment_size = segment_size_;
header.num_handles = end - begin;
buf.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));
buf.write((const char*)&header, sizeof(ShareHeader));
for (auto i : c10::irange(begin, end)) {
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto& handle = handles_.at(i).value();
@ -528,9 +528,7 @@ struct ExpandableSegment {
TORCH_CHECK(
handle.shareable_handle != std::nullopt,
"shareable_handle is null");
buf.write(
reinterpret_cast<const char*>(&*handle.shareable_handle),
sizeof(int));
buf.write((const char*)&*handle.shareable_handle, sizeof(int));
} else {
if (!handle.shareable_handle) {
CUmemFabricHandle fabric_handle;
@ -543,8 +541,7 @@ struct ExpandableSegment {
handle.shareable_handle != std::nullopt,
"shareable_handle is null");
buf.write(
reinterpret_cast<const char*>(&*handle.shareable_handle),
sizeof(CUmemFabricHandle));
(const char*)&*handle.shareable_handle, sizeof(CUmemFabricHandle));
}
}
return rangeFromHandles(begin, end);
@ -555,7 +552,7 @@ struct ExpandableSegment {
std::vector<c10::DeviceIndex> peers,
std::istream& buf) {
ShareHeader header{};
buf.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));
buf.read((char*)&header, sizeof(ShareHeader));
auto segment = std::make_unique<ExpandableSegment>(
device, std::nullopt, header.segment_size, std::move(peers));
// older build setups (e.g. multiwheels) do not have this syscall, added 2020
@ -577,11 +574,11 @@ struct ExpandableSegment {
for (auto i : c10::irange(header.num_handles)) {
(void)i;
int fd = 0;
buf.read(reinterpret_cast<char*>(&fd), sizeof(int));
buf.read((char*)&fd, sizeof(int));
auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
if (myfd == -1) {
auto err = errno;
close(static_cast<int>(pidfd));
close((int)pidfd);
for (auto& h : segment->handles_) {
C10_CUDA_DRIVER_CHECK(
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
@ -601,16 +598,15 @@ struct ExpandableSegment {
(void*)(uintptr_t)myfd,
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
LOG(INFO) << "use posix fd to import expandable segments.";
close(static_cast<int>(myfd));
close((int)myfd);
segment->handles_.emplace_back(Handle{handle, std::nullopt});
}
close(static_cast<int>(pidfd));
close((int)pidfd);
} else {
for (auto i : c10::irange(header.num_handles)) {
(void)i;
CUmemFabricHandle fabric_handle;
buf.read(
reinterpret_cast<char*>(&fabric_handle), sizeof(CUmemFabricHandle));
buf.read((char*)&fabric_handle, sizeof(CUmemFabricHandle));
CUmemGenericAllocationHandle handle = 0;
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
&handle,
@ -1063,7 +1059,7 @@ class RingBuffer {
void setMaxEntries(size_t size) {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
alloc_trace_max_entries_ = std::max(static_cast<size_t>(1), size);
alloc_trace_max_entries_ = std::max(size_t(1), size);
}
void insertEntries(const T& entry) {
@ -1264,9 +1260,6 @@ class DeviceCachingAllocator {
// thread local compile context for each device
static thread_local std::stack<std::string> compile_context;
// thread local user metadata for annotating allocations
static thread_local std::string user_metadata;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit DeviceCachingAllocator(c10::DeviceIndex id)
@ -1274,7 +1267,7 @@ class DeviceCachingAllocator {
large_blocks(/*small=*/false),
small_blocks(/*small=*/true) {
stats.max_split_size =
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
}
@ -1309,14 +1302,6 @@ class DeviceCachingAllocator {
}
}
void setUserMetadata(const std::string& metadata) {
user_metadata = metadata;
}
std::string getUserMetadata() {
return user_metadata;
}
bool checkPoolLiveAllocations(
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) const {
@ -1409,8 +1394,7 @@ class DeviceCachingAllocator {
// Do garbage collection if the flag is set.
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
garbage_collect_cached_blocks(context);
}
// Attempt allocate
@ -1662,7 +1646,7 @@ class DeviceCachingAllocator {
stats.active_bytes[stat_type].increase(block->size);
stats.requested_bytes[stat_type].increase(block->requested_size);
});
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.increase(1);
auto allocated_bytes_gauge =
@ -1931,7 +1915,7 @@ class DeviceCachingAllocator {
block->pool->owner_MempoolId(),
context ? context : block->context_when_allocated);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1);
// If the block has been used on more than one stream, handle accordingly.
@ -1995,16 +1979,15 @@ class DeviceCachingAllocator {
while (base_block->prev) {
base_block = base_block->prev;
}
offset = static_cast<const char*>(block->ptr) -
static_cast<const char*>(base_block->ptr);
offset = (char*)block->ptr - (char*)base_block->ptr;
cudaIpcMemHandle_t handle;
C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr));
ss.write(reinterpret_cast<const char*>(&handle), CUDA_IPC_HANDLE_SIZE);
ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE);
} else {
ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
auto full_range = block->expandable_segment_->share(
SegmentRange(block->ptr, block->size), ss);
offset = static_cast<const char*>(block->ptr) - full_range.ptr;
offset = (char*)block->ptr - full_range.ptr;
}
return ShareableHandle{offset, ss.str()};
}
@ -2505,8 +2488,7 @@ class DeviceCachingAllocator {
if (size < kMinBlockSize) {
return kMinBlockSize;
} else {
auto divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
return roundup_power2_next_division(size, divisions);
} else {
@ -3000,7 +2982,7 @@ class DeviceCachingAllocator {
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
return remaining >= kMinBlockSize;
} else {
return (size < AcceleratorAllocatorConfig::max_split_size()) &&
return (size < CUDAAllocatorConfig::max_split_size()) &&
(remaining > kSmallSize);
}
}
@ -3020,7 +3002,7 @@ class DeviceCachingAllocator {
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count;
}
@ -3062,13 +3044,13 @@ class DeviceCachingAllocator {
}
// Do not return an oversized block for a large request
if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()))
return false;
// Allow oversized block size to be rounded up but within a limit
if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >=
p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
return false;
p.block = *it;
pool.blocks.erase(it);
@ -3091,7 +3073,7 @@ class DeviceCachingAllocator {
// therefore should be of less overheads.
size_t gc_threshold = static_cast<size_t>(
AcceleratorAllocatorConfig::garbage_collection_threshold() *
CUDAAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum));
// No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) {
@ -3234,13 +3216,12 @@ class DeviceCachingAllocator {
}
total_allocated_memory += size;
p.block = new Block(
p.device(), p.stream(), size, p.pool, static_cast<char*>(ptr));
p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
stats.segment[stat_type].increase(1);
stats.reserved_bytes[stat_type].increase(size);
});
if (size >= AcceleratorAllocatorConfig::max_split_size())
if (size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.increase(1);
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
@ -3269,7 +3250,7 @@ class DeviceCachingAllocator {
bool release_available_cached_blocks(
const AllocParams& p,
const std::shared_ptr<GatheredContext>& context) {
if (AcceleratorAllocatorConfig::max_split_size() ==
if (CUDAAllocatorConfig::max_split_size() ==
std::numeric_limits<size_t>::max())
return false;
BlockPool& pool = *p.pool;
@ -3277,8 +3258,8 @@ class DeviceCachingAllocator {
// because of std::unique_ptr, block cannot be trivially copied
// Use constructor for search key.
Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
? AcceleratorAllocatorConfig::max_split_size()
key.size = (key.size < CUDAAllocatorConfig::max_split_size())
? CUDAAllocatorConfig::max_split_size()
: key.size;
auto it = pool.blocks.lower_bound(&key);
if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
@ -3291,7 +3272,7 @@ class DeviceCachingAllocator {
--it; // Back up one item. Now on the largest block for the correct
// stream
while ((totalReleased < key.size) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->stream == p.stream())) {
auto cur = it;
bool is_first = cur == pool.blocks.begin();
@ -3416,7 +3397,7 @@ class DeviceCachingAllocator {
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.decrease(1);
pool->blocks.erase(block);
delete block;
@ -3701,8 +3682,7 @@ class DeviceCachingAllocator {
mempool_id,
getApproximateTime(),
record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
compile_string,
user_metadata);
compile_string);
// Callbacks should not include any Pytorch call
for (const auto& cb : trace_trackers_) {
@ -3757,7 +3737,6 @@ static void uncached_delete(void* ptr) {
static void local_raw_delete(void* ptr);
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
thread_local std::string DeviceCachingAllocator::user_metadata;
#ifdef __cpp_lib_hardware_interference_size
using std::hardware_destructive_interference_size;
#else
@ -3783,7 +3762,7 @@ class NativeCachingAllocator : public CUDAAllocator {
allocated_blocks;
static size_t get_mutex_shard_id(void* ptr) {
return twang_mix64(reinterpret_cast<uintptr_t>(ptr)) % kNumMutexShard;
return twang_mix64((size_t)ptr) % kNumMutexShard;
}
void add_allocated_block(Block* block) {
@ -3820,8 +3799,8 @@ class NativeCachingAllocator : public CUDAAllocator {
if (size < device_count) {
device_allocator.resize(device_count);
for (const auto i : c10::irange(size, device_count)) {
device_allocator[i] = std::make_unique<DeviceCachingAllocator>(
static_cast<c10::DeviceIndex>(i));
device_allocator[i] =
std::make_unique<DeviceCachingAllocator>(c10::DeviceIndex(i));
}
}
}
@ -3955,18 +3934,6 @@ class NativeCachingAllocator : public CUDAAllocator {
device_allocator[device]->popCompileContext();
}
void setUserMetadata(const std::string& metadata) override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
device_allocator[device]->setUserMetadata(metadata);
}
std::string getUserMetadata() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return device_allocator[device]->getUserMetadata();
}
bool isHistoryEnabled() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
@ -4067,8 +4034,8 @@ class NativeCachingAllocator : public CUDAAllocator {
auto& md = result.config_metadata;
md.garbage_collection_threshold =
AcceleratorAllocatorConfig::garbage_collection_threshold();
md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
CUDAAllocatorConfig::garbage_collection_threshold();
md.max_split_size = CUDAAllocatorConfig::max_split_size();
md.pinned_num_register_threads =
CUDAAllocatorConfig::pinned_num_register_threads();
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
@ -4076,12 +4043,11 @@ class NativeCachingAllocator : public CUDAAllocator {
CUDAAllocatorConfig::release_lock_on_cudamalloc();
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings =
AcceleratorAllocatorConfig::last_allocator_settings();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.graph_capture_record_stream_reuse =
CUDAAllocatorConfig::graph_capture_record_stream_reuse();
md.roundup_power2_divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions();
CUDAAllocatorConfig::roundup_power2_divisions();
return result;
}
@ -4350,7 +4316,7 @@ class NativeCachingAllocator : public CUDAAllocator {
// SHARABLE_CUDA_MALLOC
if (type == SHAREABLE_CUDA_MALLOC) {
cudaIpcMemHandle_t cuda_handle;
ss.read(reinterpret_cast<char*>(&cuda_handle), CUDA_IPC_HANDLE_SIZE);
ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE);
C10_CUDA_CHECK(cudaIpcOpenMemHandle(
&cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess));
} else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) {
@ -4459,12 +4425,11 @@ CUDAAllocator* allocator();
} // namespace CudaMallocAsync
struct BackendStaticInitializer {
// Parses the environment configuration for CUDA/ROCm allocator backend at
// load time. This duplicates some logic from CUDAAllocatorConfig to ensure
// lazy initialization without triggering global static constructors. The
// function looks for the key "backend" and returns the appropriate allocator
// instance based on its value. If no valid configuration is found, it falls
// back to the default Native allocator.
// Parses env for backend at load time, duplicating some logic from
// CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
// runtime). Defers verbose exceptions and error checks, including Cuda
// version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
// works, maybe we should move all of CUDAAllocatorConfig here?
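// Example (hypothetical): PYTORCH_CUDA_ALLOC_CONF="backend:cudaMallocAsync"
// makes this return CudaMallocAsync::allocator(); other keys in the string
// are skipped here and validated later at runtime.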
CUDAAllocator* parseEnvForBackend() {
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
#ifdef USE_ROCM
@ -4473,35 +4438,34 @@ struct BackendStaticInitializer {
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
}
#endif
if (!val.has_value()) {
val = c10::utils::get_env("PYTORCH_ALLOC_CONF");
}
if (val.has_value()) {
c10::CachingAllocator::ConfigTokenizer tokenizer(val.value());
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "backend") {
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
if (tokenizer[i] == "cudaMallocAsync"
const std::string& config = val.value();
std::regex exp("[\\s,]+");
std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
std::sregex_token_iterator end;
std::vector<std::string> options(it, end);
for (auto option : options) {
std::regex exp2("[:]+");
std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
std::sregex_token_iterator end2;
std::vector<std::string> kv(it2, end2);
if (kv.size() >= 2) {
if (kv[0] == "backend") {
#ifdef USE_ROCM
// convenience for ROCm users to allow either CUDA or HIP env var
|| tokenizer[i] == "hipMallocAsync"
// convenience for ROCm users to allow either CUDA or HIP env var
if (kv[1] == "cudaMallocAsync" || kv[1] == "hipMallocAsync")
#else
if (kv[1] == "cudaMallocAsync")
#endif
) {
return CudaMallocAsync::allocator();
return CudaMallocAsync::allocator();
if (kv[1] == "native")
return &Native::allocator;
}
break;
} else {
// Skip the key and its value
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
// Default fallback allocator.
return &Native::allocator;
}

View File

@ -118,8 +118,7 @@ struct TraceEntry {
MempoolId_t mempool,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr,
std::string compile_context = "",
std::string user_metadata = "")
std::string compile_context = "")
: action_(action),
device_(device),
addr_(addr),
@ -127,8 +126,7 @@ struct TraceEntry {
stream_(stream),
size_(size),
mempool_(std::move(mempool)),
compile_context_(std::move(compile_context)),
user_metadata_(std::move(user_metadata)) {
compile_context_(std::move(compile_context)) {
time_.approx_t_ = time;
}
Action action_;
@ -140,7 +138,6 @@ struct TraceEntry {
MempoolId_t mempool_;
trace_time_ time_{};
std::string compile_context_;
std::string user_metadata_;
};
// Calls made by record_function will save annotations
@ -300,10 +297,6 @@ class CUDAAllocator : public DeviceAllocator {
const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
virtual void pushCompileContext(std::string& md) {}
virtual void popCompileContext() {}
virtual void setUserMetadata(const std::string& metadata) {}
virtual std::string getUserMetadata() {
return "";
}
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
@ -543,14 +536,6 @@ inline void enablePeerAccess(
get()->enablePeerAccess(dev, dev_to_access);
}
inline void setUserMetadata(const std::string& metadata) {
get()->setUserMetadata(metadata);
}
inline std::string getUserMetadata() {
return get()->getUserMetadata();
}
} // namespace c10::cuda::CUDACachingAllocator
namespace c10::cuda {

View File

@ -1,6 +1,8 @@
#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

View File

@ -4,6 +4,7 @@
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <unordered_set>
#include <vector>
@ -46,7 +47,7 @@ bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
struct UsageStreamHash {
size_t operator()(const UsageStream& us) const noexcept {
return std::hash<void*>{}(us.stream) + static_cast<size_t>(us.device);
return std::hash<void*>{}(us.stream) + size_t(us.device);
}
};

View File

@ -1,6 +1,7 @@
#include <c10/cuda/CUDAMiscFunctions.h>
#include <c10/util/env.h>
#include <cuda_runtime.h>
#include <cstring>
#include <string>
namespace c10::cuda {

View File

@ -128,7 +128,7 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
} else if (s.isExt()) {
stream << "EXT";
} else {
stream << "PRIORITY " << static_cast<int>(s.getStreamType());
stream << "PRIORITY " << int(s.getStreamType());
}
return stream;
}

View File

@ -1,6 +1,7 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>

View File

@ -328,21 +328,5 @@ struct pair {
T2 second;
};
#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \
MACRO(float); \
MACRO(half); \
MACRO(bfloat); \
MACRO(float2); \
MACRO(long); \
MACRO(char); \
MACRO(uchar); \
MACRO(short); \
MACRO(int);
#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \
MACRO(float); \
MACRO(half); \
MACRO(bfloat);
} // namespace metal
} // namespace c10

View File

@ -67,8 +67,8 @@ TEST(AllocatorConfigTest, allocator_config_test) {
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4);
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
// EXPECT_EQ(
// AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1);
EXPECT_EQ(
@ -101,8 +101,8 @@ TEST(AllocatorConfigTest, allocator_config_test) {
EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
// EXPECT_EQ(
// AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8);
EXPECT_EQ(
AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2);

View File

@ -1,6 +1,7 @@
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/irange.h>
#include <fmt/format.h>
namespace c10 {
@ -46,8 +47,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
for (const auto i : c10::irange(replicates)) {
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
scale_factors[i] =
static_cast<double>(delta_ns) / static_cast<double>(delta_approx);
scale_factors[i] = (double)delta_ns / (double)delta_approx;
}
std::sort(scale_factors.begin(), scale_factors.end());
long double scale_factor = scale_factors[replicates / 2 + 1];
@ -65,8 +65,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
for (const auto i : c10::irange(replicates)) {
auto dt = start_times_[i].t_ - t0;
auto dt_approx =
static_cast<double>(start_times_[i].approx_t_ - t0_approx) *
scale_factor;
(double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
t0_correction[i] = dt - (time_t)dt_approx; // NOLINT
}
t0 += t0_correction[t0_correction.size() / 2 + 1]; // NOLINT
@ -74,9 +73,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter::
return [=](approx_time_t t_approx) {
// See above for why this is more stable than `A * t_approx + B`.
return t_approx > t0_approx
? static_cast<time_t>(
static_cast<double>(t_approx - t0_approx) * scale_factor) +
t0
? (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0
: 0;
};
}
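// In effect the returned closure computes (a summary, not new behavior):
//   t_unix(t_approx) ~= t0 + scale_factor * (t_approx - t0_approx)
// where scale_factor is the middle element of the sorted per-replicate
// delta_ns / delta_approx ratios gathered above.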

View File

@ -18,6 +18,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
#include <array>
#include <cstddef>
@ -40,200 +41,106 @@ namespace c10 {
///
/// This is intended to be trivially copyable, so it should be passed by
/// value.
///
/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct
/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of
/// the underlying constexpr calls, we rely on apparent-type dispatch for
/// inheritance. This should be fine because their memory format is the same,
/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods.
/// However, you should prefer to use ArrayRef when possible, because its use
/// of TORCH_CHECK will lead to better user-facing error messages.
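///
/// A minimal usage sketch (hypothetical, for illustration):
///   std::vector<int> v{1, 2, 3};
///   ArrayRef<int> a(v);           // vector ctor declared below (CTAD-friendly)
///   ArrayRef<int> b = a.slice(1); // bounds-checked with TORCH_CHECK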
template <typename T>
class ArrayRef final {
class ArrayRef final : public HeaderOnlyArrayRef<T> {
public:
using iterator = const T*;
using const_iterator = const T*;
using size_type = size_t;
using value_type = T;
using reverse_iterator = std::reverse_iterator<iterator>;
private:
/// The start of the array, in an external buffer.
const T* Data;
/// The number of elements.
size_type Length;
void debugCheckNullptrInvariant() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
Data != nullptr || Length == 0,
"created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal");
}
public:
/// @name Constructors
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
/// SmallVector.
/// @{
/// Construct an empty ArrayRef.
/* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {}
using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef;
/// Construct an ArrayRef from a single element.
// TODO Make this explicit
constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {}
/// Construct an ArrayRef from a pointer and length.
constexpr ArrayRef(const T* data, size_t length)
: Data(data), Length(length) {
debugCheckNullptrInvariant();
}
/// Construct an ArrayRef from a range.
constexpr ArrayRef(const T* begin, const T* end)
: Data(begin), Length(end - begin) {
debugCheckNullptrInvariant();
}
/// Construct an ArrayRef from a std::vector.
/// This constructor is identical to the one in HeaderOnlyArrayRef, but we
/// include it to help with Class Template Argument Deduction (CTAD).
/// Without it, CTAD can fail sometimes due to the indirect constructor
/// inheritance. So we explicitly include this constructor.
template <typename A>
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
/// Construct an ArrayRef from a SmallVector. This is templated in order to
/// avoid instantiating SmallVectorTemplateCommon<T> whenever we
/// copy-construct an ArrayRef.
/// NOTE: this is the only constructor that is not inherited from
/// HeaderOnlyArrayRef.
template <typename U>
/* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec)
: Data(Vec.data()), Length(Vec.size()) {
debugCheckNullptrInvariant();
}
template <
typename Container,
typename U = decltype(std::declval<Container>().data()),
typename = std::enable_if_t<
(std::is_same_v<U, T*> || std::is_same_v<U, T const*>)>>
/* implicit */ ArrayRef(const Container& container)
: Data(container.data()), Length(container.size()) {
debugCheckNullptrInvariant();
}
/// Construct an ArrayRef from a std::vector.
// The enable_if stuff here makes sure that this isn't used for
// std::vector<bool>, because ArrayRef can't work on a std::vector<bool>
// bitfield.
template <typename A>
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
: Data(Vec.data()), Length(Vec.size()) {
static_assert(
!std::is_same_v<T, bool>,
"ArrayRef<bool> cannot be constructed from a std::vector<bool> bitfield.");
}
/// Construct an ArrayRef from a std::array
template <size_t N>
/* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr)
: Data(Arr.data()), Length(N) {}
/// Construct an ArrayRef from a C array.
template <size_t N>
// NOLINTNEXTLINE(*c-arrays*)
/* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
/// Construct an ArrayRef from a std::initializer_list.
/* implicit */ constexpr ArrayRef(const std::initializer_list<T>& Vec)
: Data(
std::begin(Vec) == std::end(Vec) ? static_cast<T*>(nullptr)
: std::begin(Vec)),
Length(Vec.size()) {}
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
/// @}
/// @name Simple Operations
/// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef
/// @{
constexpr iterator begin() const {
return Data;
}
constexpr iterator end() const {
return Data + Length;
}
// These are actually the same as iterator, since ArrayRef only
// gives you const iterators.
constexpr const_iterator cbegin() const {
return Data;
}
constexpr const_iterator cend() const {
return Data + Length;
}
constexpr reverse_iterator rbegin() const {
return reverse_iterator(end());
}
constexpr reverse_iterator rend() const {
return reverse_iterator(begin());
}
/// Check if all elements in the array satisfy the given expression
constexpr bool allMatch(const std::function<bool(const T&)>& pred) const {
return std::all_of(cbegin(), cend(), pred);
}
/// empty - Check if the array is empty.
constexpr bool empty() const {
return Length == 0;
}
constexpr const T* data() const {
return Data;
}
/// size - Get the array size.
constexpr size_t size() const {
return Length;
}
/// front - Get the first element.
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr const T& front() const {
TORCH_CHECK(
!empty(), "ArrayRef: attempted to access front() of empty list");
return Data[0];
!this->empty(), "ArrayRef: attempted to access front() of empty list");
return this->Data[0];
}
/// back - Get the last element.
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr const T& back() const {
TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list");
return Data[Length - 1];
}
/// equals - Check for element-wise equality.
constexpr bool equals(ArrayRef RHS) const {
return Length == RHS.Length && std::equal(begin(), end(), RHS.begin());
TORCH_CHECK(
!this->empty(), "ArrayRef: attempted to access back() of empty list");
return this->Data[this->Length - 1];
}
/// slice(n, m) - Take M elements of the array starting at element N
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr ArrayRef<T> slice(size_t N, size_t M) const {
TORCH_CHECK(
N + M <= size(),
N + M <= this->size(),
"ArrayRef: invalid slice, N = ",
N,
"; M = ",
M,
"; size = ",
size());
return ArrayRef<T>(data() + N, M);
this->size());
return ArrayRef<T>(this->data() + N, M);
}
/// slice(n) - Chop off the first N elements of the array.
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr ArrayRef<T> slice(size_t N) const {
TORCH_CHECK(
N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size());
return slice(N, size() - N);
N <= this->size(),
"ArrayRef: invalid slice, N = ",
N,
"; size = ",
this->size());
return slice(N, this->size() - N); // should this slice be this->slice?
}
/// @}
/// @name Operator Overloads
/// @{
constexpr const T& operator[](size_t Index) const {
return Data[Index];
}
/// Vector compatibility
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
/// STD_TORCH_CHECK
constexpr const T& at(size_t Index) const {
TORCH_CHECK(
Index < Length,
Index < this->Length,
"ArrayRef: invalid index Index = ",
Index,
"; Length = ",
Length);
return Data[Index];
this->Length);
return this->Data[Index];
}
/// Disallow accidental assignment from a temporary.
@ -253,13 +160,6 @@ class ArrayRef final {
std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
std::initializer_list<U>) = delete;
/// @}
/// @name Expensive Operations
/// @{
std::vector<T> vec() const {
return std::vector<T>(Data, Data + Length);
}
/// @}
};

View File

@ -1,5 +1,7 @@
#include <c10/util/complex.h>
#include <cmath>
// Note [ Complex Square root in libc++]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// In libc++ complex square root is computed using polar form

View File

@ -132,15 +132,15 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) {
int div_base_log = 0;
switch (flags & std::ios::basefield) {
case std::ios::hex:
div = static_cast<uint64_t>(0x1000000000000000u); // 16^15
div = (uint64_t)0x1000000000000000u; // 16^15
div_base_log = 15;
break;
case std::ios::oct:
div = static_cast<uint64_t>(01000000000000000000000u); // 8^21
div = (uint64_t)01000000000000000000000u; // 8^21
div_base_log = 21;
break;
default: // std::ios::dec
div = static_cast<uint64_t>(10000000000000000000u); // 10^19
div = (uint64_t)10000000000000000000u; // 10^19
div_base_log = 19;
break;
}

View File

@ -11,6 +11,7 @@
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>

View File

@ -74,7 +74,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
)
code.append(" " + OutType + "* op = &out[rangeIndex * block_size];")
for i in range(uf):
for i in range(0, uf):
j = 8 * i
code.append(" __m256 vop" + str(j) + " = _mm256_setzero_ps();")
@ -158,7 +158,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
"&input[idx_pref_T0 * fused_block_size];"
)
for i in range(uf):
for i in range(0, uf):
j = 8 * i
cachelinesize = 64
byteoffset = sizeof[InType] * j
@ -170,7 +170,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
code.append(" if (!normalize_by_lengths || length == 0) {")
else:
code.append(" if (!normalize_by_lengths || lengths[rangeIndex] == 0) {")
- for i in range(uf):
+ for i in range(0, uf):
j = 8 * i
code.append(" _mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");")
code.append(" } else {")
@@ -181,7 +181,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
code.append(
" __m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);"
)
- for i in range(uf):
+ for i in range(0, uf):
j = 8 * i
code.append(
" _mm256_storeu_ps(&op["

View File

@@ -224,12 +224,6 @@ AMD/ROCm/HIP
- Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
- (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)
- XPU/Intel GPU
- ~~~~~~~~~~~~~
- - Eikan Wang (`EikanWang <https://github.com/EikanWang>`__)
- - Guangye Yu (`guangyey <https://github.com/guangyey>`__)
Build + CI
~~~~~~~~~~

View File

@@ -159,6 +159,8 @@ ignore = [
"EXE001",
"F405",
"FURB122", # writelines
+ # these ignores are from flake8-logging-format; please fix!
+ "G101",
# these ignores are from ruff NPY; please fix!
"NPY002",
# these ignores are from ruff PERF; please fix!
@@ -202,10 +204,14 @@ select = [
"NPY",
"PERF",
"PGH004",
"PIE",
"PIE790",
"PIE794",
"PIE800",
"PIE804",
"PIE807",
"PIE810",
"PLC0131", # type bivariance
"PLC0132", # type param mismatch
"PLC1802", # len({expression}) used as condition without comparison
"PLC0205", # string as __slots__
"PLC3002", # unnecessary-direct-lambda-call
"PLE",
@@ -213,7 +219,6 @@ select = [
"PLR0206", # property with params
"PLR1722", # use sys exit
"PLR1736", # unnecessary list index
"PLW0127", # Self-assignment of variable
"PLW0129", # assert on string literal
"PLW0131", # named expr without context
"PLW0133", # useless exception statement

View File

@@ -5,7 +5,6 @@ python-version = "3.12"
project-includes = [
"torch",
"caffe2",
"tools",
"test/test_bundled_images.py",
"test/test_bundled_inputs.py",
"test/test_complex.py",
@@ -23,15 +22,10 @@ project-includes = [
project-excludes = [
# ==== below will be enabled directory by directory ====
# ==== to test Pyrefly on a specific directory, simply comment it out ====
"torch/_inductor/runtime",
"torch/_inductor/codegen/triton.py",
"tools/linter/adapters/test_device_bias_linter.py",
"tools/code_analyzer/gen_operators_yaml.py",
"torch/_inductor/runtime/triton_heuristics.py",
"torch/_inductor/runtime/triton_helpers.py",
"torch/_inductor/runtime/halide_helpers.py",
# formatting issues, will turn on after adjusting where suppressions can be
# in import statements
"tools/flight_recorder/components/types.py",
"torch/linalg/__init__.py",
"torch/package/importer.py",
"torch/package/_package_pickler.py",
@@ -46,6 +40,17 @@ project-excludes = [
"torch/distributed/elastic/metrics/__init__.py",
"torch/_inductor/fx_passes/bucketing.py",
# ====
"benchmarks/instruction_counts/main.py",
"benchmarks/instruction_counts/definitions/setup.py",
"benchmarks/instruction_counts/applications/ci.py",
"benchmarks/instruction_counts/core/api.py",
"benchmarks/instruction_counts/core/expand.py",
"benchmarks/instruction_counts/core/types.py",
"benchmarks/instruction_counts/core/utils.py",
"benchmarks/instruction_counts/definitions/standard.py",
"benchmarks/instruction_counts/definitions/setup.py",
"benchmarks/instruction_counts/execution/runner.py",
"benchmarks/instruction_counts/execution/work.py",
"torch/include/**",
"torch/csrc/**",
"torch/distributed/elastic/agent/server/api.py",
@@ -132,4 +137,3 @@ errors.bad-param-name-override = false
errors.implicit-import = false
permissive-ignores = true
replace-imports-with-any = ["!sympy.printing.*", "sympy.*", "onnxscript.onnx_opset.*"]
- search-path = ["tools/experimental"]

View File

@@ -190,7 +190,7 @@ class TestActivationSparsifier(TestCase):
if features is None:
assert torch.all(mask * input_data == output)
else:
- for feature_idx in range(len(features)):
+ for feature_idx in range(0, len(features)):
feature = torch.Tensor(
[features[feature_idx]], device=input_data.device
).long()
@@ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase):
# some dummy data
data_list = []
num_data_points = 5
- for _ in range(num_data_points):
+ for _ in range(0, num_data_points):
rand_data = torch.randn(16, 1, 28, 28)
activation_sparsifier.model(rand_data)
data_list.append(rand_data)

View File

@@ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase):
# checking step count
step_cnt = 5
- for _ in range(step_cnt):
+ for _ in range(0, step_cnt):
sparsifier.step()
scheduler.step()

View File

@@ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase):
step_count = 3
- for _ in range(step_count):
+ for _ in range(0, step_count):
sparsifier.step()
for some_data in all_data:
name, data, _ = self._get_name_data_config(some_data)

View File

@@ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase):
else:
height, width = mask.shape
dist_to_diagonal = nearliness // 2
- for row in range(height):
- for col in range(width):
+ for row in range(0, height):
+ for col in range(0, width):
if abs(row - col) <= dist_to_diagonal:
assert mask[row, col] == 1
else:

View File

@@ -7,6 +7,7 @@ set(AOTI_ABI_CHECK_TEST_SRCS
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
+ ${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp

View File

@@ -0,0 +1,52 @@
#include <gtest/gtest.h>
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
#include <vector>
using torch::headeronly::HeaderOnlyArrayRef;
TEST(TestHeaderOnlyArrayRef, TestEmpty) {
HeaderOnlyArrayRef<float> arr;
ASSERT_TRUE(arr.empty());
}
TEST(TestHeaderOnlyArrayRef, TestSingleton) {
float val = 5.0f;
HeaderOnlyArrayRef<float> arr(val);
ASSERT_FALSE(arr.empty());
EXPECT_EQ(arr.size(), 1);
EXPECT_EQ(arr[0], val);
}
TEST(TestHeaderOnlyArrayRef, TestAPIs) {
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
HeaderOnlyArrayRef<int> arr(vec);
ASSERT_FALSE(arr.empty());
EXPECT_EQ(arr.size(), 7);
for (size_t i = 0; i < arr.size(); i++) {
EXPECT_EQ(arr[i], i + 1);
EXPECT_EQ(arr.at(i), i + 1);
}
EXPECT_EQ(arr.front(), 1);
EXPECT_EQ(arr.back(), 7);
ASSERT_TRUE(arr.slice(3, 4).equals(arr.slice(3)));
}
TEST(TestHeaderOnlyArrayRef, TestFromInitializerList) {
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
HeaderOnlyArrayRef<int> arr({1, 2, 3, 4, 5, 6, 7});
auto res_vec = arr.vec();
for (size_t i = 0; i < vec.size(); i++) {
EXPECT_EQ(vec[i], res_vec[i]);
}
}
TEST(TestHeaderOnlyArrayRef, TestFromRange) {
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
HeaderOnlyArrayRef<int> arr(vec.data() + 3, vec.data() + 7);
auto res_vec = arr.vec();
for (size_t i = 0; i < res_vec.size(); i++) {
EXPECT_EQ(vec[i + 3], res_vec[i]);
}
}

View File

@@ -53,3 +53,24 @@ TEST_FORALL(AT_FORALL_COMPLEX_TYPES, 2)
#undef DEFINE_CHECK
#undef TEST_FORALL
TEST(TestScalarType, toString) {
using torch::headeronly::ScalarType;
#define DEFINE_CHECK(_, name) EXPECT_EQ(toString(ScalarType::name), #name);
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}
TEST(TestScalarType, operator_left_shift) {
using torch::headeronly::ScalarType;
#define DEFINE_CHECK(_, name) \
{ \
std::stringstream ss; \
ss << ScalarType::name; \
EXPECT_EQ(ss.str(), #name); \
}
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}
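Both tests lean on the X-macro pattern: `AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS` applies `DEFINE_CHECK` to every (ctype, name) pair, and `#name` stringizes the enumerator. A self-contained sketch of the same pattern with a toy type list (not the real AT_FORALL macro):

```cpp
#include <cassert>
#include <cstring>

// Toy X-macro list: applies _(ctype, Name) once per supported type.
#define MY_FORALL_TYPES(_) \
  _(float, Float)          \
  _(double, Double)        \
  _(int, Int)

enum class Tag {
#define DEFINE_ENUM(_, name) name,
  MY_FORALL_TYPES(DEFINE_ENUM)
#undef DEFINE_ENUM
};

const char* toString(Tag t) {
  switch (t) {
#define DEFINE_CASE(_, name) \
  case Tag::name:            \
    return #name;  // stringized enumerator
    MY_FORALL_TYPES(DEFINE_CASE)
#undef DEFINE_CASE
  }
  return "Unknown";
}

int main() {
  // Same shape as the EXPECT_EQ(toString(...), #name) checks above.
#define DEFINE_CHECK(_, name) \
  assert(std::strcmp(toString(Tag::name), #name) == 0);
  MY_FORALL_TYPES(DEFINE_CHECK)
#undef DEFINE_CHECK
  return 0;
}
```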

View File

@@ -311,10 +311,9 @@ void boxed_fill_infinity(
}
Tensor my_pad(Tensor t) {
- std::vector<int64_t> padding = {1, 2, 2, 1};
std::string mode = "constant";
double value = 0.0;
- return pad(t, padding, mode, value);
+ return pad(t, {1, 2, 2, 1}, mode, value);
}
void boxed_my_pad(
@@ -342,6 +341,9 @@ void boxed_my_narrow(
}
Tensor my_new_empty_dtype_variant(Tensor t) {
+ // Still using a std::vector below even though people can just pass in an
+ // initializer list (which will be implicitly converted to an HeaderOnlyArrayRef)
+ // directly.
std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(torch::headeronly::ScalarType::BFloat16);
return new_empty(t, sizes, dtype);
@@ -353,9 +355,8 @@ void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, ui
}
Tensor my_new_zeros_dtype_variant(Tensor t) {
- std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(at::ScalarType::Float);
- return new_zeros(t, sizes, dtype);
+ return new_zeros(t, {2, 5}, dtype);
}
void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
@@ -429,8 +430,7 @@ void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs)
}
Tensor my_amax_vec(Tensor t) {
- std::vector<int64_t> v = {0,1};
- return amax(t, v, false);
+ return amax(t, {0,1}, false);
}
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
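The recurring simplification in this file works because a braced list binds directly to the ArrayRef-style parameter, so no named `std::vector` is needed at call sites. A toy sketch of how such a conversion works (illustrative `MiniArrayRef`, not the actual `torch::headeronly::HeaderOnlyArrayRef`):

```cpp
#include <cstddef>
#include <cstdio>
#include <initializer_list>

// Non-owning view; the initializer_list constructor is what lets
// callers write f({1, 2, 2, 1}) without a named vector.
template <typename T>
class MiniArrayRef {
 public:
  /*implicit*/ MiniArrayRef(std::initializer_list<T> il)
      : data_(il.begin()), len_(il.size()) {}
  const T* begin() const { return data_; }
  const T* end() const { return data_ + len_; }

 private:
  const T* data_;
  size_t len_;
};

long long sum(MiniArrayRef<long long> xs) {
  long long total = 0;
  for (long long x : xs) total += x;  // the list outlives the full call
  return total;
}

int main() {
  std::printf("%lld\n", sum({1, 2, 2, 1}));  // prints 6
}
```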

View File

@@ -1166,7 +1166,7 @@ class TestFullyShardPrefetch(FSDPTest):
loss = model(inp)
events.clear()
loss.sum().backward()
- expected_backward_events = [
+ expected_backward_events = expected_backward_events = [
("unshard", "norm, output", TrainingState.PRE_BACKWARD),
# root explicit prefetch layers.2
("unshard", "layers.2", TrainingState.PRE_BACKWARD),

View File

@@ -79,7 +79,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="gloo"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.group.WORLD
self._test_all_gather(
group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16
@@ -94,7 +94,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="gloo"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.group.WORLD
self._test_all_gather(
group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16
@@ -111,7 +111,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all(
@@ -135,7 +135,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all(
@@ -158,7 +158,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all_single(
@@ -181,7 +181,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
dist.init_process_group(
store=store, rank=self.rank, world_size=self.world_size, backend="nccl"
)
- group = list(range(self.world_size))
+ group = list(range(0, self.world_size))
group_id = dist.new_group(range(self.world_size))
rank_to_GPU = init_multigpu_helper(self.world_size, BACKEND)
self._test_all_to_all_single(

View File

@@ -66,7 +66,7 @@ if TEST_WITH_DEV_DBG_ASAN:
def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8):
shards_metadata = []
local_shards = []
- for idx in range(world_size * shards_per_rank):
+ for idx in range(0, world_size * shards_per_rank):
shard_rank = idx // shards_per_rank
shard_md = ShardMetadata(
shard_offsets=[idx * shard_size],

View File

@@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN:
def create_sharded_tensor(rank, world_size, shards_per_rank):
shards_metadata = []
local_shards = []
- for idx in range(world_size * shards_per_rank):
+ for idx in range(0, world_size * shards_per_rank):
shard_rank = idx // shards_per_rank
shard_md = ShardMetadata(
shard_offsets=[idx * 8], shard_sizes=[8], placement=f"rank:{shard_rank}/cpu"

View File

@@ -633,7 +633,7 @@ class SimpleElasticAgentTest(unittest.TestCase):
worker_group = agent.get_worker_group()
num_restarts = 3
- for _ in range(num_restarts):
+ for _ in range(0, num_restarts):
agent._restart_workers(worker_group)
self.assertEqual(WorkerState.HEALTHY, worker_group.state)

View File

@@ -146,7 +146,7 @@ def echo_large(size: int) -> dict[int, str]:
returns a large output ({0: test0", 1: "test1", ..., (size-1):f"test{size-1}"})
"""
out = {}
- for idx in range(size):
+ for idx in range(0, size):
out[idx] = f"test{idx}"
return out

View File

@@ -191,7 +191,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):
"""
client = timer.FileTimerClient(file_path)
sem.release()
- for _ in range(n):
+ for _ in range(0, n):
client.acquire("test_scope", 0)
time.sleep(interval)

View File

@@ -102,7 +102,7 @@ if not (IS_WINDOWS or IS_MACOS or IS_ARM64):
world_size = 8
processes = []
- for i in range(world_size):
+ for i in range(0, world_size):
if i % 2 == 0:
p = spawn_ctx.Process(target=_stuck_function, args=(i, mp_queue))
else:
@@ -110,7 +110,7 @@
p.start()
processes.append(p)
- for i in range(world_size):
+ for i in range(0, world_size):
p = processes[i]
p.join()
if i % 2 == 0:

View File

@@ -127,7 +127,7 @@ if not INVALID_PLATFORMS:
interval seconds. Releases the given semaphore once before going to work.
"""
sem.release()
- for i in range(n):
+ for i in range(0, n):
mp_queue.put(TimerRequest(i, "test_scope", 0))
time.sleep(interval)

View File

@@ -15,7 +15,7 @@ class CyclingIteratorTest(unittest.TestCase):
def generator(self, epoch, stride, max_epochs):
# generate an continuously incrementing list each epoch
# e.g. [0,1,2] [3,4,5] [6,7,8] ...
- return iter([stride * epoch + i for i in range(stride)])
+ return iter([stride * epoch + i for i in range(0, stride)])
def test_cycling_iterator(self):
stride = 3
@@ -25,7 +25,7 @@
return self.generator(epoch, stride, max_epochs)
it = CyclingIterator(n=max_epochs, generator_fn=generator_fn)
- for i in range(stride * max_epochs):
+ for i in range(0, stride * max_epochs):
self.assertEqual(i, next(it))
with self.assertRaises(StopIteration):

Some files were not shown because too many files have changed in this diff.