Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 20:34:54 +08:00)

Compare commits

6 commits: cpp-docs-d ... gh/jamesjw
Commits in this range:
- 44e06f3013
- 85aa336a76
- 40ecac596b
- cdb9f631df
- ebca32865c
- ddf3568a48
```diff
@@ -83,6 +83,10 @@ function build_cpython {
         py_suffix=${py_ver::-1}
         py_folder=$py_suffix
     fi
+    # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
+    if [ "$py_suffix" == "3.14.0" ]; then
+        py_suffix="3.14.0rc2"
+    fi
     wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
     do_cpython_build $py_ver Python-$py_suffix
 
```
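The added block pins the 3.14.0 download to the rc2 tarball while keeping the final-version directory name. A minimal sketch of the resulting URL, assuming PYTHON_DOWNLOAD_URL points at the python.org release tree as in the manywheel build scripts:

```bash
#!/usr/bin/env bash
# Sketch of the rc2 remapping above. PYTHON_DOWNLOAD_URL is an assumption
# (the python.org release tree); py_folder keeps the final-version directory
# while the tarball name carries the rc suffix.
PYTHON_DOWNLOAD_URL=${PYTHON_DOWNLOAD_URL:-https://www.python.org/ftp/python}
py_suffix="3.14.0"
py_folder=$py_suffix
if [ "$py_suffix" == "3.14.0" ]; then
    py_suffix="3.14.0rc2"
fi
echo "$PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz"
# -> https://www.python.org/ftp/python/3.14.0/Python-3.14.0rc2.tgz
```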
| @ -1,11 +1,15 @@ | ||||
| sphinx==7.2.6 | ||||
| sphinx==5.3.0 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 7.2.6 | ||||
| #Pinned versions: 5.3.0 | ||||
|  | ||||
| pytorch_sphinx_theme2==0.1.0 | ||||
| #Description: This is needed to generate PyTorch docs | ||||
| #Pinned versions: 0.1.0 | ||||
| standard-imghdr==3.13.0; python_version >= "3.13" | ||||
| #Description: This is needed by Sphinx, so it needs to be added here. | ||||
| # The reasons are as follows: | ||||
| # 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); | ||||
| # 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. | ||||
| # Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. | ||||
|  | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 | ||||
| # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering | ||||
| # but it doesn't seem to work and hangs around idly. The initial thought that it is probably | ||||
| # something related to Docker setup. We can investigate this later. | ||||
| @ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13" | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 2.13.0 | ||||
|  | ||||
| breathe==4.36.0 | ||||
| breathe==4.34.0 | ||||
| #Description: This is used to generate PyTorch C++ docs | ||||
| #Pinned versions: 4.36.0 | ||||
| #Pinned versions: 4.34.0 | ||||
|  | ||||
| exhale==0.3.7 | ||||
| exhale==0.2.3 | ||||
| #Description: This is used to generate PyTorch C++ docs | ||||
| #Pinned versions: 0.3.7 | ||||
| #Pinned versions: 0.2.3 | ||||
|  | ||||
| docutils==0.20 | ||||
| docutils==0.16 | ||||
| #Description: This is used to generate PyTorch C++ docs | ||||
| #Pinned versions: 0.20 | ||||
| #Pinned versions: 0.16 | ||||
|  | ||||
| bs4==0.0.1 | ||||
| #Description: This is used to generate PyTorch C++ docs | ||||
| @ -52,13 +56,13 @@ IPython==8.12.0 | ||||
| #Description: This is used to generate PyTorch functorch docs | ||||
| #Pinned versions: 8.12.0 | ||||
|  | ||||
| myst-nb==1.3.0 | ||||
| myst-nb==0.17.2 | ||||
| #Description: This is used to generate PyTorch functorch and torch.compile docs. | ||||
| #Pinned versions: 1.3.0 | ||||
| #Pinned versions: 0.17.2 | ||||
|  | ||||
| # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs | ||||
| python-etcd==0.4.5 | ||||
| sphinx-copybutton==0.5.0 | ||||
| sphinx-design==0.6.1 | ||||
| sphinx-design==0.4.0 | ||||
| sphinxcontrib-mermaid==1.0.0 | ||||
| myst-parser==4.0.1 | ||||
| myst-parser==0.18.1 | ||||
|  | ||||
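The standard-imghdr pin exists because PEP 594 removed the imghdr module from the standard library in Python 3.13, while Sphinx 5.3.0 still imports it. A hedged sketch of what the backport restores:

```python
# Assumes the standard-imghdr backport is installed on Python 3.13+;
# it reinstates the `imghdr` module that PEP 594 removed.
import sys

try:
    import imghdr  # stdlib on <= 3.12, provided by standard-imghdr on >= 3.13
except ImportError:
    sys.exit("imghdr missing: install standard-imghdr on Python >= 3.13")

# imghdr.what() sniffs an image type from header bytes; with h= given,
# the file argument is ignored.
print(imghdr.what(None, h=b"\x89PNG\r\n\x1a\n"))  # -> "png"
```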
```diff
@@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
         logger.info("Successfully cloned %s", target)
         return r, commit
 
-    except GitCommandError:
-        logger.exception("Git operation failed")
+    except GitCommandError as e:
+        logger.error("Git operation failed: %s", e)
         raise
 
 
```
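The change trades logger.exception for logger.error with the exception interpolated: the former attaches a full traceback, the latter records only the message before the re-raise. A self-contained illustration using plain logging (no GitPython dependency):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("clone")

try:
    raise RuntimeError("simulated git failure")
except RuntimeError as e:
    # logger.exception logs at ERROR level and appends the traceback;
    # it is intended to be called from an except block.
    logger.exception("Git operation failed")
    # logger.error with %s records only the exception text, no traceback,
    # which is what the new code opts for before re-raising.
    logger.error("Git operation failed: %s", e)
```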
```diff
@@ -102,18 +102,8 @@ if [ "$is_main_doc" = true ]; then
     echo coverage output not found
     exit 1
   elif [ $undocumented -gt 0 ]; then
-    echo "======================================"
-    echo "ERROR: $undocumented undocumented objects found!"
-    echo "======================================"
-    echo ""
-    echo "Full coverage report:"
+    echo undocumented objects found:
     cat build/coverage/python.txt
-    echo ""
-    echo "======================================"
-    echo "Undocumented modules/objects (lines after TOTAL):"
-    tail -n +$((lines - undocumented + 1)) build/coverage/python.txt
-    echo "======================================"
-    echo ""
     echo "Make sure you've updated relevant .rsts in docs/source!"
     echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
     exit 1
```
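The removed block used tail -n +K to print only the trailing undocumented-object entries of the coverage report. A sketch of that arithmetic, assuming lines and undocumented were computed earlier in the script:

```bash
# Sketch of the removed extraction logic; `undocumented` is a placeholder
# here, in the real script it is derived from the report's TOTAL row.
report=build/coverage/python.txt
lines=$(wc -l < "$report")
undocumented=7
# tail -n +K prints from line K onward, so this emits exactly the last
# $undocumented lines, i.e. the entries after the TOTAL row.
tail -n "+$((lines - undocumented + 1))" "$report"
```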
```diff
@@ -163,13 +163,8 @@ if [[ "$(uname)" != Darwin ]]; then
   MEMORY_LIMIT_MAX_JOBS=12
   NUM_CPUS=$(( $(nproc) - 2 ))
 
-  if [[ "$(uname)" == Linux ]]; then
-    # Defaults here for **binary** linux builds so they can be changed in one place
-    export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
-  else
-    # For other builds
-    export MAX_JOBS=${NUM_CPUS}
-  fi
+  # Defaults here for **binary** linux builds so they can be changed in one place
+  export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
 
   cat >>"$envfile" <<EOL
   export MAX_JOBS="${MAX_JOBS}"
```
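Two shell idioms carry the surviving line: ${MAX_JOBS:-...} keeps any caller-provided value, and the $(( a > b ? b : a )) arithmetic ternary clamps the job count to the memory limit. A standalone sketch:

```bash
# Standalone sketch of the MAX_JOBS default above.
MEMORY_LIMIT_MAX_JOBS=12
NUM_CPUS=$(( $(nproc) - 2 ))
# ${MAX_JOBS:-default} only applies the computed default when MAX_JOBS is
# unset or empty, so an environment override still wins.
export MAX_JOBS=${MAX_JOBS:-$(( NUM_CPUS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : NUM_CPUS ))}
echo "building with MAX_JOBS=$MAX_JOBS"
```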
							
								
								
									
Changed file: .flake8 (6 changed lines)
```diff
@@ -7,12 +7,16 @@ max-line-length = 120
 # C408 ignored because we like the dict keyword argument syntax
 # E501 is not flexible enough, we're using B950 instead
 ignore =
-    E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
+    E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
     # shebang has extra meaning in fbcode lints, so I think it's not worth trying
     # to line this up with executable bit
     EXE001,
     # these ignores are from flake8-bugbear; please fix!
     B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
+    # these ignores are from flake8-comprehensions; please fix!
+    C407,
+    # these ignores are from flake8-logging-format; please fix!
+    G100,G101,G200
     # these ignores are from flake8-simplify. please fix or ignore with commented reason
     SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
     # SIM104 is already covered by pyupgrade ruff
```
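E721, newly added to the ignore list, flags direct type comparisons with ==. A short illustration of the pattern it suppresses:

```python
# E721 fires on direct type comparison with `==`:
x, y = 1, 2

if type(x) == type(y):  # flake8 E721: do not compare types with ==
    print("same type via ==")

# The spellings E721 prefers:
if type(x) is type(y):
    print("same type via identity")
if isinstance(x, int):
    print("int via isinstance")
```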
							
								
								
									
Changed file: .github/ci_commit_pins/audio.txt (2 changed lines, vendored)
```diff
@@ -1 +1 @@
-69bbe7363897764f9e758d851cd0340147d27f94
+1b013f5b5a87a1882eb143c26d79d091150d6a37
```
							
								
								
									
Changed file: .github/labeler.yml (29 changed lines, vendored)
```diff
@@ -133,32 +133,3 @@
 
 "ciflow/vllm":
 - .github/ci_commit_pins/vllm.txt
-
-"ciflow/b200":
-- test/test_matmul_cuda.py
-- test/test_scaled_matmul_cuda.py
-- test/inductor/test_fp8.py
-- aten/src/ATen/native/cuda/Blas.cpp
-- torch/**/*cublas*
-- torch/_inductor/kernel/mm.py
-- test/inductor/test_max_autotune.py
-- third_party/fbgemm
-
-"ciflow/h100":
-- test/test_matmul_cuda.py
-- test/test_scaled_matmul_cuda.py
-- test/inductor/test_fp8.py
-- aten/src/ATen/native/cuda/Blas.cpp
-- torch/**/*cublas*
-- torch/_inductor/kernel/mm.py
-- test/inductor/test_max_autotune.py
-- third_party/fbgemm
-
-"ciflow/rocm":
-- test/test_matmul_cuda.py
-- test/test_scaled_matmul_cuda.py
-- test/inductor/test_fp8.py
-- aten/src/ATen/native/cuda/Blas.cpp
-- torch/_inductor/kernel/mm.py
-- test/inductor/test_max_autotune.py
-- third_party/fbgemm
```
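Each removed rule maps a ciflow label to path globs; a pull request touching any matching file would have been auto-labeled. A rough way to test a pattern such as torch/**/*cublas*, using Python 3.13's PurePath.full_match as a stand-in for the labeler's own glob matching (the dialects are close but not identical, so treat this as an approximation):

```python
from pathlib import PurePosixPath

# Stand-in for the labeler's glob matching; PurePath.full_match (Python
# 3.13+) supports ** across path segments. The sample paths are
# hypothetical, chosen only to exercise the pattern.
pattern = "torch/**/*cublas*"

for path in [
    "torch/cuda/cublas_handles.py",
    "torch/backends/cuda/cublaslt.py",
    "test/test_matmul_cuda.py",
]:
    print(path, PurePosixPath(path).full_match(pattern))
```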
							
								
								
									
Changed file: .github/pytorch-probot.yml (1 changed line, vendored)
```diff
@@ -33,7 +33,6 @@ ciflow_push_tags:
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/rocm-mi355
-- ciflow/rocm-navi31
 - ciflow/s390
 - ciflow/slow
 - ciflow/torchbench
```
							
								
								
									
Changed file: .github/scripts/generate_binary_build_matrix.py (30 changed lines, vendored)
```diff
@@ -79,21 +79,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
     ),
     "12.9": (
-        "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
-        "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
-        "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
-        "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
-        "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
-        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
-        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
-        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
-        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
     "13.0": (
         "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
```
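Every requirement string above carries a PEP 508 environment marker, so pip installs the CUDA wheels only where the marker evaluates true; the change tightens the condition from any Linux host to Linux on x86_64. How such markers evaluate can be checked with the packaging library, which pip itself vendors:

```python
# Evaluating a PEP 508 marker the way pip does, via `packaging`
# (pip install packaging for standalone use).
from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

# Against the current interpreter's environment:
print(marker.evaluate())
# Against an explicit environment, e.g. an aarch64 Linux host, which the
# added platform_machine clause now excludes:
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "aarch64"}))
```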
```diff
@@ -26,8 +26,9 @@ name: !{{ build_environment }}
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
+          # TODO: Removeme once 3.14 is out
           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
+          python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
           freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
 {%- endmacro %}
 
```
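The template expression picks the concrete setup-python version from the build's py_ver string: strip a trailing t (the free-threaded marker), append .4 for stable series, and special-case 3.14 to the release-candidate tag. The same logic in plain Python, for a few sample inputs:

```python
# The template expression above, rewritten as plain Python for illustration.
def setup_python_version(py_ver: str) -> str:
    return (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2'

for v in ["3.10", "3.13", "3.13t", "3.14", "3.14t"]:
    print(v, "->", setup_python_version(v))
# 3.10 -> 3.10.4; 3.13t -> 3.13.4 (freethreaded is set separately);
# 3.14 and 3.14t -> 3.14.0-rc.2
```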
```diff
@@ -79,9 +79,9 @@ jobs:
     runs-on: "windows-11-arm64-preview"
     {%- else %}
     {%- if branches == "nightly" %}
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
     {%- else %}
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral"
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
     {%- endif %}
     {%- endif %}
     timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
```
							
								
								
									
Changed file: .github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (14 changed lines, generated, vendored)
```diff
@@ -224,7 +224,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -473,7 +473,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -722,7 +722,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +971,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1220,7 +1220,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1469,7 +1469,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1718,7 +1718,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
```
							
								
								
									
Changed file: .github/workflows/generated-linux-binary-manywheel-nightly.yml (14 changed lines, generated, vendored)
```diff
@@ -259,7 +259,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_9-test:  # Testing
@@ -925,7 +925,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_9-test:  # Testing
@@ -1591,7 +1591,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_9-test:  # Testing
@@ -2257,7 +2257,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_9-test:  # Testing
@@ -2923,7 +2923,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_9-test:  # Testing
@@ -3589,7 +3589,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda12_9-test:  # Testing
@@ -4255,7 +4255,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_9
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda12_9-test:  # Testing
```
							
								
								
									
Changed file: .github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml (1 changed line, generated, vendored)
```diff
@@ -63,6 +63,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
+          # TODO: Removeme once 3.14 is out
           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
           python-version: "3.10.4"
           freethreaded: false
```
							
								
								
									
Changed file: .github/workflows/generated-macos-arm64-binary-wheel-nightly.yml (11 changed lines, generated, vendored)
							| @ -59,6 +59,7 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.10.4" | ||||
|           freethreaded: false | ||||
| @ -168,6 +169,7 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.11.4" | ||||
|           freethreaded: false | ||||
| @ -277,6 +279,7 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.12.4" | ||||
|           freethreaded: false | ||||
| @ -386,6 +389,7 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Remove me once 3.14 is out | ||||
|           # The .4 patch release is the minimum we pin for 3.10, and the no-GIL build of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.13.4" | ||||
|           freethreaded: false | ||||
| @ -495,6 +499,7 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Remove me once 3.14 is out | ||||
|           # The .4 patch release is the minimum we pin for 3.10, and the no-GIL build of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.13.4" | ||||
|           freethreaded: true | ||||
| @ -604,8 +609,9 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Remove me once 3.14 is out | ||||
|           # The .4 patch release is the minimum we pin for 3.10, and the no-GIL build of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.14.0" | ||||
|           python-version: "3.14.0-rc.2" | ||||
|           freethreaded: false | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
| @ -713,8 +719,9 @@ jobs: | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Remove me once 3.14 is out | ||||
|           # The .4 patch release is the minimum we pin for 3.10, and the no-GIL build of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.14.0" | ||||
|           python-version: "3.14.0-rc.2" | ||||
|           freethreaded: true | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|  | ||||
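The 3.14.0-rc.2 pins above use setup-python's semver-style pre-release syntax, which under PEP 440 names the same version as CPython's own 3.14.0rc2 tag. A quick check, again assuming the packaging library:

    from packaging.version import Version

    a = Version("3.14.0-rc.2")  # normalizes to 3.14.0rc2
    b = Version("3.14.0rc2")
    assert a == b and a.is_prerelease and a.pre == ("rc", 2)
    print(a)  # -> 3.14.0rc2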
8 changes: .github/workflows/generated-windows-binary-libtorch-debug-nightly.yml (generated, vendored)
| @ -44,7 +44,7 @@ jobs: | ||||
|   libtorch-cpu-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -291,7 +291,7 @@ jobs: | ||||
|   libtorch-cuda12_6-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -541,7 +541,7 @@ jobs: | ||||
|   libtorch-cuda12_8-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -791,7 +791,7 @@ jobs: | ||||
|   libtorch-cuda13_0-shared-with-deps-debug-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|  | ||||
8 changes: .github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored)
| @ -44,7 +44,7 @@ jobs: | ||||
|   libtorch-cpu-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -291,7 +291,7 @@ jobs: | ||||
|   libtorch-cuda12_6-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -541,7 +541,7 @@ jobs: | ||||
|   libtorch-cuda12_8-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -791,7 +791,7 @@ jobs: | ||||
|   libtorch-cuda13_0-shared-with-deps-release-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|  | ||||
70 changes: .github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored)
| @ -44,7 +44,7 @@ jobs: | ||||
|   wheel-py3_10-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -279,7 +279,7 @@ jobs: | ||||
|   wheel-py3_10-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -517,7 +517,7 @@ jobs: | ||||
|   wheel-py3_10-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -755,7 +755,7 @@ jobs: | ||||
|   wheel-py3_10-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -993,7 +993,7 @@ jobs: | ||||
|   wheel-py3_10-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1229,7 +1229,7 @@ jobs: | ||||
|   wheel-py3_11-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1464,7 +1464,7 @@ jobs: | ||||
|   wheel-py3_11-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1702,7 +1702,7 @@ jobs: | ||||
|   wheel-py3_11-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -1940,7 +1940,7 @@ jobs: | ||||
|   wheel-py3_11-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2178,7 +2178,7 @@ jobs: | ||||
|   wheel-py3_11-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2414,7 +2414,7 @@ jobs: | ||||
|   wheel-py3_12-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2649,7 +2649,7 @@ jobs: | ||||
|   wheel-py3_12-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -2887,7 +2887,7 @@ jobs: | ||||
|   wheel-py3_12-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3125,7 +3125,7 @@ jobs: | ||||
|   wheel-py3_12-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3363,7 +3363,7 @@ jobs: | ||||
|   wheel-py3_12-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3599,7 +3599,7 @@ jobs: | ||||
|   wheel-py3_13-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -3834,7 +3834,7 @@ jobs: | ||||
|   wheel-py3_13-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4072,7 +4072,7 @@ jobs: | ||||
|   wheel-py3_13-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4310,7 +4310,7 @@ jobs: | ||||
|   wheel-py3_13-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4548,7 +4548,7 @@ jobs: | ||||
|   wheel-py3_13-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -4784,7 +4784,7 @@ jobs: | ||||
|   wheel-py3_13t-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5019,7 +5019,7 @@ jobs: | ||||
|   wheel-py3_13t-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5257,7 +5257,7 @@ jobs: | ||||
|   wheel-py3_13t-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5495,7 +5495,7 @@ jobs: | ||||
|   wheel-py3_13t-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5733,7 +5733,7 @@ jobs: | ||||
|   wheel-py3_13t-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -5969,7 +5969,7 @@ jobs: | ||||
|   wheel-py3_14-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6204,7 +6204,7 @@ jobs: | ||||
|   wheel-py3_14-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6442,7 +6442,7 @@ jobs: | ||||
|   wheel-py3_14-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6680,7 +6680,7 @@ jobs: | ||||
|   wheel-py3_14-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -6918,7 +6918,7 @@ jobs: | ||||
|   wheel-py3_14-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7154,7 +7154,7 @@ jobs: | ||||
|   wheel-py3_14t-cpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7389,7 +7389,7 @@ jobs: | ||||
|   wheel-py3_14t-cuda12_6-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7627,7 +7627,7 @@ jobs: | ||||
|   wheel-py3_14t-cuda12_8-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -7865,7 +7865,7 @@ jobs: | ||||
|   wheel-py3_14t-cuda13_0-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
| @ -8103,7 +8103,7 @@ jobs: | ||||
|   wheel-py3_14t-xpu-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     timeout-minutes: 360 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|  | ||||
63 changes: .github/workflows/rocm-navi31.yml (vendored; file deleted)
| @ -1,63 +0,0 @@ | ||||
| name: rocm-navi31 | ||||
|  | ||||
| on: | ||||
|   push: | ||||
|     tags: | ||||
|       - ciflow/rocm-navi31/* | ||||
|   workflow_dispatch: | ||||
|   schedule: | ||||
|     # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs. | ||||
|     # Also run less frequently on weekends. | ||||
|     - cron: 45 */2 * * 1-5 | ||||
|     - cron: 45 4,12 * * 0,6 | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||
|   cancel-in-progress: true | ||||
|  | ||||
| permissions: read-all | ||||
|  | ||||
| jobs: | ||||
|   target-determination: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: before-test | ||||
|     uses: ./.github/workflows/target_determination.yml | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-build: | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     name: linux-jammy-rocm-py3.10 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     with: | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, | ||||
|           { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-test: | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|     name: linux-jammy-rocm-py3_10 | ||||
|     uses: ./.github/workflows/_rocm-test.yml | ||||
|     needs: | ||||
|       - linux-jammy-rocm-py3_10-build | ||||
|       - target-determination | ||||
|     with: | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} | ||||
|       tests-to-include: >- | ||||
|          ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs | ||||
|          test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark | ||||
|          inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor | ||||
|          inductor/test_torchinductor inductor/test_decompose_mem_bound_mm | ||||
|          inductor/test_flex_attention inductor/test_max_autotune' || '' }} | ||||
|     secrets: inherit | ||||
26 changes: .github/workflows/rocm.yml (vendored)
| @ -59,3 +59,29 @@ jobs: | ||||
|       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-gfx1100-test: | ||||
|     if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|     name: linux-jammy-rocm-py3_10-gfx1100 | ||||
|     uses: ./.github/workflows/_rocm-test.yml | ||||
|     needs: | ||||
|       - linux-jammy-rocm-py3_10-build | ||||
|       - target-determination | ||||
|     with: | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, | ||||
|           { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, | ||||
|         ]} | ||||
|       tests-to-include: > | ||||
|          test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs | ||||
|          test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark | ||||
|          inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor | ||||
|          inductor/test_torchinductor inductor/test_decompose_mem_bound_mm | ||||
|          inductor/test_flex_attention inductor/test_max_autotune | ||||
|     secrets: inherit | ||||
|  | ||||
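The tests-to-include value above is a YAML ">" folded block scalar: the wrapped lines are joined with single spaces, so the reusable workflow receives one space-separated test list (the deleted rocm-navi31.yml used ">-", which additionally strips the trailing newline). A small illustration, assuming PyYAML:

    import yaml

    doc = "tests-to-include: >\n  test_nn test_torch\n  inductor/test_flex_attention inductor/test_max_autotune\n"
    value = yaml.safe_load(doc)["tests-to-include"]
    # Folded lines become one space-joined string; ">" keeps the final newline.
    assert value == "test_nn test_torch inductor/test_flex_attention inductor/test_max_autotune\n"
    print(value.split())  # four individual test names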
34 changes: .github/workflows/trunk.yml (vendored)
| @ -190,40 +190,6 @@ jobs: | ||||
|       runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-build: | ||||
|     if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} | ||||
|     name: linux-jammy-rocm-py3.10 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|           { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-rocm-py3_10-test: | ||||
|     if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|     name: linux-jammy-rocm-py3.10 | ||||
|     uses: ./.github/workflows/_rocm-test.yml | ||||
|     needs: | ||||
|       - linux-jammy-rocm-py3_10-build | ||||
|       - target-determination | ||||
|     with: | ||||
|       build-environment: linux-jammy-rocm-py3.10 | ||||
|       docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} | ||||
|       tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" | ||||
|     secrets: inherit | ||||
|  | ||||
|   inductor-build: | ||||
|     name: inductor-build | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|  | ||||
1 change: .gitignore (vendored)
| @ -374,7 +374,6 @@ third_party/ruy/ | ||||
| third_party/glog/ | ||||
|  | ||||
| # Virtualenv | ||||
| .venv/ | ||||
| venv/ | ||||
|  | ||||
| # Log files | ||||
|  | ||||
14 changes: CODEOWNERS
| @ -201,17 +201,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A | ||||
| /torch/csrc/stable/ @janeyx99 @mikaylagawarecki | ||||
| /torch/headeronly/ @janeyx99 | ||||
| /torch/header_only_apis.txt @janeyx99 | ||||
|  | ||||
| # FlexAttention | ||||
| /torch/nn/attention/flex_attention.py @drisspg | ||||
| /torch/_higher_order_ops/flex_attention.py @drisspg | ||||
| /torch/_inductor/kernel/flex/ @drisspg | ||||
| /torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg | ||||
| /test/inductor/test_flex_attention.py @drisspg | ||||
| /test/inductor/test_flex_decoding.py @drisspg | ||||
|  | ||||
| # Low Precision GEMMs | ||||
| /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 | ||||
| /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 | ||||
| /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 | ||||
| /test/test_scaled_matmul_cuda.py @drisspg @slayton58 | ||||
|  | ||||
| @ -313,14 +313,13 @@ IF(USE_FBGEMM_GENAI) | ||||
|  | ||||
|     # Add additional HIPCC compiler flags for performance | ||||
|     set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS | ||||
|       -mllvm | ||||
|       -amdgpu-coerce-illegal-types=1 | ||||
|       -mllvm | ||||
|       -enable-post-misched=0 | ||||
|       -mllvm | ||||
|       -greedy-reverse-local-assignment=1 | ||||
|       -fhip-new-launch-api) | ||||
|     if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0") | ||||
|         list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1) | ||||
|     endif() | ||||
|  | ||||
|     # Only compile for gfx942 for now. | ||||
|     # This is rather hacky, I could not figure out a clean solution :( | ||||
|  | ||||
| @ -39,7 +39,7 @@ struct HostBlock { | ||||
| }; | ||||
|  | ||||
| template <typename B> | ||||
| struct alignas(hardware_destructive_interference_size) FreeBlockList { | ||||
| struct alignas(64) FreeBlockList { | ||||
|   std::mutex mutex_; | ||||
|   std::deque<B*> list_; | ||||
| }; | ||||
| @ -122,7 +122,7 @@ struct TORCH_API HostStats { | ||||
| // Struct containing memory allocator summary statistics for host, as they | ||||
| // are staged for reporting. This is a temporary struct that is used to | ||||
| // avoid locking the allocator while collecting stats. | ||||
| struct alignas(hardware_destructive_interference_size) HostStatsStaged { | ||||
| struct alignas(64) HostStatsStaged { | ||||
|   std::mutex timing_mutex_; | ||||
|   // COUNT: total allocations (active + free) | ||||
|   // LOCK: access to this stat is protected by the allocator's blocks_mutex_ | ||||
| @ -669,7 +669,7 @@ struct CachingHostAllocatorImpl { | ||||
|     TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event"); | ||||
|   } | ||||
|  | ||||
|   alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_; | ||||
|   alignas(64) std::mutex blocks_mutex_; | ||||
|   ska::flat_hash_set<B*> blocks_; // block list | ||||
|   ska::flat_hash_map<void*, B*> ptr_to_block_; | ||||
|  | ||||
| @ -677,17 +677,17 @@ struct CachingHostAllocatorImpl { | ||||
|   // size. This allows us to quickly find a free block of the right size. | ||||
|   // We use deque to store per size free list and guard the list with its own | ||||
|   // mutex. | ||||
|   alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ = | ||||
|   alignas(64) std::vector<FreeBlockList<B>> free_list_ = | ||||
|       std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX); | ||||
|  | ||||
|   alignas(hardware_destructive_interference_size) std::mutex events_mutex_; | ||||
|   alignas(64) std::mutex events_mutex_; | ||||
|   std::deque<std::pair<E, B*>> events_; // event queue paired with block | ||||
|  | ||||
|   // Indicates whether the object is active. | ||||
|   // Set to false in the destructor to signal background threads to stop. | ||||
|   std::atomic<bool> active_{true}; | ||||
| protected: | ||||
|   alignas(hardware_destructive_interference_size) HostStatsStaged stats_; | ||||
|   alignas(64) HostStatsStaged stats_; | ||||
| }; | ||||
|  | ||||
| struct TORCH_API HostAllocator : public at::Allocator { | ||||
|  | ||||
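The hunk above swaps alignas(hardware_destructive_interference_size) for a hard-coded alignas(64) on the allocator's lock-guarded members, keeping each mutex on its own cache line to limit false sharing. The surrounding comment also describes the per-size free lists; a rough Python model of that layout (the bucket count and the power-of-two size classes are assumptions for illustration, and Python has no cache-line alignment equivalent):

    import threading
    from collections import deque

    MAX_SIZE_INDEX = 64  # assumed number of size classes

    class FreeBlockList:
        """One deque of free blocks per size class, each with its own lock."""
        def __init__(self):
            self.mutex = threading.Lock()
            self.blocks = deque()

    free_lists = [FreeBlockList() for _ in range(MAX_SIZE_INDEX)]

    def size_index(nbytes: int) -> int:
        # Bucket by ceil(log2(nbytes)), assuming power-of-two size classes.
        return max(nbytes - 1, 0).bit_length()

    def put_block(block, nbytes: int) -> None:
        fl = free_lists[size_index(nbytes)]
        with fl.mutex:  # contention is confined to one size class
            fl.blocks.append(block)

    def get_block(nbytes: int):
        fl = free_lists[size_index(nbytes)]
        with fl.mutex:
            return fl.blocks.pop() if fl.blocks else None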
| @ -183,6 +183,11 @@ struct CUDACachingHostAllocatorImpl | ||||
|     return true; | ||||
|   } | ||||
|  | ||||
|   bool pinned_use_background_threads() override { | ||||
|     return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: | ||||
|         pinned_use_background_threads(); | ||||
|   } | ||||
|  | ||||
|   EventPool::Event create_event_internal(DeviceIndex idx) { | ||||
|     // Leak the event pool to avoid shutdown issue. | ||||
|     static auto* event_pool = new EventPool(); | ||||
|  | ||||
| @ -177,6 +177,7 @@ inline void segmented_sort_pairs( | ||||
|   } | ||||
| } | ||||
|  | ||||
| #if CUB_SUPPORTS_UNIQUE_BY_KEY() | ||||
| template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT> | ||||
| inline void unique_by_key( | ||||
|   KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, | ||||
| @ -192,6 +193,7 @@ inline void unique_by_key( | ||||
|   CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, | ||||
|     keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| namespace impl { | ||||
|  | ||||
| @ -577,6 +579,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT | ||||
| #endif | ||||
| } | ||||
|  | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|  | ||||
| template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT> | ||||
| inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { | ||||
| @ -604,6 +607,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT | ||||
| #endif | ||||
| } | ||||
|  | ||||
| #endif | ||||
|  | ||||
| template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT> | ||||
| void unique(InputIteratorT input, OutputIteratorT output, | ||||
|  | ||||
| @ -28,6 +28,22 @@ | ||||
| #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false | ||||
| #endif | ||||
|  | ||||
| // cub support for UniqueByKey is added to cub 1.16 in: | ||||
| // https://github.com/NVIDIA/cub/pull/405 | ||||
| #if CUB_VERSION >= 101600 | ||||
| #define CUB_SUPPORTS_UNIQUE_BY_KEY() true | ||||
| #else | ||||
| #define CUB_SUPPORTS_UNIQUE_BY_KEY() false | ||||
| #endif | ||||
|  | ||||
| // cub support for scan by key is added to cub 1.15 | ||||
| // in https://github.com/NVIDIA/cub/pull/376 | ||||
| #if CUB_VERSION >= 101500 | ||||
| #define CUB_SUPPORTS_SCAN_BY_KEY() 1 | ||||
| #else | ||||
| #define CUB_SUPPORTS_SCAN_BY_KEY() 0 | ||||
| #endif | ||||
|  | ||||
| // cub support for cub::FutureValue is added to cub 1.15 in: | ||||
| // https://github.com/NVIDIA/cub/pull/305 | ||||
| #if CUB_VERSION >= 101500 | ||||
|  | ||||
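The thresholds above follow CUB's version encoding, CUB_VERSION = major * 100000 + minor * 100 + subminor, so 101500 is 1.15.0 and 101600 is 1.16.0. A small sketch of the same feature gates:

    def cub_version(major: int, minor: int, subminor: int) -> int:
        # Mirrors CUB's encoding: major * 100000 + minor * 100 + subminor.
        return major * 100000 + minor * 100 + subminor

    def supports_scan_by_key(v: int) -> bool:
        return v >= cub_version(1, 15, 0)   # NVIDIA/cub#376

    def supports_unique_by_key(v: int) -> bool:
        return v >= cub_version(1, 16, 0)   # NVIDIA/cub#405

    assert supports_scan_by_key(101500) and not supports_unique_by_key(101500)
    assert supports_unique_by_key(101600)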
| @ -160,10 +160,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ | ||||
|   DispatchKey::CUDA, | ||||
|   DispatchKey::CPU, | ||||
|   DispatchKey::PrivateUse1, | ||||
|   DispatchKey::SparseCPU, | ||||
|   DispatchKey::SparseCUDA, | ||||
|   DispatchKey::SparseCsrCPU, | ||||
|   DispatchKey::SparseCsrCUDA, | ||||
| }); | ||||
|  | ||||
| inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { | ||||
|  | ||||
| @ -3620,7 +3620,7 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) | ||||
|     try { | ||||
|       mkldnn_matmul_i8i8i32(self, mat2, result); | ||||
|       dispatched = true; | ||||
|     } catch ([[maybe_unused]] const std::exception& e) { | ||||
|     } catch (const std::exception& e) { | ||||
|       TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what()); | ||||
|     } | ||||
|   } | ||||
|  | ||||
| @ -2322,23 +2322,12 @@ _scaled_nvfp4_nvfp4( | ||||
|           const Tensor& scale_b, const SwizzleType swizzle_b, | ||||
|           const std::optional<Tensor>& bias, | ||||
|           const c10::ScalarType out_dtype, | ||||
|           Tensor& out, | ||||
|           const std::optional<Tensor>& global_scale_a = std::nullopt, | ||||
|           const std::optional<Tensor>& global_scale_b = std::nullopt) { | ||||
|           const bool single_scale, | ||||
|           Tensor& out) { | ||||
| #ifdef USE_ROCM | ||||
|   TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM"); | ||||
| #endif | ||||
|   std::optional<Tensor> alpha = std::nullopt; | ||||
|   // Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise, | ||||
|   //       if this is "And" we would silently do nothing in the case where one global scale is | ||||
|   //       passed and not the other. | ||||
|   if (global_scale_a.has_value() || global_scale_b.has_value()) { | ||||
|     TORCH_CHECK_VALUE(global_scale_a.has_value(), | ||||
|         "For two-level-scaled NVFP4, global_scale_a must have a value"); | ||||
|     TORCH_CHECK_VALUE(global_scale_b.has_value(), | ||||
|         "For two-level-scaled NVFP4, global_scale_b must have a value"); | ||||
|     alpha = global_scale_a.value().mul(global_scale_b.value()); | ||||
|   } | ||||
|   TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported"); | ||||
|   // Restrictions: | ||||
|   // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 | ||||
|   // Scales must be swizzled | ||||
| @ -2360,7 +2349,7 @@ _scaled_nvfp4_nvfp4( | ||||
|  | ||||
|   auto scaling_choice_a = ScalingType::BlockWise1x16; | ||||
|   auto scaling_choice_b = ScalingType::BlockWise1x16; | ||||
|   return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha); | ||||
|   return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -2566,10 +2555,9 @@ _scaled_mm_cuda_v2_out( | ||||
|   } else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) { | ||||
|     return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); | ||||
|   } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) { | ||||
|     return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out, | ||||
|                                scale_a[1], scale_b[1]); | ||||
|     TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported"); | ||||
|   } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) { | ||||
|     return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); | ||||
|     return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out); | ||||
|   } else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) { | ||||
|     return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); | ||||
|   } else { | ||||
|  | ||||
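The removed "Or" comment above is worth restating: gating on either global scale being present means that passing exactly one of the two raises an error, whereas an "and" would silently fall back to single-level scaling. The same validation pattern in a minimal Python sketch (resolve_alpha is a hypothetical name for illustration):

    def resolve_alpha(global_scale_a=None, global_scale_b=None):
        # "or": if either scale is given, demand both; "and" would silently
        # do nothing when only one of the two is supplied.
        if global_scale_a is not None or global_scale_b is not None:
            if global_scale_a is None:
                raise ValueError("For two-level-scaled NVFP4, global_scale_a must have a value")
            if global_scale_b is None:
                raise ValueError("For two-level-scaled NVFP4, global_scale_b must have a value")
            return global_scale_a * global_scale_b  # alpha
        return None  # single-level scaling, no alpha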
| @ -15,7 +15,9 @@ | ||||
| #include <ATen/native/cuda/block_reduce.cuh> | ||||
| #include <ATen/native/cuda/thread_constants.h> | ||||
|  | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
| #include <thrust/iterator/reverse_iterator.h> | ||||
| #endif | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| @ -238,6 +240,10 @@ __global__ void renorm_kernel( | ||||
|  | ||||
| } // anonymous namespace | ||||
|  | ||||
| #if !CUB_SUPPORTS_SCAN_BY_KEY() | ||||
| template<typename index_t> | ||||
| void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); | ||||
| #endif | ||||
|  | ||||
| Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_, | ||||
|                                int64_t num_weights, int64_t padding_idx, | ||||
| @ -300,6 +306,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice | ||||
|  | ||||
|   if (scale_grad_by_freq) { | ||||
|     count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { | ||||
|       cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
| @ -326,6 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice | ||||
|         num_indices | ||||
|       ); | ||||
|     }); | ||||
| #else | ||||
|     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { | ||||
|       embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count); | ||||
|     }); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   return embedding_backward_cuda_kernel(grad, orig_indices, | ||||
|  | ||||
| @ -10,7 +10,9 @@ | ||||
|  | ||||
| #include <c10/macros/Macros.h> | ||||
|  | ||||
| #if CUB_SUPPORTS_UNIQUE_BY_KEY() | ||||
| #include <thrust/iterator/counting_iterator.h> | ||||
| #endif | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| @ -194,9 +196,18 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm | ||||
|             partials_per_segment_offset[num_of_segments-1]; | ||||
| } | ||||
|  | ||||
| #if !CUB_SUPPORTS_UNIQUE_BY_KEY() | ||||
| __global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) { | ||||
|   *num_of_segments_ptr = num_of_segments; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| } // anon namespace | ||||
|  | ||||
| #if !CUB_SUPPORTS_UNIQUE_BY_KEY() | ||||
| template<typename index_t> | ||||
| int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); | ||||
| #endif | ||||
|  | ||||
| Tensor embedding_backward_cuda_kernel( | ||||
|         const Tensor &grad, | ||||
| @ -223,12 +234,20 @@ Tensor embedding_backward_cuda_kernel( | ||||
|   auto segment_offsets = at::empty({numel}, orig_indices.options()); | ||||
|   auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); | ||||
|   int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>(); | ||||
| #if !CUB_SUPPORTS_UNIQUE_BY_KEY() | ||||
|   AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { | ||||
|     int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets); | ||||
|     write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   }); | ||||
| #else | ||||
|   AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { | ||||
|     cuda::cub::unique_by_key( | ||||
|       sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0), | ||||
|       segment_offsets.mutable_data_ptr<index_t>(), | ||||
|       num_of_segments_ptr, sorted_indices.numel()); | ||||
|   }); | ||||
| #endif | ||||
|  | ||||
|   int64_t max_segments = std::min<int64_t>(numel, num_weights); | ||||
|  | ||||
|  | ||||
| @ -31,10 +31,16 @@ | ||||
|  | ||||
| #include <c10/macros/Macros.h> | ||||
|  | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
| #include <thrust/iterator/reverse_iterator.h> | ||||
| #endif | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| #if !CUB_SUPPORTS_SCAN_BY_KEY() | ||||
| template<typename index_t> | ||||
| void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count); | ||||
| #endif | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| @ -193,6 +199,7 @@ Tensor embedding_bag_backward_cuda_sum_avg( | ||||
|  | ||||
|   if (scale_grad_by_freq) { | ||||
|     count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { | ||||
|       cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
| @ -219,6 +226,11 @@ Tensor embedding_bag_backward_cuda_sum_avg( | ||||
|         num_indices | ||||
|       ); | ||||
|     }); | ||||
| #else | ||||
|     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () { | ||||
|       embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count); | ||||
|     }); | ||||
| #endif | ||||
|   } | ||||
|   return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices, | ||||
|       count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag, | ||||
|  | ||||
90 changes: aten/src/ATen/native/cuda/LegacyThrustHelpers.cu (new file)
| @ -0,0 +1,90 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/native/cuda/SortingCommon.cuh> | ||||
| #include <ATen/cuda/cub_definitions.cuh> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| #else | ||||
| #include <ATen/ops/empty_like.h> | ||||
| #endif | ||||
|  | ||||
| #include <ATen/cuda/ThrustAllocator.h> | ||||
| #include <thrust/device_ptr.h> | ||||
| #include <thrust/execution_policy.h> | ||||
| #include <thrust/sort.h> | ||||
| #include <thrust/unique.h> | ||||
| #include <thrust/iterator/constant_iterator.h> | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| #if !CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|  | ||||
| template<typename index_t> | ||||
| void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) { | ||||
|   cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|   at::cuda::ThrustAllocator allocator; | ||||
|   auto policy = thrust::cuda::par(allocator).on(stream); | ||||
|  | ||||
|   auto num_indices = count.numel(); | ||||
|  | ||||
|   // Compute an increasing sequence per unique item in sortedIndices: | ||||
|   // sorted: 2 5 5 5 7 7 8 9 9 | ||||
|   //  count: 1 1 2 3 1 2 1 1 2 | ||||
|   auto sorted_data = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>()); | ||||
|   auto count_data = thrust::device_ptr<index_t>(count.mutable_data_ptr<index_t>()); | ||||
|   thrust::inclusive_scan_by_key( | ||||
|     policy, | ||||
|     sorted_data, | ||||
|     sorted_data + num_indices, | ||||
|     thrust::make_constant_iterator(1), | ||||
|     count_data | ||||
|   ); | ||||
|  | ||||
|   // Take the maximum of each count per unique key in reverse: | ||||
|   // sorted: 2 5 5 5 7 7 8 9 9 | ||||
|   //  count: 1 3 3 3 2 2 1 2 2 | ||||
|   thrust::inclusive_scan_by_key( | ||||
|     policy, | ||||
|     thrust::make_reverse_iterator(sorted_data + num_indices), | ||||
|     thrust::make_reverse_iterator(sorted_data), | ||||
|     thrust::make_reverse_iterator(count_data + num_indices), | ||||
|     thrust::make_reverse_iterator(count_data + num_indices), | ||||
|     thrust::equal_to<index_t>(), | ||||
|     thrust::maximum<index_t>() | ||||
|   ); | ||||
| } | ||||
|  | ||||
| template | ||||
| void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count); | ||||
| template | ||||
| void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count); | ||||
|  | ||||
| #endif | ||||
|  | ||||
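The two inclusive_scan_by_key calls above implement exactly the count computation described in the comments: a forward scan over constant 1s ranks each element within its run of equal indices, then a reverse max-scan by key broadcasts each run's total back across the whole run. The same two steps on the comment's example data, restated in plain Python:

    sorted_indices = [2, 5, 5, 5, 7, 7, 8, 9, 9]

    # Forward inclusive scan by key over constant 1s: rank within each run.
    count = []
    for i, k in enumerate(sorted_indices):
        count.append(count[-1] + 1 if i and sorted_indices[i - 1] == k else 1)
    assert count == [1, 1, 2, 3, 1, 2, 1, 1, 2]

    # Reverse max-scan by key: broadcast each run's total over the run.
    for i in range(len(count) - 2, -1, -1):
        if sorted_indices[i] == sorted_indices[i + 1]:
            count[i] = max(count[i], count[i + 1])
    assert count == [1, 3, 3, 3, 2, 2, 1, 2, 2]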
| template<typename index_t> | ||||
| int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) { | ||||
|   auto stream = at::cuda::getCurrentCUDAStream(); | ||||
|   at::cuda::ThrustAllocator allocator; | ||||
|   auto policy = thrust::cuda::par(allocator).on(stream); | ||||
|   const ptrdiff_t numel = sorted_indices.numel(); | ||||
|   auto sorted_indices_dev = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>()); | ||||
|   auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); | ||||
|   auto dummy_dev = thrust::device_ptr<index_t>(dummy.mutable_data_ptr<index_t>()); | ||||
|   auto ends = thrust::unique_by_key_copy( | ||||
|           policy, | ||||
|           sorted_indices_dev, | ||||
|           sorted_indices_dev + numel, | ||||
|           thrust::make_counting_iterator(0), | ||||
|           dummy_dev, | ||||
|           thrust::device_ptr<index_t>(segment_offsets.mutable_data_ptr<index_t>())); | ||||
|   return thrust::get<0>(ends) - dummy_dev; | ||||
| } | ||||
|  | ||||
| template | ||||
| int64_t embedding_backward_cuda_kernel_unique_by_key<int>(const Tensor &sorted_indices, Tensor &segment_offsets); | ||||
| template | ||||
| int64_t embedding_backward_cuda_kernel_unique_by_key<int64_t>(const Tensor &sorted_indices, Tensor &segment_offsets); | ||||
|  | ||||
| } // namespace at::native | ||||
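embedding_backward_cuda_kernel_unique_by_key pairs each sorted index with its position (the counting iterator) and keeps one pair per run of equal indices, so segment_offsets receives the start position of every segment and the return value is the segment count. A plain-Python equivalent of that computation:

    from itertools import groupby

    def unique_by_key_segment_offsets(sorted_indices):
        segment_offsets, pos = [], 0
        for _, run in groupby(sorted_indices):
            segment_offsets.append(pos)   # first position of this run
            pos += sum(1 for _ in run)
        return segment_offsets, len(segment_offsets)

    offsets, num_segments = unique_by_key_segment_offsets([2, 5, 5, 5, 7, 7, 8, 9, 9])
    assert offsets == [0, 1, 4, 6, 7] and num_segments == 5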
| @ -146,7 +146,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel( | ||||
|   int64_t batch_size = target.size(0); | ||||
|   int64_t H = target.size(1); | ||||
|   int64_t W = target.size(2); | ||||
|   int64_t n_classes = grad_input.size(1); | ||||
|  | ||||
|   CUDA_KERNEL_LOOP(index, n_threads) { | ||||
|     const int64_t b = index % batch_size; | ||||
| @ -157,7 +156,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel( | ||||
|     if (cur_target == ignore_index) { | ||||
|       continue; | ||||
|     } | ||||
|     CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes); | ||||
|     scalar_t value = -(weight != nullptr ? weight[cur_target] : static_cast<scalar_t>(1)); | ||||
|     grad_input[b][cur_target][h][w] = value * grad_output[b][h][w]; | ||||
|   } | ||||
|  | ||||
| @ -413,12 +413,14 @@ struct ReduceOp { | ||||
|       value = thread_reduce<output_vec_size>(input_slice); | ||||
|     } | ||||
|  | ||||
|     if (config.should_block_x_reduce()) { | ||||
|       value = block_x_reduce<output_vec_size>(value, shared_memory); | ||||
|     } | ||||
|     if (config.should_block_y_reduce()) { | ||||
|       value = block_y_reduce<output_vec_size>(value, shared_memory); | ||||
|     } | ||||
|     __syncthreads(); | ||||
|     if (config.should_block_x_reduce()) { | ||||
|       value = block_x_reduce<output_vec_size>(value, shared_memory); | ||||
|     } | ||||
|  | ||||
|     using out_ptr_vec_t = std::array<out_scalar_t*, output_vec_size>; | ||||
|     using offset_vec_t = std::array<index_t, output_vec_size>; | ||||
|     offset_vec_t base_offsets; | ||||
| @ -655,8 +657,8 @@ struct ReduceOp { | ||||
|     __syncthreads(); | ||||
|     // Intra-warp reduction, fix CUDA to have offset decreasing for better numerics | ||||
|     // matching Triton, etc. | ||||
|     // TODO(PaulZhang12): AMD and internal | ||||
|     #if defined(USE_ROCM) || defined(FBCODE_CAFFE2) | ||||
|     // TODO: switch AMD to the decreasing-offset order as well | ||||
|     #ifdef USE_ROCM | ||||
|     for (int offset = 1; offset < dim_x; offset <<= 1) { | ||||
|     #else | ||||
|     for (int offset = dim_x >> 1; offset > 0; offset >>= 1) { | ||||
|  | ||||
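The two loop directions above pair values in different orders: decreasing offsets (dim_x >> 1 down to 1) give the classic binary-tree pairing that Triton also uses, while increasing offsets combine nearer lanes first. Both orders compute a correct sum, but floating-point rounding can differ between them, which is the "better numerics" the comment refers to. A simplified single-threaded model (assuming a power-of-two length; real warp shuffles differ in detail):

    def reduce_decreasing(vals):
        vals, offset = vals[:], len(vals) // 2
        while offset > 0:                      # offsets n/2, n/4, ..., 1
            for i in range(offset):
                vals[i] += vals[i + offset]
            offset //= 2
        return vals[0]

    def reduce_increasing(vals):
        vals, offset = vals[:], 1
        while offset < len(vals):              # offsets 1, 2, 4, ...
            for i in range(0, len(vals), 2 * offset):
                vals[i] += vals[i + offset]
            offset *= 2
        return vals[0]

    xs = [1e16, 1.0, -1e16, 1.0]
    print(reduce_decreasing(xs), reduce_increasing(xs))  # 2.0 vs 0.0: same math, different rounding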
| @ -19,6 +19,7 @@ | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| // TODO: remove this when CUDA <11.6 is no longer supported | ||||
| void topk_out_with_sort( | ||||
|   const Tensor& self, | ||||
|   int64_t k, int64_t dim, bool largest, | ||||
| @ -30,12 +31,21 @@ void topk_out_with_sort( | ||||
|   indices.copy_(sorted_indices.narrow(dim, 0, k)); | ||||
| } | ||||
|  | ||||
| // TODO: remove this when CUDA <11.6 is no longer supported | ||||
| bool disable_sort_for_topk(); | ||||
| bool should_use_sort(const Tensor& self, int64_t dim) { | ||||
| #if defined(USE_ROCM) | ||||
|   if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972 | ||||
|   return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387 | ||||
| #else | ||||
|   return false; | ||||
|   if (disable_sort_for_topk()) return false; | ||||
|   // This heuristic is based on the experiment in https://github.com/pytorch/pytorch/pull/68632 | ||||
|   if (self.dim() == 0) return false; | ||||
|   if (self.dtype() == kBool) return false; // Bool is not supported by topk | ||||
|   int64_t slice_size = self.size(dim); | ||||
|   if (slice_size == 0) return false; | ||||
|   int64_t num_slices = self.numel() / slice_size; | ||||
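|   // e.g. 4 slices of 200k elements each satisfy this and take the sort path. | ||||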
|   return num_slices <= 10 && slice_size >= 100000; | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -21,6 +21,11 @@ using namespace at::native; | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| // TODO: remove this when CUDA <11.6 is no longer supported | ||||
| bool disable_sort_for_topk() { | ||||
|   return CUB_SUPPORTS_SCAN_BY_KEY(); | ||||
| } | ||||
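| // Sort-based top-k is presumably only a fallback for toolchains lacking cub's | ||||
| // scan-by-key kernels; when those are available the multi-block radix path is | ||||
| // preferred, hence sorting is disabled above. | ||||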
|  | ||||
| namespace sbtopk { // single_block_topk | ||||
|  | ||||
| template <typename T> | ||||
| @ -413,6 +418,10 @@ __global__ void computeBlockwiseWithinKCounts( | ||||
|   } | ||||
|   __syncthreads(); | ||||
|  | ||||
| #if !CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|   return; | ||||
| #endif | ||||
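|   // Without scan-by-key support the rest of this kernel (per-block within-k | ||||
|   // counting) is never consumed downstream, so exit early. | ||||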
|  | ||||
|   Bitwise desired_digit = at::cuda::Bitfield<Bitwise>::getBitfield(desired, current_bit, RADIX_BITS); | ||||
|  | ||||
|   // if largest, then only threads that has tidx > desired_digit are active | ||||
| @ -468,6 +477,7 @@ __global__ void computeBlockwiseWithinKCounts( | ||||
|   } | ||||
| } | ||||
|  | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
| // Assumption: slice_size cannot be larger than UINT32_MAX | ||||
| template <typename Bitwise> | ||||
| __global__ void computeBlockwiseKthCounts( | ||||
| @ -599,6 +609,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu | ||||
|     } | ||||
|   } | ||||
| } | ||||
| #endif | ||||
|  | ||||
| int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { | ||||
|   // occupancy of this kernel is limited by registers per thread | ||||
| @ -676,12 +687,16 @@ void launch( | ||||
|   uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get()); | ||||
|   AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream)); | ||||
|  | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|   auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); | ||||
|   uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get()); | ||||
|   AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream)); | ||||
|  | ||||
|   auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); | ||||
|   uint32_t* kthCounts = reinterpret_cast<uint32_t*>(kthCounts_buffer.get()); | ||||
| #else | ||||
|   uint32_t* withinKCounts = nullptr; | ||||
| #endif | ||||
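|   // No counts buffer is needed in the fallback path: computeBlockwiseWithinKCounts | ||||
|   // early-returns before touching it, and the gatherTopK fallback below works | ||||
|   // from kthValues directly. | ||||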
|  | ||||
|   Bitwise desiredMask = 0; | ||||
|   dim3 grid; | ||||
| @ -728,6 +743,7 @@ void launch( | ||||
|   } | ||||
|   desired = desired_in; | ||||
|  | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|   computeBlockwiseKthCounts<Bitwise><<<std::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824), 256, 0, stream>>>( | ||||
|     desired, counts, num_blocks, blocks_per_slice, kthCounts); | ||||
|   C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
| @ -743,6 +759,28 @@ void launch( | ||||
|     topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread, | ||||
|     blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks); | ||||
|   C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
| #else | ||||
|   // Find topk values based on kth values | ||||
|   { | ||||
|     dim3 grid; | ||||
|     TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); | ||||
|     int warp_size = at::cuda::warp_size(); | ||||
|     dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); | ||||
|     sbtopk::gatherTopK<T, IndexType, Dim, /* WithKthValues= */true><<<grid, block, 0, stream>>>( | ||||
|         input, | ||||
|         inputSliceSize, | ||||
|         outputSliceSize, | ||||
|         largest, | ||||
|         numInputSlices, | ||||
|         inputWithinSliceStride, | ||||
|         topK, | ||||
|         topKWithinSliceStride, | ||||
|         indices, | ||||
|         indicesWithinSliceStride, | ||||
|         kthValues); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   } | ||||
| #endif | ||||
| } | ||||
|  | ||||
| } // namespace mbtopk | ||||
| @ -750,6 +788,7 @@ void launch( | ||||
| bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { | ||||
|   if (num_slices > std::numeric_limits<uint32_t>::max() || | ||||
|       slice_size > std::numeric_limits<uint32_t>::max()) return false; | ||||
| #if CUB_SUPPORTS_SCAN_BY_KEY() | ||||
|   // This heuristic is based on the experiments in https://github.com/pytorch/pytorch/pull/74267 | ||||
|   return (num_slices <= 20 && slice_size >= 20000) || | ||||
|       (num_slices > 20 && num_slices <= 40 && slice_size >= 10000) || | ||||
| @ -758,6 +797,12 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { | ||||
|       (num_slices >= 200 && num_slices < 800 && slice_size >= 3000) || | ||||
|       (num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) || | ||||
|       (num_slices > 4000 && slice_size >= 400); | ||||
| #else | ||||
|   // This heuristic is based on the experiments in https://github.com/pytorch/pytorch/pull/71081 | ||||
|   return (num_slices <= 400 && slice_size >= 5000) || | ||||
|       (num_slices > 400 && num_slices < 4000 && slice_size >= 1000) || | ||||
|       (num_slices >= 4000 && slice_size >= 300); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| void launch_gather_topk_kernel( | ||||
|  | ||||
| @ -44,7 +44,7 @@ __global__ void triu_tril_kernel( | ||||
|     const int64_t k, | ||||
|     const int64_t N_padded, | ||||
|     const IndexType last_dim_padded) { | ||||
|   int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread; | ||||
|   int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread; | ||||
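|   // Note: blockIdx.x * blockDim.x is evaluated in 32-bit here; the variant with | ||||
|   // the explicit int64_t cast widens first, avoiding overflow on very large grids. | ||||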
|   if (linear_idx >= N_padded) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
| @ -466,7 +466,7 @@ struct ReduceJitOp { | ||||
|  | ||||
|     __syncthreads(); | ||||
|  | ||||
|     #if defined(USE_ROCM) || defined(FBCODE_CAFFE2) | ||||
|     #ifdef USE_ROCM | ||||
|     for (int offset = 1; offset < dim_x; offset <<= 1) { | ||||
|     #else | ||||
|     for (int offset = dim_x >> 1; offset > 0; offset >>= 1) { | ||||
|  | ||||
| @ -487,7 +487,9 @@ std::unique_ptr<fe::graph::Graph> build_graph( | ||||
|   auto scaled_dot_product_flash_attention_options = | ||||
|       fe::graph::SDPA_attributes() | ||||
|           .set_name("CUDNN_SDPA") | ||||
|           .set_generate_stats(return_softmaxstats) | ||||
|           .set_is_inference(return_softmaxstats == false) | ||||
|           // TODO(eqy): switch to this API once cuDNN FE is upgraded | ||||
|           // .set_generate_stats(return_softmaxstats) | ||||
|           .set_causal_mask(is_causal) | ||||
|           .set_attn_scale(attn_scale); | ||||
|   if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { | ||||
| @ -705,7 +707,9 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor( | ||||
|   auto scaled_dot_product_flash_attention_options = | ||||
|       fe::graph::SDPA_attributes() | ||||
|           .set_name("CUDNN_SDPA_NESTEDTENSOR") | ||||
|           .set_generate_stats(return_softmaxstats) | ||||
|           .set_is_inference(return_softmaxstats == false) | ||||
|           // TODO(eqy): switch to this API once cuDNN FE is upgraded | ||||
|           // .set_generate_stats(return_softmaxstats) | ||||
|           .set_causal_mask(is_causal) | ||||
|           .set_attn_scale(attn_scale) | ||||
|           .set_seq_len_q(SEQ_LEN_Q_) | ||||
|  | ||||
| @ -441,7 +441,7 @@ kernel void applySYRK( | ||||
|     uint3 tid [[thread_position_in_threadgroup]], | ||||
|     uint3 tgid [[threadgroup_position_in_grid]], | ||||
|     uint3 tpg [[threads_per_threadgroup]], | ||||
|     uint warp_id [[simdgroup_index_in_threadgroup]]) { | ||||
|     uint sgitg [[simdgroup_index_in_threadgroup]]) { | ||||
|   const uint tx = tid.x; | ||||
|   const uint ty = tid.y; | ||||
|   const uint simdGroupsPerThreadgroup = (tpg.x * tpg.y + 31) / 32; | ||||
| @ -474,8 +474,11 @@ kernel void applySYRK( | ||||
|       (actSize_j % 8 == 0) && (actSize_h % 8 == 0) && (actSize_k % 8 == 0); | ||||
|  | ||||
|   if (use_simdgroup) { | ||||
|     uint warp_id = sgitg; | ||||
|  | ||||
|     simdgroup_matrix<float, 8, 8> negative_identity = | ||||
|         simdgroup_matrix<float, 8, 8>(-1.0); | ||||
|     simdgroup_matrix<float, 8, 8> identity = simdgroup_matrix<float, 8, 8>(1.0); | ||||
|     simdgroup_matrix<float, 8, 8> Prod; | ||||
|     simdgroup_matrix<float, 8, 8> Afrag; | ||||
|     simdgroup_matrix<float, 8, 8> Bfrag; | ||||
| @ -518,7 +521,8 @@ kernel void applySYRK( | ||||
|             /* transpose = */ upper); | ||||
|  | ||||
|         simdgroup_multiply(Prod, Afrag, Bfrag); | ||||
|         simdgroup_multiply_accumulate(Cfrag, Prod, negative_identity, Cfrag); | ||||
|         simdgroup_multiply(Prod, Prod, negative_identity); | ||||
|         simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod); | ||||
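|         // Both formulations compute Cfrag -= Afrag * Bfrag: the product is | ||||
|         // negated via the (-1)-scaled identity, then accumulated onto Cfrag. | ||||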
|       } | ||||
|  | ||||
|       simdgroup_store( | ||||
|  | ||||
| @ -706,7 +706,6 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all | ||||
|   tags: reduction | ||||
|  | ||||
|  | ||||
| - func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor | ||||
| @ -716,7 +715,6 @@ | ||||
|   cpp_no_default_args: ['dim'] | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: all_dims_default | ||||
|   tags: reduction | ||||
|  | ||||
| - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -725,7 +723,6 @@ | ||||
|     CPU, CUDA: all_out | ||||
|     MPS: all_out_mps | ||||
|     MTIA: all_out_mtia | ||||
|   tags: reduction | ||||
|  | ||||
| - func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -734,16 +731,13 @@ | ||||
|     CPU, CUDA: all_dims_out | ||||
|     CompositeExplicitAutograd: all_dims_out_default | ||||
|   cpp_no_default_args: ['dim'] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool | ||||
|   variants: function, method | ||||
| @ -755,14 +749,14 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   structured_delegate: any.out | ||||
|   variants: function, method | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   structured_delegate: any.dims_out | ||||
|   variants: function, method | ||||
|   cpp_no_default_args: ['dim'] | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: any_dims_default | ||||
|  | ||||
| @ -772,7 +766,6 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: any_out | ||||
|     MPS: any_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -781,16 +774,13 @@ | ||||
|     CPU, CUDA: any_dims_out | ||||
|     CompositeExplicitAutograd: any_dims_out_default | ||||
|   cpp_no_default_args: ['dim'] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor | ||||
|   dispatch: | ||||
| @ -836,27 +826,25 @@ | ||||
|   structured_delegate: argmax.out | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA: argmax_out | ||||
|     MPS: argmax_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor | ||||
|   structured_delegate: argmin.out | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA: argmin_out | ||||
|     MPS: argmin_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: acosh(Tensor self) -> Tensor | ||||
|   variants: function, method | ||||
| @ -1382,7 +1370,6 @@ | ||||
|   dispatch: | ||||
|     SparseCPU: bmm_sparse_cpu | ||||
|     SparseCUDA: bmm_sparse_cuda | ||||
|     SparseMPS: bmm_sparse_mps | ||||
|     NestedTensorCPU: bmm_nested | ||||
|     NestedTensorCUDA: bmm_nested_cuda | ||||
|   tags: core | ||||
| @ -1398,7 +1385,6 @@ | ||||
|     MTIA: bmm_out_mtia | ||||
|     SparseCPU: bmm_out_sparse_cpu | ||||
|     SparseCUDA: bmm_out_sparse_cuda | ||||
|     SparseMPS: bmm_out_sparse_mps | ||||
|     SparseCsrCUDA: bmm_out_sparse_csr_cuda | ||||
|  | ||||
| - func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor | ||||
| @ -1881,14 +1867,12 @@ | ||||
|     CUDA: count_nonzero_cuda | ||||
|     MPS: count_nonzero_mps | ||||
|   autogen: count_nonzero.dim_IntList_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: count_nonzero(Tensor self, int? dim=None) -> Tensor | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: count_nonzero | ||||
|   autogen: count_nonzero.out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor | ||||
|   variants: function, method | ||||
| @ -3809,23 +3793,19 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: logsumexp | ||||
|   tags: reduction | ||||
|  | ||||
| - func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   dispatch: | ||||
|     # calls squeeze | ||||
|     CompositeExplicitAutogradNonFunctional: logsumexp_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor | ||||
|  | ||||
| @ -3875,7 +3855,6 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   structured_delegate: aminmax.out | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -3883,7 +3862,6 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: aminmax_out | ||||
|     MPS: aminmax_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor | ||||
|   dispatch: | ||||
| @ -3899,7 +3877,7 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     QuantizedCPU, QuantizedCUDA: qmax | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -3909,16 +3887,13 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: max_out | ||||
|     MPS: max_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor | ||||
|   variants: function | ||||
| @ -3931,14 +3906,13 @@ | ||||
| - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor | ||||
|   variants: function, method | ||||
|   structured_delegate: amax.out | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: amax_out | ||||
|     MPS: amax_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| # Return: (Tensor output, Tensor indices) | ||||
| - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) | ||||
| @ -4000,14 +3974,13 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: mean | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| # For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this. | ||||
| - func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: mean_dtype_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   structured_delegate: mean.out | ||||
| @ -4015,7 +3988,7 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     QuantizedCPU: mean_quantized_cpu | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
| @ -4024,16 +3997,13 @@ | ||||
|     CPU, CUDA: mean_out | ||||
|     MPS: mean_out_mps | ||||
|     QuantizedCPU: mean_out_quantized_cpu | ||||
|   tags: reduction | ||||
|  | ||||
| - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   device_check: NoCheck   # Composite | ||||
| @ -4096,7 +4066,7 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     QuantizedCPU, QuantizedCUDA: qmin | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -4106,28 +4076,24 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: min_out | ||||
|     MPS: min_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor | ||||
|   variants: function, method | ||||
|   structured_delegate: amin.out | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: amin_out | ||||
|     MPS: amin_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| # TODO: Add this function to MPS dispatch key so that we avoid declaring it in | ||||
| # native_functions.yaml | ||||
| @ -4207,7 +4173,7 @@ | ||||
|   structured_delegate: mm.out | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     SparseCPU, SparseCUDA, SparseMPS: _sparse_mm | ||||
|     SparseCPU, SparseCUDA: _sparse_mm | ||||
|     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm | ||||
|   tags: core | ||||
|  | ||||
| @ -5892,7 +5858,6 @@ | ||||
|     SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo | ||||
|     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr | ||||
|   autogen: sum.out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype | ||||
| @ -5903,12 +5868,11 @@ | ||||
|     NestedTensorCPU: NestedTensor_sum_dim_CPU | ||||
|     SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo | ||||
|     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
| @ -5916,11 +5880,9 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: sum_out | ||||
|     MPS: sum_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| # TODO: this function will be replaced once nested expand semantics have been settled on | ||||
| - func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor | ||||
| @ -5932,13 +5894,11 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: nansum | ||||
|     MPS: nansum_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   dispatch: | ||||
|     CPU, CUDA: nansum_out | ||||
|     MPS: nansum_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor | ||||
|   variants: function, method | ||||
| @ -6002,13 +5962,11 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -6017,19 +5975,16 @@ | ||||
|     CPU, CUDA: std | ||||
|     MPS: std_mps | ||||
|     QuantizedCPU: std_quantized_cpu | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -6038,51 +5993,42 @@ | ||||
|     CPU, CUDA: std_mean | ||||
|     MPS: std_mean_mps | ||||
|   autogen: std_mean.correction_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   dispatch: | ||||
|     CPU, CUDA: std_out | ||||
|     QuantizedCPU: std_out_quantized_cpu | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   tags: reduction | ||||
|  | ||||
| - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -6091,13 +6037,13 @@ | ||||
|     CPU, CUDA: prod | ||||
|     MPS: prod_mps | ||||
|   autogen: prod.out | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   structured_delegate: prod.int_out | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
| @ -6105,16 +6051,13 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: prod_out | ||||
|     MPS: prod_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: t(Tensor(a) self) -> Tensor(a) | ||||
|   device_check: NoCheck | ||||
| @ -6575,12 +6518,11 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|  | ||||
| - func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor | ||||
| @ -6590,51 +6532,43 @@ | ||||
|     CPU, CUDA: var | ||||
|     MPS: var_mps | ||||
|     MTIA: var_mtia | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   dispatch: | ||||
|     CPU, CUDA: var_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -6643,18 +6577,15 @@ | ||||
|     CPU, CUDA: var_mean | ||||
|     MPS: var_mean_mps | ||||
|   autogen: var_mean.correction_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   cpp_no_default_args: ["unbiased"] | ||||
|   tags: reduction | ||||
|  | ||||
| - func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function | ||||
|   tags: reduction | ||||
|  | ||||
| - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) | ||||
|   variants: method | ||||
| @ -6914,7 +6845,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: norm | ||||
|   autogen: norm.ScalarOpt_dtype_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -6922,7 +6852,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: norm | ||||
|   autogen: norm.Scalar_out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor | ||||
|   structured_delegate: norm.dtype_out | ||||
| @ -6930,7 +6859,6 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor | ||||
|   structured_delegate: norm.out | ||||
| @ -6938,7 +6866,6 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     SparseCPU, SparseCUDA, SparseMPS: sparse_norm | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
| @ -6946,7 +6873,6 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: norm_dtype_out | ||||
|     MPS: norm_dtype_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
| @ -6954,26 +6880,21 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: norm_out | ||||
|     MPS: norm_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd | ||||
| - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   variants: function, method | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   tags: reduction | ||||
|  | ||||
| - func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent) | ||||
|   variants: method, function | ||||
| @ -7191,7 +7112,6 @@ | ||||
|     MTIA: addmm_out_mtia | ||||
|     SparseCPU: addmm_out_sparse_dense_cpu | ||||
|     SparseCUDA: addmm_out_sparse_dense_cuda | ||||
|     SparseMPS: addmm_out_sparse_dense_mps | ||||
|     SparseCsrCPU: addmm_out_sparse_compressed_cpu | ||||
|     SparseCsrCUDA: addmm_out_sparse_compressed_cuda | ||||
|  | ||||
| @ -7201,7 +7121,6 @@ | ||||
|   dispatch: | ||||
|     SparseCPU: addmm_sparse_dense_cpu | ||||
|     SparseCUDA: addmm_sparse_dense_cuda | ||||
|     SparseMPS: addmm_sparse_dense_mps | ||||
|     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense | ||||
|   tags: core | ||||
|  | ||||
| @ -10159,14 +10078,12 @@ | ||||
|     CPU, CUDA: min | ||||
|     MPS: min_mps | ||||
|     QuantizedCPU: min_quantized_cpu | ||||
|   tags: [reduction] | ||||
|  | ||||
| - func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   dispatch: | ||||
|     CPU, CUDA: min_unary_out | ||||
|     QuantizedCPU: min_quantized_unary_out | ||||
|   tags: [reduction] | ||||
|  | ||||
| - func: fmin(Tensor self, Tensor other) -> Tensor | ||||
|   structured_delegate: fmin.out | ||||
| @ -10189,7 +10106,6 @@ | ||||
|     CPU, CUDA: max | ||||
|     MPS: max_mps | ||||
|     QuantizedCPU: max_quantized_cpu | ||||
|   tags: [reduction] | ||||
|  | ||||
| - func: fmax(Tensor self, Tensor other) -> Tensor | ||||
|   structured_delegate: fmax.out | ||||
| @ -10236,7 +10152,6 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: max_unary_out | ||||
|     QuantizedCPU: max_quantized_unary_out | ||||
|   tags: [reduction] | ||||
|  | ||||
| - func: minimum(Tensor self, Tensor other) -> Tensor | ||||
|   structured_delegate: minimum.out | ||||
| @ -10356,7 +10271,6 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   structured_delegate: all.all_out | ||||
|   variants: method, function | ||||
|   tags: reduction | ||||
|  | ||||
| - func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck | ||||
| @ -10365,7 +10279,6 @@ | ||||
|     CPU, CUDA: all_all_out | ||||
|     MTIA: all_all_out_mtia | ||||
|     MPS: all_all_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: any(Tensor self) -> Tensor | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -10373,7 +10286,7 @@ | ||||
|   variants: method, function | ||||
|   dispatch: | ||||
|     SparseCPU, SparseCUDA, SparseMPS: any_sparse | ||||
|   tags: [core, reduction] | ||||
|   tags: core | ||||
|  | ||||
| - func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck | ||||
| @ -10381,7 +10294,6 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: any_all_out | ||||
|     MPS: any_all_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   device_check: NoCheck   # TensorIterator | ||||
| @ -14433,7 +14345,6 @@ | ||||
|   python_module: linalg | ||||
|   variants: function | ||||
|   structured_delegate: linalg_vector_norm.out | ||||
|   tags: reduction | ||||
|  | ||||
| - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) | ||||
|   python_module: linalg | ||||
| @ -14441,7 +14352,6 @@ | ||||
|   dispatch: | ||||
|     CPU, CUDA: linalg_vector_norm_out | ||||
|     MPS: linalg_vector_norm_out_mps | ||||
|   tags: reduction | ||||
|  | ||||
| - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor | ||||
|   python_module: linalg | ||||
|  | ||||
| @ -1,6 +1,5 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/native/SparseTensorUtils.h> | ||||
| #include <ATen/ExpandUtils.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <ATen/native/sparse/SparseStubs.h> | ||||
| #include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h> | ||||
| @ -19,8 +18,6 @@ | ||||
| #include <ATen/ops/ones_like.h> | ||||
| #include <ATen/ops/argsort.h> | ||||
| #include <ATen/ops/result_type.h> | ||||
| #include <ATen/ops/bmm_native.h> | ||||
| #include <ATen/ops/addmm_native.h> | ||||
| #include <ATen/ops/copy_sparse_to_sparse.h> | ||||
| #include <ATen/ops/mul.h> | ||||
| #endif | ||||
| @ -36,305 +33,6 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary(); | ||||
| #include <ATen/native/mps/Mul_metallib.h> | ||||
| #endif | ||||
|  | ||||
| static Tensor& s_addmm_out_sparse_dense_mps( | ||||
|     Tensor& r, | ||||
|     const Tensor& t, | ||||
|     const SparseTensor& sparse_, | ||||
|     const Tensor& dense, | ||||
|     const Scalar& beta, | ||||
|     const Scalar& alpha) { | ||||
|   TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim()); | ||||
|   TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim()); | ||||
|   TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim()); | ||||
|   TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim()); | ||||
|  | ||||
|   const int64_t I = sparse_.size(0); | ||||
|   const int64_t J = sparse_.size(1); | ||||
|   const int64_t K = dense.size(1); | ||||
|  | ||||
|   TORCH_CHECK(dense.size(0) == J, | ||||
|       "addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0)); | ||||
|   TORCH_CHECK(t.size(0) == I && t.size(1) == K, | ||||
|       "addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")"); | ||||
|  | ||||
|   r.resize_({I, K}); | ||||
|  | ||||
|   auto sparse = sparse_.coalesce(); | ||||
|   const int64_t nnz = sparse._nnz(); | ||||
|  | ||||
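|   // Degenerate cases (no non-zeros, or an empty output): addmm reduces to r = beta * t. | ||||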
|   if (nnz == 0 || I == 0 || K == 0) { | ||||
|     at::mul_out(r, t, beta); | ||||
|     return r; | ||||
|   } | ||||
|  | ||||
|   const auto v_dtype = sparse._values().scalar_type(); | ||||
|   const auto d_dtype = dense.scalar_type(); | ||||
|   const auto t_dtype = t.scalar_type(); | ||||
|   auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype); | ||||
|  | ||||
|   TORCH_CHECK(canCast(compute_dtype, r.scalar_type()), | ||||
|               "Can't convert computed type ", compute_dtype, " to output ", r.scalar_type()); | ||||
|  | ||||
|   auto indices2d = sparse._indices().contiguous(); | ||||
|   auto values = sparse._values().to(compute_dtype); | ||||
|   auto dense_c = dense.to(compute_dtype).contiguous(); | ||||
|   auto t_c = t.to(compute_dtype).contiguous(); | ||||
|  | ||||
|   const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous(); | ||||
|   Tensor out_buf = out_needs_cast | ||||
|       ? at::empty({I, K}, r.options().dtype(compute_dtype)) | ||||
|       : r; | ||||
|   auto out_contig = out_buf.contiguous(); | ||||
|  | ||||
|   auto device = r.device(); | ||||
|   auto stream = getCurrentMPSStream(); | ||||
|  | ||||
|   const float alpha_f = alpha.to<float>(); | ||||
|   const float beta_f  = beta.to<float>(); | ||||
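|   // The Metal kernel consumes alpha/beta as float regardless of compute_dtype. | ||||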
|  | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values); | ||||
|       auto pso = lib.getPipelineStateForFunc(func); | ||||
|       auto enc = stream->commandEncoder(); | ||||
|       [enc setComputePipelineState:pso]; | ||||
|  | ||||
|       const uint32_t tew = pso.threadExecutionWidth; | ||||
|       const uint32_t gridX = static_cast<uint32_t>(K); | ||||
|       const uint32_t gridZ = static_cast<uint32_t>(I); | ||||
|       const uint32_t tgW = std::min<uint32_t>(gridX, tew); | ||||
|  | ||||
|       MTLSize grid = MTLSizeMake(gridX, 1, gridZ); | ||||
|       MTLSize tgs = MTLSizeMake(tgW, 1, 1); | ||||
|  | ||||
|       mtl_setArgs(enc, | ||||
|                   indices2d, | ||||
|                   values, | ||||
|                   dense_c, | ||||
|                   t_c, | ||||
|                   out_contig, | ||||
|                   std::array<uint32_t, 3>{static_cast<uint32_t>(I), | ||||
|                                            static_cast<uint32_t>(J), | ||||
|                                            static_cast<uint32_t>(K)}, | ||||
|                   std::array<float, 2>{alpha_f, beta_f}, | ||||
|                   static_cast<uint32_t>(nnz)); | ||||
|       [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   if (out_needs_cast) { | ||||
|     r.copy_(out_contig.to(r.scalar_type())); | ||||
|   } | ||||
|  | ||||
|   return r; | ||||
| } | ||||
|  | ||||
|  | ||||
| static void build_batch_ptr_mps( | ||||
|     const Tensor& indices_dim0, | ||||
|     int64_t B, | ||||
|     Tensor& batch_ptr | ||||
| ) { | ||||
|   // Builds an array of offsets marking where each batch's elements start and end. Example: | ||||
|   // idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2]  // 9 non-zero elements | ||||
|   //          └─────┘  └──┘  └─────────┘ | ||||
|   //          batch 0  batch 1  batch 2 | ||||
|   // batch_ptr = [0, 3, 5, 9] | ||||
|   //              │  │  │  └─ end of batch 2 (total nnz) | ||||
|   //              │  │  └──── batch 2 starts at index 5 | ||||
|   //              │  └─────── batch 1 starts at index 3 | ||||
|   //              └────────── batch 0 starts at index 0 | ||||
|   TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected"); | ||||
|   auto device = indices_dim0.device(); | ||||
|   auto stream = getCurrentMPSStream(); | ||||
|  | ||||
|   const int64_t nnz = indices_dim0.numel(); | ||||
|  | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches"); | ||||
|       auto enc = stream->commandEncoder(); | ||||
|       [enc setComputePipelineState:pso]; | ||||
|  | ||||
|       const uint32_t tew = pso.threadExecutionWidth; | ||||
|       const uint32_t Q = static_cast<uint32_t>(B + 1); | ||||
|       const uint32_t tgW = std::min<uint32_t>(Q, tew); | ||||
|       MTLSize grid = MTLSizeMake(Q, 1, 1); | ||||
|       MTLSize tgs  = MTLSizeMake(tgW, 1, 1); | ||||
|  | ||||
|       mtl_setArgs(enc, | ||||
|                   indices_dim0, | ||||
|                   batch_ptr, | ||||
|                   std::array<uint32_t, 2>{static_cast<uint32_t>(nnz), | ||||
|                                           static_cast<uint32_t>(B)}); | ||||
|       [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; | ||||
|     } | ||||
|   }); | ||||
| } | ||||
|  | ||||
| static void build_row_ptr_per_batch_mps( | ||||
|     const Tensor& rows, | ||||
|     const Tensor& batch_ptr, | ||||
|     int64_t B, | ||||
|     int64_t I, | ||||
|     Tensor& row_ptr | ||||
| ) { | ||||
|   // Build per-batch CSR-style row pointer arrays from row indices sorted by batch | ||||
|   // Given: | ||||
|   //   rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch | ||||
|   //   batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b | ||||
|   // Produces: | ||||
|   //   - row_ptr: shape [B, I+1] | ||||
|   // | ||||
|   // Example (B = 2, I = 4): | ||||
|   // rows       = [0,   0,   1,  3,  0,   2,    2]   // 7 non-zero elements | ||||
|   //               └─── batch 0 ──┘  └─ batch 1 ─┘ | ||||
|   // batch_ptr  = [0, 4, 7] | ||||
|   //               │  │  └─ end of batch 1 (total nnz) | ||||
|   //               │  └──── end of batch 0/start of batch 1 | ||||
|   //               └─────── start of batch 0 | ||||
|   // | ||||
|   // per-batch row pointers (I+1 entries each): | ||||
|   //   row_ptr[0] = [0, 2, 3, 3, 4] | ||||
|   //   row_ptr[1] = [0, 1, 1, 3, 3] | ||||
|   // laid out in memory: [0, 2, 3, 3, 4,  0, 1, 1, 3, 3] | ||||
|   TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected"); | ||||
|   auto stream = getCurrentMPSStream(); | ||||
|  | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch"); | ||||
|       auto enc = stream->commandEncoder(); | ||||
|       [enc setComputePipelineState:pso]; | ||||
|  | ||||
|       const uint32_t tew = pso.threadExecutionWidth; | ||||
|       const uint32_t Qx = static_cast<uint32_t>(I + 1); | ||||
|       const uint32_t Qy = static_cast<uint32_t>(B); | ||||
|       const uint32_t tgW = std::min<uint32_t>(Qx, tew); | ||||
|  | ||||
|       MTLSize grid = MTLSizeMake(Qx, Qy, 1); | ||||
|       MTLSize tgs = MTLSizeMake(tgW, 1, 1); | ||||
|  | ||||
|       mtl_setArgs(enc, | ||||
|                   rows, | ||||
|                   batch_ptr, | ||||
|                   row_ptr, | ||||
|                   std::array<uint32_t, 2>{static_cast<uint32_t>(I), | ||||
|                                            static_cast<uint32_t>(B)}); | ||||
|       [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; | ||||
|     } | ||||
|   }); | ||||
| } | ||||
|  | ||||
| Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) { | ||||
|   TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device()); | ||||
|   TORCH_CHECK(self_.is_mps(),  "bmm_sparse: expected 'self' to be MPS, got ", self_.device()); | ||||
|   TORCH_CHECK(mat2_.is_mps(),  "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device()); | ||||
|  | ||||
|   TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim()); | ||||
|   TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim()); | ||||
|   TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim()); | ||||
|  | ||||
|   TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match"); | ||||
|   TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match"); | ||||
|  | ||||
|   const int64_t B = self_.size(0); | ||||
|   const int64_t I = self_.size(1); | ||||
|   const int64_t J = self_.size(2); | ||||
|   const int64_t K = mat2_.size(2); | ||||
|  | ||||
|   auto self = self_.coalesce(); | ||||
|   const int64_t nnz = self._nnz(); | ||||
|   if (nnz == 0) { | ||||
|     return result_.zero_(); | ||||
|   } | ||||
|  | ||||
|   const auto computeDtype = at::kFloat; | ||||
|  | ||||
|   auto indices = self._indices(); | ||||
|   auto values  = self._values(); | ||||
|  | ||||
|   auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype); | ||||
|   auto mat2_c = mat2_.scalar_type()   == computeDtype ? mat2_   : mat2_.to(computeDtype); | ||||
|   auto mat2_contig = mat2_c.contiguous(); | ||||
|  | ||||
|   auto idx_b = indices.select(0, 0).contiguous(); | ||||
|   auto idx_i = indices.select(0, 1).contiguous(); | ||||
|   auto idx_j = indices.select(0, 2).contiguous(); | ||||
|  | ||||
|   // build batch_ptr: offsets marking where each batch's entries start and end | ||||
|   // (see build_batch_ptr_mps above for a worked example) | ||||
|   auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong)); | ||||
|   build_batch_ptr_mps(idx_b, B, batch_ptr); | ||||
|   // build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals | ||||
|   auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong)); | ||||
|   build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr); | ||||
|  | ||||
|   const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous(); | ||||
|   Tensor out_buf = out_needs_cast | ||||
|       ? at::empty({B, I, K}, result_.options().dtype(computeDtype)) | ||||
|       : result_; | ||||
|   auto out_contig = out_buf.contiguous(); | ||||
|  | ||||
|   auto stream = getCurrentMPSStream(); | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values)); | ||||
|       auto enc = stream->commandEncoder(); | ||||
|       [enc setComputePipelineState:pso]; | ||||
|  | ||||
|       const uint32_t tew = pso.threadExecutionWidth; | ||||
|       const uint32_t tgW = std::min<uint32_t>((uint32_t)K, tew); | ||||
|  | ||||
|       // One threadgroup per (row i, batch b), lanes cover K | ||||
|       MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B); | ||||
|       MTLSize tgs  = MTLSizeMake(tgW, 1, 1); | ||||
|  | ||||
|       mtl_setArgs(enc, | ||||
|                   idx_i, | ||||
|                   idx_j, | ||||
|                   values_c, | ||||
|                   mat2_contig, | ||||
|                   out_contig, | ||||
|                   row_ptr, | ||||
|                   std::array<uint32_t, 4>{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K}); | ||||
|       [enc dispatchThreads:grid threadsPerThreadgroup:tgs]; | ||||
|     } | ||||
|   }); | ||||
|   if (out_needs_cast) { | ||||
|     result_.copy_(out_contig.to(result_.scalar_type())); | ||||
|   } | ||||
|   return result_; | ||||
| } | ||||
|  | ||||
| Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) { | ||||
|   Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options()); | ||||
|   return bmm_out_sparse_mps(self, mat2, result); | ||||
| } | ||||
|  | ||||
| Tensor& addmm_out_sparse_dense_mps( | ||||
|     const Tensor& self, | ||||
|     const SparseTensor& mat1, | ||||
|     const Tensor& mat2, | ||||
|     const Scalar& beta, | ||||
|     const Scalar& alpha, | ||||
|     Tensor& result) { | ||||
|   c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out"); | ||||
|   return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha); | ||||
| } | ||||
|  | ||||
| Tensor addmm_sparse_dense_mps( | ||||
|     const Tensor& self, | ||||
|     const SparseTensor& mat1, | ||||
|     const Tensor& mat2, | ||||
|     const Scalar& beta, | ||||
|     const Scalar& alpha | ||||
| ) { | ||||
|   c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out"); | ||||
|   Tensor result = at::empty({0}, self.options()); | ||||
|   return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha); | ||||
| } | ||||
|  | ||||
| static SparseTensor& mul_out_dense_sparse_mps( | ||||
|     const Tensor& dense, | ||||
|     const Tensor& sparse, | ||||
|  | ||||
| @ -1,105 +1,10 @@ | ||||
| #include <metal_stdlib> | ||||
| #include <c10/metal/indexing.h> | ||||
| #include <c10/metal/utils.h> | ||||
| using namespace c10::metal; | ||||
| using namespace metal; | ||||
|  | ||||
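| // Standard binary searches over a sorted int64 buffer: lower_bound_i64 returns | ||||
| // the first index in [lo, hi) with arr[m] >= key; upper_bound_i64 the first | ||||
| // with arr[m] > key. | ||||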
| inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) { | ||||
|   uint l = lo, r = hi; | ||||
|   while (l < r) { | ||||
|     uint m = (l + r) >> 1; | ||||
|     long v = arr[m]; | ||||
|     if (v < key) { | ||||
|       l = m + 1; | ||||
|     } else { | ||||
|       r = m; | ||||
|     } | ||||
|   } | ||||
|   return l; | ||||
| } | ||||
|  | ||||
| inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) { | ||||
|   uint l = lo, r = hi; | ||||
|   while (l < r) { | ||||
|     uint m = (l + r) >> 1; | ||||
|     long v = arr[m]; | ||||
|     if (v <= key) { | ||||
|       l = m + 1; | ||||
|     } else { | ||||
|       r = m; | ||||
|     } | ||||
|   } | ||||
|   return l; | ||||
| } | ||||
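|  | ||||
| // Together these bracket the run of entries equal to `key` in a sorted | ||||
| // array: e.g. for arr = {0, 0, 1, 3} and key = 0, lower_bound_i64 returns 0 | ||||
| // and upper_bound_i64 returns 2, so the matching slice is [0, 2). | ||||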
|  | ||||
| kernel void build_row_ptr_from_sorted_rows_by_batch( | ||||
|     device const long* rows        [[buffer(0)]], | ||||
|     device const long* batch_ptr   [[buffer(1)]], | ||||
|     device long*       row_ptr     [[buffer(2)]], | ||||
|     constant uint2&    dims        [[buffer(3)]], | ||||
|     uint3              tid         [[thread_position_in_grid]]) | ||||
| { | ||||
|   const uint I = dims.x; | ||||
|   const uint B = dims.y; | ||||
|  | ||||
|   const uint i = tid.x; | ||||
|   const uint b = tid.y; | ||||
|  | ||||
|   if (b >= B || i > I) return; | ||||
|  | ||||
|   const uint base = (uint)batch_ptr[b]; | ||||
|   const uint lim  = (uint)batch_ptr[b + 1]; | ||||
|  | ||||
|   const ulong out_base = (ulong)b * (ulong)(I + 1); | ||||
|  | ||||
|   if (i == I) { | ||||
|     row_ptr[out_base + (ulong)I] = (long)lim; | ||||
|   } else { | ||||
|     const long key = (long)i; | ||||
|     const uint pos = lower_bound_i64(rows, base, lim, key); | ||||
|     row_ptr[out_base + (ulong)i] = (long)pos; | ||||
|   } | ||||
| } | ||||
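|  | ||||
| // Worked example: within one batch, rows[base..lim) = {0, 0, 1, 1, 2} and | ||||
| // I = 3 produce the row_ptr slice {0, 2, 4, 5}; row i owns entries | ||||
| // [row_ptr[i], row_ptr[i+1]) of the sorted COO arrays. | ||||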
|  | ||||
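| // One threadgroup services one (batch, row) pair; lanes stride the K output | ||||
| // columns, and each lane walks the row's nonzeros [start, end) once per | ||||
| // column it owns, accumulating in a wider type before the final store. | ||||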
| template <typename T> | ||||
| kernel void spmm_bmm_coo_rows_grouped( | ||||
|     device const long*   rows      [[buffer(0)]], | ||||
|     device const long*   cols      [[buffer(1)]], | ||||
|     device const T*      vals      [[buffer(2)]], | ||||
|     device const T*      dense     [[buffer(3)]], | ||||
|     device T*            out       [[buffer(4)]], | ||||
|     device const long*   row_ptr   [[buffer(5)]], | ||||
|     constant uint4&      dims      [[buffer(6)]], | ||||
|     uint3                tid       [[thread_position_in_grid]], | ||||
|     uint3                ltid      [[thread_position_in_threadgroup]], | ||||
|     uint3                tptg      [[threads_per_threadgroup]]) | ||||
| { | ||||
|   const uint B = dims.x; | ||||
|   const uint I = dims.y; | ||||
|   const uint J = dims.z; | ||||
|   const uint K = dims.w; | ||||
|  | ||||
|   const uint b = tid.z; | ||||
|   const uint i = tid.y; | ||||
|   const uint lane = ltid.x; | ||||
|   const uint tgW  = tptg.x; | ||||
|  | ||||
|   const ulong rp_base = (ulong)b * (ulong)(I + 1); | ||||
|   const uint start = (uint)row_ptr[rp_base + (ulong)i]; | ||||
|   const uint end   = (uint)row_ptr[rp_base + (ulong)i + 1]; | ||||
|  | ||||
|   for (uint k = lane; k < K; k += tgW) { | ||||
|     auto acc = static_cast<accum_t<T>>(T(0)); | ||||
|     for (uint p = start; p < end; ++p) { | ||||
|       const uint c = (uint)cols[p]; | ||||
|       const auto v = static_cast<accum_t<T>>(vals[p]); | ||||
|       const uint d_off = ((b * J) + c) * K + k; | ||||
|       const auto d = static_cast<accum_t<T>>(dense[d_off]); | ||||
|       acc += mul(v, d); | ||||
|     } | ||||
|     const uint y_off = ((b * I) + i) * K + k; | ||||
|     out[y_off] = static_cast<T>(acc); | ||||
|   } | ||||
| } | ||||
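|  | ||||
| // MulAccum widens the accumulation type: float for real element types and | ||||
| // float2 for complex values stored as float2, limiting rounding error in | ||||
| // the products below. | ||||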
| template <typename T> struct MulAccum { using type = float; }; | ||||
| template <> struct MulAccum<float2> { using type = float2; }; | ||||
|  | ||||
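| // Each thread multiplies one sparse value by the dense element selected via | ||||
| // its gathered index and writes the product back in the sparse layout. | ||||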
| template <typename T> | ||||
| kernel void dense_sparse_mul_kernel( | ||||
| @ -127,9 +32,10 @@ kernel void dense_sparse_mul_kernel( | ||||
|   ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col; | ||||
|   ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col; | ||||
|  | ||||
|   const auto a = static_cast<accum_t<T>>(values[val_idx]); | ||||
|   const auto b = static_cast<accum_t<T>>(dense[dense_idx]); | ||||
|   out_values[val_idx] = static_cast<T>(mul(a, b)); | ||||
|   using accum_t = typename MulAccum<T>::type; | ||||
|   const accum_t a = static_cast<accum_t>(values[val_idx]); | ||||
|   const accum_t b = static_cast<accum_t>(dense[dense_idx]); | ||||
|   out_values[val_idx] = static_cast<T>(a * b); | ||||
| } | ||||
|  | ||||
| kernel void intersect_binary_search( | ||||
| @ -214,76 +120,6 @@ kernel void fused_gather_mul_kernel( | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| kernel void build_batch_ptr_from_sorted_batches( | ||||
|     device const long* batches       [[buffer(0)]], | ||||
|     device long*       batch_ptr     [[buffer(1)]], | ||||
|     constant uint2&    nnz_B         [[buffer(2)]], | ||||
|     uint3              tid           [[thread_position_in_grid]]) | ||||
| { | ||||
|   uint b = tid.x; | ||||
|   uint nnz = nnz_B.x; | ||||
|   uint batch = nnz_B.y; | ||||
|  | ||||
|   if (b == batch) { | ||||
|     batch_ptr[b] = (long)nnz; | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   uint lo = 0; | ||||
|   uint hi = nnz; | ||||
|   long key = (long)b; | ||||
|   while (lo < hi) { | ||||
|     uint mid = (lo + hi) >> 1; | ||||
|     long v = batches[mid]; | ||||
|     if (v < key) lo = mid + 1; | ||||
|     else         hi = mid; | ||||
|   } | ||||
|   batch_ptr[b] = (long)lo; | ||||
| } | ||||
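|  | ||||
| // Worked example: batches = {0, 0, 2} with batch = 3 yields | ||||
| // batch_ptr = {0, 2, 2, 3}; an empty batch simply repeats the next offset. | ||||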
|  | ||||
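| // Per element: out[i][k] = beta * t_in[i][k] + alpha * sum_p vals[p] * | ||||
| // dense[cols[p]][k], where p runs over the nonzeros whose row index is i. | ||||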
| template <typename T> | ||||
| kernel void spmm_addmm_coo( | ||||
|     device const long*   indices2d   [[buffer(0)]], | ||||
|     device const T*      vals        [[buffer(1)]], | ||||
|     device const T*      dense       [[buffer(2)]], | ||||
|     device const T*      t_in        [[buffer(3)]], | ||||
|     device T*            out         [[buffer(4)]], | ||||
|     constant uint3&      dims        [[buffer(5)]], | ||||
|     constant float2&     alpha_beta  [[buffer(6)]], | ||||
|     constant uint&       nnz         [[buffer(7)]], | ||||
|     uint3                tid         [[thread_position_in_grid]]) | ||||
| { | ||||
|   const uint K = dims.z; | ||||
|   const uint k = tid.x; | ||||
|   const uint i = tid.z; | ||||
|   const float alpha = alpha_beta.x; | ||||
|   const float beta = alpha_beta.y; | ||||
|  | ||||
|   device const long* rows = indices2d; | ||||
|   device const long* cols = indices2d + nnz; | ||||
|  | ||||
|   const uint start = lower_bound_i64(rows, 0u, nnz, (long)i); | ||||
|   const uint end = upper_bound_i64(rows, 0u, nnz, (long)i); | ||||
|  | ||||
|   // accumulator is float for scalar/half/bfloat and float2 for float2 | ||||
|   auto acc = static_cast<accum_t<T>>(T(0)); | ||||
|  | ||||
|   for (uint p = start; p < end; ++p) { | ||||
|     const uint c = (uint)cols[p]; | ||||
|     const auto v = static_cast<accum_t<T>>(vals[p]); | ||||
|     const uint dense_off = c * K + k; | ||||
|     const auto d = static_cast<accum_t<T>>(dense[dense_off]); | ||||
|     acc += mul(v, d); | ||||
|   } | ||||
|  | ||||
|   const uint off = i * K + k; | ||||
|   const auto base = (beta != 0.0f) ? (static_cast<accum_t<T>>(t_in[off]) * beta) : static_cast<accum_t<T>>(T(0)); | ||||
|   const auto y = base + alpha * acc; | ||||
|   out[off] = static_cast<T>(y); | ||||
| } | ||||
|  | ||||
|  | ||||
| #define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE)                                 \ | ||||
|   template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void     \ | ||||
|   dense_sparse_mul_kernel<DTYPE>(                                           \ | ||||
| @ -315,36 +151,6 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2); | ||||
|       constant uint2&     dims_output   [[buffer(8)]],                       \ | ||||
|       uint3               gid           [[thread_position_in_grid]]); | ||||
|  | ||||
| INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL); | ||||
|  | ||||
|  | ||||
| #define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE)                         \ | ||||
|   template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void    \ | ||||
|   spmm_bmm_coo_rows_grouped<DTYPE>(                                          \ | ||||
|       device const long*   rows      [[buffer(0)]],                          \ | ||||
|       device const long*   cols      [[buffer(1)]],                          \ | ||||
|       device const DTYPE*  vals      [[buffer(2)]],                          \ | ||||
|       device const DTYPE*  dense     [[buffer(3)]],                          \ | ||||
|       device DTYPE*        out       [[buffer(4)]],                          \ | ||||
|       device const long*   row_ptr   [[buffer(5)]],                          \ | ||||
|       constant uint4&      dims      [[buffer(6)]],                          \ | ||||
|       uint3                tid       [[thread_position_in_grid]],            \ | ||||
|       uint3                ltid      [[thread_position_in_threadgroup]],     \ | ||||
|       uint3                tptg      [[threads_per_threadgroup]]); | ||||
|  | ||||
| INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED); | ||||
|  | ||||
| #define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \ | ||||
|   template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void  \ | ||||
|   spmm_addmm_coo<DTYPE>(                                        \ | ||||
|     device const long*   indices2d   [[buffer(0)]],             \ | ||||
|     device const DTYPE*  vals        [[buffer(1)]],             \ | ||||
|     device const DTYPE*  dense       [[buffer(2)]],             \ | ||||
|     device const DTYPE*  t_in        [[buffer(3)]],             \ | ||||
|     device DTYPE*        out         [[buffer(4)]],             \ | ||||
|     constant uint3&      dims        [[buffer(5)]],             \ | ||||
|     constant float2&     alpha_beta  [[buffer(6)]],             \ | ||||
|     constant uint&       nnz         [[buffer(7)]],             \ | ||||
|     uint3                tid         [[thread_position_in_grid]]); | ||||
|  | ||||
| INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO); | ||||
| INSTANTIATE_FUSED_GATHER_MUL(float); | ||||
| INSTANTIATE_FUSED_GATHER_MUL(half); | ||||
| INSTANTIATE_FUSED_GATHER_MUL(bfloat); | ||||
| @ -93,7 +93,3 @@ | ||||
|           This operator does not support cudagraphs. The presence of this tag on an operator will cause | ||||
|           Inductor to split the graph around this operator. Note that operators without this tag may still | ||||
|           not support CUDAGraphs; Inductor may maintain other hardcoded lists with the same effect. | ||||
| - tag: reduction | ||||
|   desc: | | ||||
|           This tag indicates that an operator performs a reduction operation, computing aggregate values | ||||
|           (sum, mean, max, min, etc.) across one or more dimensions of the input tensor(s). | ||||
|  | ||||
| @ -202,6 +202,7 @@ supported: | ||||
|   - select_backward | ||||
|   - _trilinear | ||||
|   - linalg_pinv.atol_rtol_tensor | ||||
|   - svd | ||||
|   - logsumexp.out | ||||
| symint: | ||||
|   - empty.memory_format | ||||
|  | ||||
| @ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix): | ||||
|                         f"{output_filename.rstrip('.csv')}_{suffix}.pickle", | ||||
|                     ) | ||||
|                 ) | ||||
|             except Exception: | ||||
|                 log.exception("Failed to save memory snapshot") | ||||
|             except Exception as e: | ||||
|                 log.error("Failed to save memory snapshot, %s", e) | ||||
|  | ||||
|             torch.cuda.memory._record_memory_history(enabled=None) | ||||
|  | ||||
|  | ||||
| @ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh: | ||||
|             continue | ||||
|         batch_size = int(batch_size) | ||||
|         BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size | ||||
| assert BATCH_SIZE_KNOWN_MODELS | ||||
| assert len(BATCH_SIZE_KNOWN_MODELS) | ||||
|  | ||||
|  | ||||
| try: | ||||
|  | ||||
| @ -296,8 +296,8 @@ class OperatorInputsLoader: | ||||
|         for key in self.operator_db.keys(): | ||||
|             try: | ||||
|                 op = eval(key) | ||||
|             except AttributeError: | ||||
|                 log.warning("Evaluating an op name into an OpOverload", exc_info=True) | ||||
|             except AttributeError as ae: | ||||
|                 log.warning("Evaluating an op name into an OpOverload: %s", ae) | ||||
|                 continue | ||||
|             yield op | ||||
|  | ||||
|  | ||||
| @ -3,7 +3,6 @@ import sys | ||||
| from benchmark_base import BenchmarkBase | ||||
|  | ||||
| import torch | ||||
| from torch._dynamo.utils import CompileTimeInstructionCounter | ||||
|  | ||||
|  | ||||
| class Benchmark(BenchmarkBase): | ||||
| @ -33,11 +32,7 @@ class Benchmark(BenchmarkBase): | ||||
|     def _work(self): | ||||
|         # enable_cpp_symbolic_shape_guards has impact on this benchmark | ||||
|         # Keep using False value for consistency. | ||||
|         with ( | ||||
|             torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False), | ||||
|             torch._export.config.patch(use_new_tracer_experimental=True), | ||||
|             CompileTimeInstructionCounter.record(), | ||||
|         ): | ||||
|         with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False): | ||||
|             torch.export.export(self.m, (self.input,), strict=True) | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1719000000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
| sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1 | ||||
| sum_floordiv_regression,compile_time_instruction_count,966100000,0.1 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -127,7 +127,7 @@ def trainbench( | ||||
|         bwd_time = bwd_start_event.elapsed_time(bwd_end_event) | ||||
|         return fwd_time, bwd_time | ||||
|  | ||||
|     creator_args = { | ||||
|         "seqLength": seqLength, | ||||
|         "numLayers": numLayers, | ||||
|         "inputSize": inputSize, | ||||
|  | ||||
| @ -12,7 +12,7 @@ def modeldef(request, net_name, executor, fuser): | ||||
|  | ||||
|     # Given a 'net_name' provided by generate_tests, build the thing | ||||
|     name, rnn_creator, context = get_nn_runners(net_name)[0] | ||||
|     creator_args = { | ||||
|         "seqLength": 100, | ||||
|         "numLayers": 1, | ||||
|         "inputSize": 512, | ||||
|  | ||||
| @ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler: | ||||
|                 cur_state_dict[f"{fqn}.weight"] = int8_weight | ||||
|                 cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype) | ||||
|             elif isinstance(mod, ConditionalFeedForward): | ||||
|                 for weight_idx in range(3): | ||||
|                 for weight_idx in range(0, 3): | ||||
|                     weight_name = f"w{weight_idx + 1}" | ||||
|                     scales_name = f"scales{weight_idx + 1}" | ||||
|                     weight = getattr(mod, weight_name) | ||||
|  | ||||
| @ -1729,8 +1729,10 @@ def define_buck_targets( | ||||
|             "torch/csrc/jit/backends/backend_debug_info.cpp", | ||||
|             "torch/csrc/jit/backends/backend_interface.cpp", | ||||
|         ], | ||||
|         compiler_flags = get_pt_compiler_flags(), | ||||
|         fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags, | ||||
|         compiler_flags = get_pt_compiler_flags() + select({ | ||||
|             "DEFAULT": [], | ||||
|             "ovr_config//os:android": c2_fbandroid_xplat_compiler_flags | ||||
|         }), | ||||
|         # @lint-ignore BUCKLINT link_whole | ||||
|         link_whole = True, | ||||
|         linker_flags = get_no_as_needed_linker_flag(), | ||||
| @ -2023,6 +2025,9 @@ def define_buck_targets( | ||||
|                 "ovr_config//os:android-x86_64": [ | ||||
|                     "-mssse3", | ||||
|                 ], | ||||
|             }) + select({ | ||||
|                 "DEFAULT": [], | ||||
|                 "ovr_config//os:android": c2_fbandroid_xplat_compiler_flags, | ||||
|             }), | ||||
|             exported_preprocessor_flags = get_aten_preprocessor_flags(), | ||||
|             exported_deps = [ | ||||
|  | ||||
| @ -9,7 +9,6 @@ | ||||
|  | ||||
| #include <c10/core/Device.h> | ||||
| #include <c10/core/DeviceType.h> | ||||
| #include <c10/core/alignment.h> | ||||
| #include <c10/macros/Export.h> | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <c10/util/Exception.h> | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| #include <c10/core/AllocatorConfig.h> | ||||
| #include <c10/core/DeviceType.h> | ||||
| #include <c10/util/env.h> | ||||
|  | ||||
| namespace c10::CachingAllocator { | ||||
| @ -46,7 +47,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) { | ||||
|       63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart); | ||||
|   const size_t interval_end = | ||||
|       63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd); | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       interval_end - interval_start == kRoundUpPowerOfTwoIntervals, | ||||
|       "kRoundUpPowerOfTwoIntervals mismatch"); | ||||
|  | ||||
| @ -65,7 +66,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize( | ||||
|       std::numeric_limits<size_t>::max() / kMB; | ||||
|  | ||||
|   size_t val_env = tokenizer.toSizeT(++i); | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       val_env >= min_allowed_split_size_mb, | ||||
|       "CachingAllocator option max_split_size_mb too small, must be >= ", | ||||
|       min_allowed_split_size_mb); | ||||
| @ -84,7 +85,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize( | ||||
|       std::numeric_limits<size_t>::max() / kMB; | ||||
|  | ||||
|   size_t val_env = tokenizer.toSizeT(++i); | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       val_env >= min_allowed_split_size_mb, | ||||
|       "CachingAllocator option max_non_split_rounding_mb too small, must be >= ", | ||||
|       min_allowed_split_size_mb); | ||||
| @ -99,7 +100,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold( | ||||
|     size_t i) { | ||||
|   tokenizer.checkToken(++i, ":"); | ||||
|   double val_env = tokenizer.toDouble(++i); | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       val_env > 0 && val_env < 1.0, | ||||
|       "garbage_collect_threshold is invalid, set it in (0.0, 1.0)"); | ||||
|   garbage_collection_threshold_ = val_env; | ||||
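|   // Example (illustrative; the key name is assumed to match this parser): | ||||
|   //   PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.8 | ||||
|   // starts reclaiming cached blocks once usage crosses 80% of the allowed | ||||
|   // maximum; the parsed value must lie strictly inside (0.0, 1.0). | ||||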
| @ -120,7 +121,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( | ||||
|       size_t value_index = i; | ||||
|       tokenizer.checkToken(++i, ":"); | ||||
|       size_t value = tokenizer.toSizeT(++i); | ||||
|       TORCH_CHECK_VALUE( | ||||
|       TORCH_CHECK( | ||||
|           value == 0 || llvm::isPowerOf2_64(value), | ||||
|           "For roundups, the divisions has to be power of 2 or 0 to disable roundup "); | ||||
|  | ||||
| @ -128,13 +129,12 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( | ||||
|         std::fill( | ||||
|             std::next( | ||||
|                 roundup_power2_divisions_.begin(), | ||||
|                 static_cast<std::vector<size_t>::difference_type>( | ||||
|                     last_index + 1)), | ||||
|                 static_cast<std::vector<size_t>::difference_type>(last_index)), | ||||
|             roundup_power2_divisions_.end(), | ||||
|             value); | ||||
|       } else { | ||||
|         size_t boundary = tokenizer.toSizeT(value_index); | ||||
|         TORCH_CHECK_VALUE( | ||||
|         TORCH_CHECK( | ||||
|             llvm::isPowerOf2_64(boundary), | ||||
|             "For roundups, the intervals have to be power of 2 "); | ||||
|  | ||||
| @ -164,7 +164,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( | ||||
|         "Expected closing bracket ']' in ConfigTokenizer but reached end of config"); | ||||
|   } else { // Keep this for backwards compatibility | ||||
|     size_t value = tokenizer.toSizeT(i); | ||||
|     TORCH_CHECK_VALUE( | ||||
|     TORCH_CHECK( | ||||
|         llvm::isPowerOf2_64(value), | ||||
|         "For roundups, the divisions has to be power of 2 "); | ||||
|     std::fill( | ||||
| @ -224,7 +224,7 @@ void AcceleratorAllocatorConfig::parseArgs(const std::string& env) { | ||||
|       // If a device-specific configuration parser hook is registered, it will | ||||
|       // check if the key is unrecognized. | ||||
|       if (device_config_parser_hook_) { | ||||
|         TORCH_CHECK_VALUE( | ||||
|         TORCH_CHECK( | ||||
|             getKeys().find(key) != getKeys().end(), | ||||
|             "Unrecognized key '", | ||||
|             key, | ||||
|  | ||||
| @ -76,7 +76,7 @@ class ConfigTokenizer { | ||||
|     } else if (token == "False") { | ||||
|       return false; | ||||
|     } else { | ||||
|       TORCH_CHECK_VALUE( | ||||
|       TORCH_CHECK( | ||||
|           false, | ||||
|           "Expected 'True' or 'False' at index ", | ||||
|           i, | ||||
| @ -253,7 +253,7 @@ class C10_API AcceleratorAllocatorConfig { | ||||
|     device_config_parser_hook_ = std::move(hook); | ||||
|     auto& mutable_keys = getMutableKeys(); | ||||
|     for (auto& key : keys) { | ||||
|       TORCH_CHECK_VALUE( | ||||
|       TORCH_CHECK( | ||||
|           mutable_keys.insert(key).second, | ||||
|           "Duplicated key '", | ||||
|           key, | ||||
|  | ||||
| @ -52,9 +52,7 @@ constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | | ||||
|     // where we would like to support composite implicit kernels but not | ||||
|     // explicit kernels therefore we manually add the key to the | ||||
|     // math_dispatch_keyset | ||||
|     DispatchKeySet{DispatchKey::NestedTensor} | | ||||
|     // Functionalize should always reuse CompositeImplicit decomps. | ||||
|     DispatchKeySet{DispatchKey::Functionalize}; | ||||
|     DispatchKeySet{DispatchKey::NestedTensor}; | ||||
|  | ||||
| constexpr DispatchKeySet nested_dispatch_keyset = | ||||
|     DispatchKeySet( | ||||
|  | ||||
| @ -102,7 +102,7 @@ uint64_t getNonDeterministicRandom(bool is_cuda) { | ||||
|   } else { | ||||
|     std::random_device rd; | ||||
|     // limit to 53 bits to ensure unique representation in double | ||||
|     s = (((static_cast<uint64_t>(rd())) << 32) + rd()) & 0x1FFFFFFFFFFFFF; | ||||
|     s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; | ||||
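|     // 0x1FFFFFFFFFFFFF == 2^53 - 1: a double's 53-bit significand represents | ||||
|     // every integer up to 2^53 exactly, so the seed survives conversion to | ||||
|     // double without collisions. | ||||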
|   } | ||||
|   return s; | ||||
| } | ||||
|  | ||||
| @ -20,8 +20,7 @@ void maybeApplyRefcountedDeleter(const c10::Storage& storage) { | ||||
|   std::lock_guard<std::mutex> guard(replace_data_ptr_mutex); | ||||
|   c10::DataPtr& data_ptr = storage.mutable_data_ptr(); | ||||
|  | ||||
|   if (reinterpret_cast<const void*>(data_ptr.get_deleter()) == | ||||
|       reinterpret_cast<const void*>(&c10::refcounted_deleter)) { | ||||
|   if ((void*)data_ptr.get_deleter() == (void*)&c10::refcounted_deleter) { | ||||
|     // Data pointer is already shared | ||||
|     return; | ||||
|   } | ||||
|  | ||||
| @ -52,19 +52,6 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) | ||||
| AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT) | ||||
| #undef DEFINE_CONSTANT | ||||
|  | ||||
| inline const char* toString(ScalarType t) { | ||||
| #define DEFINE_CASE(_, name) \ | ||||
|   case ScalarType::name:     \ | ||||
|     return #name; | ||||
|  | ||||
|   switch (t) { | ||||
|     AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE) | ||||
|     default: | ||||
|       return "UNKNOWN_SCALAR"; | ||||
|   } | ||||
| #undef DEFINE_CASE | ||||
| } | ||||
|  | ||||
| inline size_t elementSize(ScalarType t) { | ||||
| #define CASE_ELEMENTSIZE_CASE(ctype, name) \ | ||||
|   case ScalarType::name:                   \ | ||||
| @ -308,12 +295,6 @@ inline bool canCast(const ScalarType from, const ScalarType to) { | ||||
|  | ||||
| C10_API ScalarType promoteTypes(ScalarType a, ScalarType b); | ||||
|  | ||||
| inline std::ostream& operator<<( | ||||
|     std::ostream& stream, | ||||
|     at::ScalarType scalar_type) { | ||||
|   return stream << toString(scalar_type); | ||||
| } | ||||
|  | ||||
| // Returns a pair of strings representing the names for each dtype. | ||||
| // The returned pair is (name, legacy_name_if_applicable) | ||||
| C10_API std::pair<std::string, std::string> getDtypeNames( | ||||
|  | ||||
| @ -4,6 +4,7 @@ | ||||
| #include <c10/core/SymNodeImpl.h> | ||||
| #include <c10/util/intrusive_ptr.h> | ||||
| #include <c10/util/safe_numerics.h> | ||||
| #include <functional> | ||||
|  | ||||
| namespace c10 { | ||||
|  | ||||
| @ -83,7 +84,7 @@ DEFINE_BINARY(max_slow_path, sym_max, SymInt) | ||||
|  | ||||
| SymInt::operator SymFloat() const { | ||||
|   if (auto ma = maybe_as_int()) { | ||||
|     return SymFloat(static_cast<double>(*ma)); | ||||
|     return SymFloat(double(*ma)); | ||||
|   } else { | ||||
|     return SymFloat(toSymNodeImplUnowned()->sym_float()); | ||||
|   } | ||||
|  | ||||
| @ -9,6 +9,7 @@ | ||||
| #include <c10/core/impl/TorchDispatchModeTLS.h> | ||||
| #include <c10/util/Logging.h> | ||||
| #include <c10/util/accumulate.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <optional> | ||||
|  | ||||
| #include <utility> | ||||
|  | ||||
| @ -1,5 +1,9 @@ | ||||
| #include <c10/core/TensorOptions.h> | ||||
|  | ||||
| #include <c10/core/Device.h> | ||||
| #include <c10/core/Layout.h> | ||||
| #include <c10/util/Optional.h> | ||||
|  | ||||
| #include <iostream> | ||||
|  | ||||
| namespace c10 { | ||||
|  | ||||
| @ -1,7 +1,6 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <cstddef> | ||||
| #include <new> | ||||
|  | ||||
| namespace c10 { | ||||
|  | ||||
| @ -19,12 +18,4 @@ constexpr size_t gPagesize = 4096; | ||||
| // since the default thp pagesize is 2MB, enable thp only | ||||
| // for buffers of size 2MB or larger to avoid memory bloating | ||||
| constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024; | ||||
|  | ||||
| // Cache line size used to avoid false sharing between threads. Falls back to | ||||
| // 64 bytes if the C++17 feature is unavailable. | ||||
| #ifdef __cpp_lib_hardware_interference_size | ||||
| using std::hardware_destructive_interference_size; | ||||
| #else | ||||
| constexpr std::size_t hardware_destructive_interference_size = 64; | ||||
| #endif | ||||
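|  | ||||
| // Illustrative use (a sketch, not part of this header): align per-shard | ||||
| // state to its own cache line so concurrent writers do not false-share: | ||||
| // | ||||
| //   struct alignas(hardware_destructive_interference_size) Shard { | ||||
| //     std::mutex mutex_;  // hot, written by many threads | ||||
| //   }; | ||||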
| } // namespace c10 | ||||
|  | ||||
| @ -2,6 +2,7 @@ | ||||
|  | ||||
| #include <c10/core/Allocator.h> | ||||
| #include <c10/core/StorageImpl.h> | ||||
| #include <c10/core/alignment.h> | ||||
| #include <c10/core/impl/COWDeleter.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/ParallelGuard.h> | ||||
| @ -44,8 +45,7 @@ bool has_simple_data_ptr(const c10::StorageImpl& storage) { | ||||
| } | ||||
|  | ||||
| bool is_cow_data_ptr(const c10::DataPtr& data_ptr) { | ||||
|   return reinterpret_cast<const void*>(data_ptr.get_deleter()) == | ||||
|       reinterpret_cast<const void*>(&cow::cow_deleter); | ||||
|   return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter; | ||||
| } | ||||
|  | ||||
| c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) { | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| #include <c10/core/DispatchKey.h> | ||||
| #include <c10/core/SafePyObject.h> | ||||
| #include <c10/core/impl/LocalDispatchKeySet.h> | ||||
| #include <c10/core/impl/TorchDispatchModeTLS.h> | ||||
| #include <c10/util/irange.h> | ||||
|  | ||||
| @ -20,7 +20,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig( | ||||
|   tokenizer.checkToken(++i, ":"); | ||||
|   i++; // Move to the value after the colon | ||||
| #ifdef USE_ROCM | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) || | ||||
|        (tokenizer[i] == PYTORCH_TOKEN2)), | ||||
|       "Unknown allocator backend, " | ||||
| @ -36,7 +36,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig( | ||||
|       " != ", | ||||
|       get()->name()); | ||||
| #else // USE_ROCM | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)), | ||||
|       "Unknown allocator backend, " | ||||
|       "options are native and " PYTORCH_TOKEN1); | ||||
| @ -109,7 +109,7 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) { | ||||
|     } else { | ||||
|       const auto& keys = | ||||
|           c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys(); | ||||
|       TORCH_CHECK_VALUE( | ||||
|       TORCH_CHECK( | ||||
|           keys.find(key) != keys.end(), | ||||
|           "Unrecognized key '", | ||||
|           key, | ||||
| @ -151,12 +151,12 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( | ||||
|     size_t i) { | ||||
|   tokenizer.checkToken(++i, ":"); | ||||
|   size_t val2 = tokenizer.toSizeT(++i); | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       llvm::isPowerOf2_64(val2), | ||||
|       "Number of register threads has to be power of 2, got ", | ||||
|       val2); | ||||
|   auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads(); | ||||
|   TORCH_CHECK_VALUE( | ||||
|   TORCH_CHECK( | ||||
|       val2 <= maxThreads, | ||||
|       "Number of register threads should be less than or equal to ", | ||||
|       maxThreads, | ||||
| @ -171,8 +171,7 @@ size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize( | ||||
|     size_t i) { | ||||
|   tokenizer.checkToken(++i, ":"); | ||||
|   size_t val2 = tokenizer.toSizeT(++i); | ||||
|   TORCH_CHECK_VALUE( | ||||
|       val2 > 0, "Pinned reserve segment size has to be greater than 0"); | ||||
|   TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0"); | ||||
|   m_pinned_reserve_segment_size_mb = val2; | ||||
|   return i; | ||||
| } | ||||
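| // Example (illustrative; the key name is assumed to match this parser): | ||||
| //   PYTORCH_CUDA_ALLOC_CONF=pinned_reserve_segment_size_mb:64 | ||||
| // reserves 64 MB pinned-memory segments; a value of 0 is rejected above. | ||||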
|  | ||||
| @ -3,7 +3,6 @@ | ||||
| #include <c10/core/AllocatorConfig.h> | ||||
| #include <c10/cuda/CUDAException.h> | ||||
| #include <c10/cuda/CUDAMacros.h> | ||||
| #include <c10/util/Deprecated.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/env.h> | ||||
|  | ||||
| @ -18,14 +17,9 @@ enum class Expandable_Segments_Handle_Type : int { | ||||
| // Environment config parser | ||||
| class C10_CUDA_API CUDAAllocatorConfig { | ||||
|  public: | ||||
|   C10_DEPRECATED_MESSAGE( | ||||
|       "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.") | ||||
|   static size_t max_split_size() { | ||||
|     return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size(); | ||||
|   } | ||||
|  | ||||
|   C10_DEPRECATED_MESSAGE( | ||||
|       "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.") | ||||
|   static double garbage_collection_threshold() { | ||||
|     return c10::CachingAllocator::AcceleratorAllocatorConfig:: | ||||
|         garbage_collection_threshold(); | ||||
| @ -70,8 +64,6 @@ class C10_CUDA_API CUDAAllocatorConfig { | ||||
|     return instance().m_pinned_num_register_threads; | ||||
|   } | ||||
|  | ||||
|   C10_DEPRECATED_MESSAGE( | ||||
|       "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.") | ||||
|   static bool pinned_use_background_threads() { | ||||
|     return c10::CachingAllocator::AcceleratorAllocatorConfig:: | ||||
|         pinned_use_background_threads(); | ||||
| @ -88,15 +80,11 @@ class C10_CUDA_API CUDAAllocatorConfig { | ||||
|     return 128; | ||||
|   } | ||||
|  | ||||
|   C10_DEPRECATED_MESSAGE( | ||||
|       "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") | ||||
|   static size_t roundup_power2_divisions(size_t size) { | ||||
|     return c10::CachingAllocator::AcceleratorAllocatorConfig:: | ||||
|         roundup_power2_divisions(size); | ||||
|   } | ||||
|  | ||||
|   C10_DEPRECATED_MESSAGE( | ||||
|       "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") | ||||
|   static std::vector<size_t> roundup_power2_divisions() { | ||||
|     return c10::CachingAllocator::AcceleratorAllocatorConfig:: | ||||
|         roundup_power2_divisions(); | ||||
| @ -107,8 +95,6 @@ class C10_CUDA_API CUDAAllocatorConfig { | ||||
|         max_non_split_rounding_size(); | ||||
|   } | ||||
|  | ||||
|   C10_DEPRECATED_MESSAGE( | ||||
|       "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.") | ||||
|   static std::string last_allocator_settings() { | ||||
|     return c10::CachingAllocator::getAllocatorSettings(); | ||||
|   } | ||||
|  | ||||
| @ -512,7 +512,7 @@ struct ExpandableSegment { | ||||
|     header.segment_size = segment_size_; | ||||
|     header.num_handles = end - begin; | ||||
|  | ||||
|     buf.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader)); | ||||
|     buf.write((const char*)&header, sizeof(ShareHeader)); | ||||
|     for (auto i : c10::irange(begin, end)) { | ||||
|       // NOLINTNEXTLINE(bugprone-unchecked-optional-access) | ||||
|       auto& handle = handles_.at(i).value(); | ||||
| @ -528,9 +528,7 @@ struct ExpandableSegment { | ||||
|         TORCH_CHECK( | ||||
|             handle.shareable_handle != std::nullopt, | ||||
|             "shareable_handle is null"); | ||||
|         buf.write( | ||||
|             reinterpret_cast<const char*>(&*handle.shareable_handle), | ||||
|             sizeof(int)); | ||||
|         buf.write((const char*)&*handle.shareable_handle, sizeof(int)); | ||||
|       } else { | ||||
|         if (!handle.shareable_handle) { | ||||
|           CUmemFabricHandle fabric_handle; | ||||
| @ -543,8 +541,7 @@ struct ExpandableSegment { | ||||
|             handle.shareable_handle != std::nullopt, | ||||
|             "shareable_handle is null"); | ||||
|         buf.write( | ||||
|             reinterpret_cast<const char*>(&*handle.shareable_handle), | ||||
|             sizeof(CUmemFabricHandle)); | ||||
|             (const char*)&*handle.shareable_handle, sizeof(CUmemFabricHandle)); | ||||
|       } | ||||
|     } | ||||
|     return rangeFromHandles(begin, end); | ||||
| @ -555,7 +552,7 @@ struct ExpandableSegment { | ||||
|       std::vector<c10::DeviceIndex> peers, | ||||
|       std::istream& buf) { | ||||
|     ShareHeader header{}; | ||||
|     buf.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader)); | ||||
|     buf.read((char*)&header, sizeof(ShareHeader)); | ||||
|     auto segment = std::make_unique<ExpandableSegment>( | ||||
|         device, std::nullopt, header.segment_size, std::move(peers)); | ||||
| // older build setups (e.g. multiwheels) do not have this syscall; it was added in 2020 | ||||
| @ -577,11 +574,11 @@ struct ExpandableSegment { | ||||
|       for (auto i : c10::irange(header.num_handles)) { | ||||
|         (void)i; | ||||
|         int fd = 0; | ||||
|         buf.read(reinterpret_cast<char*>(&fd), sizeof(int)); | ||||
|         buf.read((char*)&fd, sizeof(int)); | ||||
|         auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0); | ||||
|         if (myfd == -1) { | ||||
|           auto err = errno; | ||||
|           close(static_cast<int>(pidfd)); | ||||
|           close((int)pidfd); | ||||
|           for (auto& h : segment->handles_) { | ||||
|             C10_CUDA_DRIVER_CHECK( | ||||
|                 // NOLINTNEXTLINE(bugprone-unchecked-optional-access) | ||||
| @ -601,16 +598,15 @@ struct ExpandableSegment { | ||||
|             (void*)(uintptr_t)myfd, | ||||
|             CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); | ||||
|         LOG(INFO) << "use posix fd to import expandable segments."; | ||||
|         close(static_cast<int>(myfd)); | ||||
|         close((int)myfd); | ||||
|         segment->handles_.emplace_back(Handle{handle, std::nullopt}); | ||||
|       } | ||||
|       close(static_cast<int>(pidfd)); | ||||
|       close((int)pidfd); | ||||
|     } else { | ||||
|       for (auto i : c10::irange(header.num_handles)) { | ||||
|         (void)i; | ||||
|         CUmemFabricHandle fabric_handle; | ||||
|         buf.read( | ||||
|             reinterpret_cast<char*>(&fabric_handle), sizeof(CUmemFabricHandle)); | ||||
|         buf.read((char*)&fabric_handle, sizeof(CUmemFabricHandle)); | ||||
|         CUmemGenericAllocationHandle handle = 0; | ||||
|         C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_( | ||||
|             &handle, | ||||
| @ -941,7 +937,7 @@ class EventPool { | ||||
|  | ||||
|  private: | ||||
|   struct PerDevicePool { | ||||
|     alignas(hardware_destructive_interference_size) std::mutex mutex_; | ||||
|     alignas(64) std::mutex mutex_; | ||||
|     std::vector<std::unique_ptr<cudaEvent_t>> event_pool_; | ||||
|   }; | ||||
|   std::vector<PerDevicePool> pools_; | ||||
| @ -1063,7 +1059,7 @@ class RingBuffer { | ||||
|  | ||||
|   void setMaxEntries(size_t size) { | ||||
|     std::lock_guard<std::mutex> lk(alloc_trace_lock); | ||||
|     alloc_trace_max_entries_ = std::max(static_cast<size_t>(1), size); | ||||
|     alloc_trace_max_entries_ = std::max(size_t(1), size); | ||||
|   } | ||||
|  | ||||
|   void insertEntries(const T& entry) { | ||||
| @ -1264,9 +1260,6 @@ class DeviceCachingAllocator { | ||||
|   // thread local compile context for each device | ||||
|   static thread_local std::stack<std::string> compile_context; | ||||
|  | ||||
|   // thread local user metadata for annotating allocations | ||||
|   static thread_local std::string user_metadata; | ||||
|  | ||||
|  public: | ||||
|   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) | ||||
|   explicit DeviceCachingAllocator(c10::DeviceIndex id) | ||||
| @ -1274,7 +1267,7 @@ class DeviceCachingAllocator { | ||||
|         large_blocks(/*small=*/false), | ||||
|         small_blocks(/*small=*/true) { | ||||
|     stats.max_split_size = | ||||
|         static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size()); | ||||
|         static_cast<int64_t>(CUDAAllocatorConfig::max_split_size()); | ||||
|     context_recorder_.store(nullptr); | ||||
|   } | ||||
|  | ||||
| @ -1309,14 +1302,6 @@ class DeviceCachingAllocator { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   void setUserMetadata(const std::string& metadata) { | ||||
|     user_metadata = metadata; | ||||
|   } | ||||
|  | ||||
|   std::string getUserMetadata() { | ||||
|     return user_metadata; | ||||
|   } | ||||
|  | ||||
|   bool checkPoolLiveAllocations( | ||||
|       MempoolId_t mempool_id, | ||||
|       const std::unordered_set<void*>& expected_live_allocations) const { | ||||
| @ -1409,8 +1394,7 @@ class DeviceCachingAllocator { | ||||
|       // Do garbage collection if the flag is set. | ||||
|       if (C10_UNLIKELY( | ||||
|               set_fraction && | ||||
|               AcceleratorAllocatorConfig::garbage_collection_threshold() > | ||||
|                   0.0)) { | ||||
|               CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { | ||||
|         garbage_collect_cached_blocks(context); | ||||
|       } | ||||
|       // Attempt allocate | ||||
| @ -1662,7 +1646,7 @@ class DeviceCachingAllocator { | ||||
|       stats.active_bytes[stat_type].increase(block->size); | ||||
|       stats.requested_bytes[stat_type].increase(block->requested_size); | ||||
|     }); | ||||
|     if (block->size >= AcceleratorAllocatorConfig::max_split_size()) | ||||
|     if (block->size >= CUDAAllocatorConfig::max_split_size()) | ||||
|       stats.oversize_allocations.increase(1); | ||||
|  | ||||
|     auto allocated_bytes_gauge = | ||||
| @ -1931,7 +1915,7 @@ class DeviceCachingAllocator { | ||||
|         block->pool->owner_MempoolId(), | ||||
|         context ? context : block->context_when_allocated); | ||||
|  | ||||
|     if (block->size >= AcceleratorAllocatorConfig::max_split_size()) | ||||
|     if (block->size >= CUDAAllocatorConfig::max_split_size()) | ||||
|       stats.oversize_allocations.decrease(1); | ||||
|  | ||||
|     // If the block has been used on more than one stream, handle accordingly. | ||||
| @ -1995,16 +1979,15 @@ class DeviceCachingAllocator { | ||||
|       while (base_block->prev) { | ||||
|         base_block = base_block->prev; | ||||
|       } | ||||
|       offset = static_cast<const char*>(block->ptr) - | ||||
|           static_cast<const char*>(base_block->ptr); | ||||
|       offset = (char*)block->ptr - (char*)base_block->ptr; | ||||
|       cudaIpcMemHandle_t handle; | ||||
|       C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr)); | ||||
|       ss.write(reinterpret_cast<const char*>(&handle), CUDA_IPC_HANDLE_SIZE); | ||||
|       ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE); | ||||
|     } else { | ||||
|       ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT); | ||||
|       auto full_range = block->expandable_segment_->share( | ||||
|           SegmentRange(block->ptr, block->size), ss); | ||||
|       offset = static_cast<const char*>(block->ptr) - full_range.ptr; | ||||
|       offset = (char*)block->ptr - full_range.ptr; | ||||
|     } | ||||
|     return ShareableHandle{offset, ss.str()}; | ||||
|   } | ||||
| @ -2505,8 +2488,7 @@ class DeviceCachingAllocator { | ||||
|     if (size < kMinBlockSize) { | ||||
|       return kMinBlockSize; | ||||
|     } else { | ||||
|       auto divisions = | ||||
|           AcceleratorAllocatorConfig::roundup_power2_divisions(size); | ||||
|       auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size); | ||||
|       if (divisions > 1 && size > (kMinBlockSize * divisions)) { | ||||
|         return roundup_power2_next_division(size, divisions); | ||||
|       } else { | ||||
| @ -3000,7 +2982,7 @@ class DeviceCachingAllocator { | ||||
|     if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) { | ||||
|       return remaining >= kMinBlockSize; | ||||
|     } else { | ||||
|       return (size < AcceleratorAllocatorConfig::max_split_size()) && | ||||
|       return (size < CUDAAllocatorConfig::max_split_size()) && | ||||
|           (remaining > kSmallSize); | ||||
|     } | ||||
|   } | ||||
| @ -3020,7 +3002,7 @@ class DeviceCachingAllocator { | ||||
|  | ||||
|     if (C10_UNLIKELY( | ||||
|             set_fraction && | ||||
|             AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { | ||||
|             CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { | ||||
|       // Track block reuse interval only when garbage collection is enabled. | ||||
|       ++pool.get_free_blocks_call_count; | ||||
|     } | ||||
| @ -3062,13 +3044,13 @@ class DeviceCachingAllocator { | ||||
|     } | ||||
|  | ||||
|     // Do not return an oversized block for a large request | ||||
|     if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) && | ||||
|         ((*it)->size >= AcceleratorAllocatorConfig::max_split_size())) | ||||
|     if ((p.size() < CUDAAllocatorConfig::max_split_size()) && | ||||
|         ((*it)->size >= CUDAAllocatorConfig::max_split_size())) | ||||
|       return false; | ||||
|     // Allow oversized block size to be rounded up but within a limit | ||||
|     if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) && | ||||
|     if ((p.size() >= CUDAAllocatorConfig::max_split_size()) && | ||||
|         ((*it)->size >= | ||||
|          p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size())) | ||||
|          p.size() + CUDAAllocatorConfig::max_non_split_rounding_size())) | ||||
|       return false; | ||||
|     p.block = *it; | ||||
|     pool.blocks.erase(it); | ||||
| @ -3091,7 +3073,7 @@ class DeviceCachingAllocator { | ||||
|     // therefore should be of less overheads. | ||||
|  | ||||
|     size_t gc_threshold = static_cast<size_t>( | ||||
|         AcceleratorAllocatorConfig::garbage_collection_threshold() * | ||||
|         CUDAAllocatorConfig::garbage_collection_threshold() * | ||||
|         static_cast<double>(allowed_memory_maximum)); | ||||
|     // No need to trigger GC yet | ||||
|     if (total_allocated_memory <= gc_threshold) { | ||||
| @ -3234,13 +3216,12 @@ class DeviceCachingAllocator { | ||||
|     } | ||||
|  | ||||
|     total_allocated_memory += size; | ||||
|     p.block = new Block( | ||||
|         p.device(), p.stream(), size, p.pool, static_cast<char*>(ptr)); | ||||
|     p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); | ||||
|     for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { | ||||
|       stats.segment[stat_type].increase(1); | ||||
|       stats.reserved_bytes[stat_type].increase(size); | ||||
|     }); | ||||
|     if (size >= AcceleratorAllocatorConfig::max_split_size()) | ||||
|     if (size >= CUDAAllocatorConfig::max_split_size()) | ||||
|       stats.oversize_segments.increase(1); | ||||
|     auto reserved_bytes_gauge = | ||||
|         STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes); | ||||
| @ -3269,7 +3250,7 @@ class DeviceCachingAllocator { | ||||
|   bool release_available_cached_blocks( | ||||
|       const AllocParams& p, | ||||
|       const std::shared_ptr<GatheredContext>& context) { | ||||
|     if (AcceleratorAllocatorConfig::max_split_size() == | ||||
|     if (CUDAAllocatorConfig::max_split_size() == | ||||
|         std::numeric_limits<size_t>::max()) | ||||
|       return false; | ||||
|     BlockPool& pool = *p.pool; | ||||
| @ -3277,8 +3258,8 @@ class DeviceCachingAllocator { | ||||
|     // because of std::unique_ptr, block cannot be trivially copied | ||||
|     // Use constructor for search key. | ||||
|     Block key(p.search_key.device, p.search_key.stream, p.search_key.size); | ||||
|     key.size = (key.size < AcceleratorAllocatorConfig::max_split_size()) | ||||
|         ? AcceleratorAllocatorConfig::max_split_size() | ||||
|     key.size = (key.size < CUDAAllocatorConfig::max_split_size()) | ||||
|         ? CUDAAllocatorConfig::max_split_size() | ||||
|         : key.size; | ||||
|     auto it = pool.blocks.lower_bound(&key); | ||||
|     if (it == pool.blocks.end() || (*it)->stream != p.stream() || | ||||
| @ -3291,7 +3272,7 @@ class DeviceCachingAllocator { | ||||
|       --it; // Back up one item.  Now on the largest block for the correct | ||||
|             // stream | ||||
|       while ((totalReleased < key.size) && | ||||
|              ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) && | ||||
|              ((*it)->size >= CUDAAllocatorConfig::max_split_size()) && | ||||
|              ((*it)->stream == p.stream())) { | ||||
|         auto cur = it; | ||||
|         bool is_first = cur == pool.blocks.begin(); | ||||
| @ -3416,7 +3397,7 @@ class DeviceCachingAllocator { | ||||
|         stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)] | ||||
|             .current); | ||||
|  | ||||
|     if (block->size >= AcceleratorAllocatorConfig::max_split_size()) | ||||
|     if (block->size >= CUDAAllocatorConfig::max_split_size()) | ||||
|       stats.oversize_segments.decrease(1); | ||||
|     pool->blocks.erase(block); | ||||
|     delete block; | ||||
| @ -3701,8 +3682,7 @@ class DeviceCachingAllocator { | ||||
|         mempool_id, | ||||
|         getApproximateTime(), | ||||
|         record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr, | ||||
|         compile_string, | ||||
|         user_metadata); | ||||
|         compile_string); | ||||
|  | ||||
|     // Callbacks should not include any Pytorch call | ||||
|     for (const auto& cb : trace_trackers_) { | ||||
| @ -3757,7 +3737,11 @@ static void uncached_delete(void* ptr) { | ||||
|  | ||||
| static void local_raw_delete(void* ptr); | ||||
| thread_local std::stack<std::string> DeviceCachingAllocator::compile_context; | ||||
| thread_local std::string DeviceCachingAllocator::user_metadata; | ||||
| #ifdef __cpp_lib_hardware_interference_size | ||||
| using std::hardware_destructive_interference_size; | ||||
| #else | ||||
| static constexpr std::size_t hardware_destructive_interference_size = 64; | ||||
| #endif | ||||
|  | ||||
| class NativeCachingAllocator : public CUDAAllocator { | ||||
|  private: | ||||
| @ -3778,7 +3762,7 @@ class NativeCachingAllocator : public CUDAAllocator { | ||||
|       allocated_blocks; | ||||
|  | ||||
|   static size_t get_mutex_shard_id(void* ptr) { | ||||
|     return twang_mix64(reinterpret_cast<uintptr_t>(ptr)) % kNumMutexShard; | ||||
|     return twang_mix64((size_t)ptr) % kNumMutexShard; | ||||
|   } | ||||
|  | ||||
|   void add_allocated_block(Block* block) { | ||||
| @ -3815,8 +3799,8 @@ class NativeCachingAllocator : public CUDAAllocator { | ||||
|     if (size < device_count) { | ||||
|       device_allocator.resize(device_count); | ||||
|       for (const auto i : c10::irange(size, device_count)) { | ||||
|         device_allocator[i] = std::make_unique<DeviceCachingAllocator>( | ||||
|             static_cast<c10::DeviceIndex>(i)); | ||||
|         device_allocator[i] = | ||||
|             std::make_unique<DeviceCachingAllocator>(c10::DeviceIndex(i)); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| @ -3950,18 +3934,6 @@ class NativeCachingAllocator : public CUDAAllocator { | ||||
|     device_allocator[device]->popCompileContext(); | ||||
|   } | ||||
|  | ||||
|   void setUserMetadata(const std::string& metadata) override { | ||||
|     c10::DeviceIndex device = 0; | ||||
|     C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); | ||||
|     device_allocator[device]->setUserMetadata(metadata); | ||||
|   } | ||||
|  | ||||
|   std::string getUserMetadata() override { | ||||
|     c10::DeviceIndex device = 0; | ||||
|     C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); | ||||
|     return device_allocator[device]->getUserMetadata(); | ||||
|   } | ||||
|  | ||||
|   bool isHistoryEnabled() override { | ||||
|     c10::DeviceIndex device = 0; | ||||
|     C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); | ||||
| @ -4062,8 +4034,8 @@ class NativeCachingAllocator : public CUDAAllocator { | ||||
|  | ||||
|     auto& md = result.config_metadata; | ||||
|     md.garbage_collection_threshold = | ||||
|         AcceleratorAllocatorConfig::garbage_collection_threshold(); | ||||
|     md.max_split_size = AcceleratorAllocatorConfig::max_split_size(); | ||||
|         CUDAAllocatorConfig::garbage_collection_threshold(); | ||||
|     md.max_split_size = CUDAAllocatorConfig::max_split_size(); | ||||
|     md.pinned_num_register_threads = | ||||
|         CUDAAllocatorConfig::pinned_num_register_threads(); | ||||
|     md.expandable_segments = CUDAAllocatorConfig::expandable_segments(); | ||||
| @ -4071,12 +4043,11 @@ class NativeCachingAllocator : public CUDAAllocator { | ||||
|         CUDAAllocatorConfig::release_lock_on_cudamalloc(); | ||||
|     md.pinned_use_host_register = | ||||
|         CUDAAllocatorConfig::pinned_use_cuda_host_register(); | ||||
|     md.last_allocator_settings = | ||||
|         AcceleratorAllocatorConfig::last_allocator_settings(); | ||||
|     md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings(); | ||||
|     md.graph_capture_record_stream_reuse = | ||||
|         CUDAAllocatorConfig::graph_capture_record_stream_reuse(); | ||||
|     md.roundup_power2_divisions = | ||||
|         AcceleratorAllocatorConfig::roundup_power2_divisions(); | ||||
|         CUDAAllocatorConfig::roundup_power2_divisions(); | ||||
|  | ||||
|     return result; | ||||
|   } | ||||
| @ -4345,7 +4316,7 @@ class NativeCachingAllocator : public CUDAAllocator { | ||||
|         // SHARABLE_CUDA_MALLOC | ||||
|       if (type == SHAREABLE_CUDA_MALLOC) { | ||||
|         cudaIpcMemHandle_t cuda_handle; | ||||
|         ss.read(reinterpret_cast<char*>(&cuda_handle), CUDA_IPC_HANDLE_SIZE); | ||||
|         ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE); | ||||
|         C10_CUDA_CHECK(cudaIpcOpenMemHandle( | ||||
|             &cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess)); | ||||
|       } else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) { | ||||
| @ -4454,12 +4425,11 @@ CUDAAllocator* allocator(); | ||||
| } // namespace CudaMallocAsync | ||||
|  | ||||
| struct BackendStaticInitializer { | ||||
|   // Parses the environment configuration for CUDA/ROCm allocator backend at | ||||
|   // load time. This duplicates some logic from CUDAAllocatorConfig to ensure | ||||
|   // lazy initialization without triggering global static constructors. The | ||||
|   // function looks for the key "backend" and returns the appropriate allocator | ||||
|   // instance based on its value. If no valid configuration is found, it falls | ||||
|   // back to the default Native allocator. | ||||
|   // Parses the env var for the backend key at load time, duplicating some | ||||
|   // logic from CUDAAllocatorConfig, which double-checks it later at runtime. | ||||
|   // Verbose exceptions and error checks, including CUDA version checks, are | ||||
|   // deferred to that runtime double-check. If this works, maybe we should | ||||
|   // move all of CUDAAllocatorConfig here? | ||||
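|   // Example (illustrative): PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync | ||||
|   // selects the async backend parsed below, while backend:native selects the | ||||
|   // caching allocator in this file; native is also the fallback when no | ||||
|   // backend key is present. | ||||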
|   CUDAAllocator* parseEnvForBackend() { | ||||
|     auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); | ||||
| #ifdef USE_ROCM | ||||
| @ -4468,35 +4438,34 @@ struct BackendStaticInitializer { | ||||
|       val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF"); | ||||
|     } | ||||
| #endif | ||||
|     if (!val.has_value()) { | ||||
|       val = c10::utils::get_env("PYTORCH_ALLOC_CONF"); | ||||
|     } | ||||
|     if (val.has_value()) { | ||||
|       c10::CachingAllocator::ConfigTokenizer tokenizer(val.value()); | ||||
|       for (size_t i = 0; i < tokenizer.size(); i++) { | ||||
|         const auto& key = tokenizer[i]; | ||||
|         if (key == "backend") { | ||||
|           tokenizer.checkToken(++i, ":"); | ||||
|           i++; // Move to the value after the colon | ||||
|           if (tokenizer[i] == "cudaMallocAsync" | ||||
|       const std::string& config = val.value(); | ||||
|  | ||||
|       std::regex exp("[\\s,]+"); | ||||
|       std::sregex_token_iterator it(config.begin(), config.end(), exp, -1); | ||||
|       std::sregex_token_iterator end; | ||||
|       std::vector<std::string> options(it, end); | ||||
|  | ||||
|       for (auto option : options) { | ||||
|         std::regex exp2("[:]+"); | ||||
|         std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1); | ||||
|         std::sregex_token_iterator end2; | ||||
|         std::vector<std::string> kv(it2, end2); | ||||
|         if (kv.size() >= 2) { | ||||
|           if (kv[0] == "backend") { | ||||
| #ifdef USE_ROCM | ||||
|               // convenience for ROCm users to allow either CUDA or HIP env var | ||||
|               || tokenizer[i] == "hipMallocAsync" | ||||
|             // convenience for ROCm users to allow either CUDA or HIP env var | ||||
|             if (kv[1] == "cudaMallocAsync" || kv[1] == "hipMallocAsync") | ||||
| #else | ||||
|             if (kv[1] == "cudaMallocAsync") | ||||
| #endif | ||||
|           ) { | ||||
|             return CudaMallocAsync::allocator(); | ||||
|               return CudaMallocAsync::allocator(); | ||||
|             if (kv[1] == "native") | ||||
|               return &Native::allocator; | ||||
|           } | ||||
|           break; | ||||
|         } else { | ||||
|           // Skip the key and its value | ||||
|           i = tokenizer.skipKey(i); | ||||
|         } | ||||
|         if (i + 1 < tokenizer.size()) { | ||||
|           tokenizer.checkToken(++i, ","); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|     // Default fallback allocator. | ||||
|     return &Native::allocator; | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -118,8 +118,7 @@ struct TraceEntry { | ||||
|       MempoolId_t mempool, | ||||
|       approx_time_t time, | ||||
|       std::shared_ptr<GatheredContext> context = nullptr, | ||||
|       std::string compile_context = "", | ||||
|       std::string user_metadata = "") | ||||
|       std::string compile_context = "") | ||||
|       : action_(action), | ||||
|         device_(device), | ||||
|         addr_(addr), | ||||
| @ -127,8 +126,7 @@ struct TraceEntry { | ||||
|         stream_(stream), | ||||
|         size_(size), | ||||
|         mempool_(std::move(mempool)), | ||||
|         compile_context_(std::move(compile_context)), | ||||
|         user_metadata_(std::move(user_metadata)) { | ||||
|         compile_context_(std::move(compile_context)) { | ||||
|     time_.approx_t_ = time; | ||||
|   } | ||||
|   Action action_; | ||||
| @ -140,7 +138,6 @@ struct TraceEntry { | ||||
|   MempoolId_t mempool_; | ||||
|   trace_time_ time_{}; | ||||
|   std::string compile_context_; | ||||
|   std::string user_metadata_; | ||||
| }; | ||||
|  | ||||
| // Calls made by record_function will save annotations | ||||
| @ -300,10 +297,6 @@ class CUDAAllocator : public DeviceAllocator { | ||||
|       const std::vector<std::pair<std::string, std::string>>& /*md*/) {} | ||||
|   virtual void pushCompileContext(std::string& md) {} | ||||
|   virtual void popCompileContext() {} | ||||
|   virtual void setUserMetadata(const std::string& metadata) {} | ||||
|   virtual std::string getUserMetadata() { | ||||
|     return ""; | ||||
|   } | ||||
|   virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; | ||||
|  | ||||
|   // Attached AllocatorTraceTracker callbacks will be called while the | ||||
| @ -543,14 +536,6 @@ inline void enablePeerAccess( | ||||
|   get()->enablePeerAccess(dev, dev_to_access); | ||||
| } | ||||
|  | ||||
| inline void setUserMetadata(const std::string& metadata) { | ||||
|   get()->setUserMetadata(metadata); | ||||
| } | ||||
|  | ||||
| inline std::string getUserMetadata() { | ||||
|   return get()->getUserMetadata(); | ||||
| } | ||||
|  | ||||
| } // namespace c10::cuda::CUDACachingAllocator | ||||
|  | ||||
| namespace c10::cuda { | ||||
|  | ||||
| @ -1,6 +1,8 @@ | ||||
| #include <c10/cuda/CUDADeviceAssertionHost.h> | ||||
| #include <c10/cuda/CUDAException.h> | ||||
| #include <c10/cuda/CUDAFunctions.h> | ||||
| #include <c10/util/Backtrace.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/env.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <cuda_runtime.h> | ||||
|  | ||||
| @ -4,6 +4,7 @@ | ||||
| #include <c10/cuda/CUDAGuard.h> | ||||
| #include <c10/util/UniqueVoidPtr.h> | ||||
| #include <c10/util/flat_hash_map.h> | ||||
| #include <c10/util/irange.h> | ||||
|  | ||||
| #include <unordered_set> | ||||
| #include <vector> | ||||
| @ -46,7 +47,7 @@ bool operator==(const UsageStream& lhs, const UsageStream& rhs) { | ||||
|  | ||||
| struct UsageStreamHash { | ||||
|   size_t operator()(const UsageStream& us) const noexcept { | ||||
|     return std::hash<void*>{}(us.stream) + static_cast<size_t>(us.device); | ||||
|     return std::hash<void*>{}(us.stream) + size_t(us.device); | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -1,6 +1,7 @@ | ||||
| #include <c10/cuda/CUDAMiscFunctions.h> | ||||
| #include <c10/util/env.h> | ||||
| #include <cuda_runtime.h> | ||||
| #include <cstring> | ||||
| #include <string> | ||||
|  | ||||
| namespace c10::cuda { | ||||
|  | ||||
| @ -128,7 +128,7 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) { | ||||
|   } else if (s.isExt()) { | ||||
|     stream << "EXT"; | ||||
|   } else { | ||||
|     stream << "PRIORITY " << static_cast<int>(s.getStreamType()); | ||||
|     stream << "PRIORITY " << int(s.getStreamType()); | ||||
|   } | ||||
|   return stream; | ||||
| } | ||||
|  | ||||
| @ -1,6 +1,7 @@ | ||||
| #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) | ||||
| #include <c10/cuda/CUDAException.h> | ||||
| #include <c10/cuda/driver_api.h> | ||||
| #include <c10/util/CallOnce.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/Logging.h> | ||||
| #include <cuda_runtime.h> | ||||
|  | ||||
| @ -328,21 +328,5 @@ struct pair { | ||||
|   T2 second; | ||||
| }; | ||||
|  | ||||
| #define INSTANTIATE_FOR_ALL_TYPES(MACRO) \ | ||||
|   MACRO(float);                          \ | ||||
|   MACRO(half);                           \ | ||||
|   MACRO(bfloat);                         \ | ||||
|   MACRO(float2);                         \ | ||||
|   MACRO(long);                           \ | ||||
|   MACRO(char);                           \ | ||||
|   MACRO(uchar);                          \ | ||||
|   MACRO(short);                          \ | ||||
|   MACRO(int); | ||||
|  | ||||
| #define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \ | ||||
|   MACRO(float);                            \ | ||||
|   MACRO(half);                             \ | ||||
|   MACRO(bfloat); | ||||
|  | ||||
| } // namespace metal | ||||
| } // namespace c10 | ||||
|  | ||||
| @ -67,8 +67,8 @@ TEST(AllocatorConfigTest, allocator_config_test) { | ||||
|   EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(128 * kMB), 2); | ||||
|   EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(256 * kMB), 4); | ||||
|   EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 2); | ||||
|   EXPECT_EQ( | ||||
|       AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4); | ||||
|   // EXPECT_EQ( | ||||
|   //     AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 4); | ||||
|   EXPECT_EQ( | ||||
|       AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 1); | ||||
|   EXPECT_EQ( | ||||
| @ -101,8 +101,8 @@ TEST(AllocatorConfigTest, allocator_config_test) { | ||||
|   EXPECT_EQ(AcceleratorAllocatorConfig::roundup_power2_divisions(512 * kMB), 1); | ||||
|   EXPECT_EQ( | ||||
|       AcceleratorAllocatorConfig::roundup_power2_divisions(1024 * kMB), 0); | ||||
|   EXPECT_EQ( | ||||
|       AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8); | ||||
|   // EXPECT_EQ( | ||||
|   //     AcceleratorAllocatorConfig::roundup_power2_divisions(2048 * kMB), 8); | ||||
|   EXPECT_EQ( | ||||
|       AcceleratorAllocatorConfig::roundup_power2_divisions(4096 * kMB), 2); | ||||
|  | ||||
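The expectations above look up which `roundup_power2_divisions` setting applies to a given size bucket. The setting itself drives a rounding rule: a request that falls between two powers of two is rounded up to the next multiple of (lower power of two / divisions). A sketch of that rule under the documented semantics, not the allocator's own code:

    #include <cstdint>
    #include <iostream>

    // Round `size` up to the next power-of-2 division boundary; assumes
    // size >= divisions, as with the MB-scale sizes in the tests above.
    uint64_t round_up_pow2_divisions(uint64_t size, uint64_t divisions) {
      uint64_t pow2 = 1;
      while (pow2 * 2 <= size) pow2 *= 2; // largest power of two <= size
      uint64_t step = pow2 / divisions;   // width of one division
      return ((size + step - 1) / step) * step;
    }

    int main() {
      // 1200 lies between 1024 and 2048; with 4 divisions the boundaries
      // are 1280, 1536, 1792, 2048, so 1200 rounds up to 1280.
      std::cout << round_up_pow2_divisions(1200, 4) << '\n';
    }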
|  | ||||
| @ -1,6 +1,7 @@ | ||||
| #include <c10/util/ApproximateClock.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/ArrayRef.h> | ||||
| #include <c10/util/irange.h> | ||||
| #include <fmt/format.h> | ||||
|  | ||||
| namespace c10 { | ||||
|  | ||||
| @ -46,8 +47,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter:: | ||||
|   for (const auto i : c10::irange(replicates)) { | ||||
|     auto delta_ns = end_times[i].t_ - start_times_[i].t_; | ||||
|     auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_; | ||||
|     scale_factors[i] = | ||||
|         static_cast<double>(delta_ns) / static_cast<double>(delta_approx); | ||||
|     scale_factors[i] = (double)delta_ns / (double)delta_approx; | ||||
|   } | ||||
|   std::sort(scale_factors.begin(), scale_factors.end()); | ||||
|   long double scale_factor = scale_factors[replicates / 2 + 1]; | ||||
| @ -65,8 +65,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter:: | ||||
|   for (const auto i : c10::irange(replicates)) { | ||||
|     auto dt = start_times_[i].t_ - t0; | ||||
|     auto dt_approx = | ||||
|         static_cast<double>(start_times_[i].approx_t_ - t0_approx) * | ||||
|         scale_factor; | ||||
|         (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor; | ||||
|     t0_correction[i] = dt - (time_t)dt_approx; // NOLINT | ||||
|   } | ||||
|   t0 += t0_correction[t0_correction.size() / 2 + 1]; // NOLINT | ||||
| @ -74,9 +73,7 @@ std::function<time_t(approx_time_t)> ApproximateClockToUnixTimeConverter:: | ||||
|   return [=](approx_time_t t_approx) { | ||||
|     // See above for why this is more stable than `A * t_approx + B`. | ||||
|     return t_approx > t0_approx | ||||
|         ? static_cast<time_t>( | ||||
|               static_cast<double>(t_approx - t0_approx) * scale_factor) + | ||||
|             t0 | ||||
|         ? (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0 | ||||
|         : 0; | ||||
|   }; | ||||
| } | ||||
|  | ||||
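The converter above estimates the approx-clock-to-nanosecond scale as a median over replicated interval measurements, so a few replicates inflated by preemption do not skew the result. A reduced sketch of that median-of-ratios step, with synthetic numbers in place of real clock readings:

    #include <algorithm>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
      // (delta_ns, delta_approx) per replicate; the third pair is an
      // outlier, as if the thread was preempted mid-measurement.
      std::vector<std::pair<double, double>> deltas = {
          {1000, 2000}, {1010, 2020}, {5000, 2000}, {990, 1980}, {1005, 2010}};
      std::vector<double> scale_factors;
      scale_factors.reserve(deltas.size());
      for (const auto& d : deltas)
        scale_factors.push_back(d.first / d.second);
      std::sort(scale_factors.begin(), scale_factors.end());
      // The median ignores the outlier: prints 0.5 (ns per approx tick).
      std::cout << scale_factors[scale_factors.size() / 2] << '\n';
    }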
| @ -18,6 +18,7 @@ | ||||
| #include <c10/macros/Macros.h> | ||||
| #include <c10/util/Exception.h> | ||||
| #include <c10/util/SmallVector.h> | ||||
| #include <torch/headeronly/util/HeaderOnlyArrayRef.h> | ||||
|  | ||||
| #include <array> | ||||
| #include <cstddef> | ||||
| @ -40,200 +41,106 @@ namespace c10 { | ||||
| /// | ||||
| /// This is intended to be trivially copyable, so it should be passed by | ||||
| /// value. | ||||
| /// | ||||
| /// NOTE: We have refactored out the headeronly parts of the ArrayRef struct | ||||
| /// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of | ||||
| /// the underlying constexpr calls, we rely on apparent-type dispatch for | ||||
| /// inheritance. This should be fine because their memory format is the same, | ||||
| /// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods. | ||||
| /// However, you should prefer to use ArrayRef when possible, because its use | ||||
| /// of TORCH_CHECK will lead to better user-facing error messages. | ||||
| template <typename T> | ||||
| class ArrayRef final { | ||||
| class ArrayRef final : public HeaderOnlyArrayRef<T> { | ||||
|  public: | ||||
|   using iterator = const T*; | ||||
|   using const_iterator = const T*; | ||||
|   using size_type = size_t; | ||||
|   using value_type = T; | ||||
|  | ||||
|   using reverse_iterator = std::reverse_iterator<iterator>; | ||||
|  | ||||
|  private: | ||||
|   /// The start of the array, in an external buffer. | ||||
|   const T* Data; | ||||
|  | ||||
|   /// The number of elements. | ||||
|   size_type Length; | ||||
|  | ||||
|   void debugCheckNullptrInvariant() { | ||||
|     TORCH_INTERNAL_ASSERT_DEBUG_ONLY( | ||||
|         Data != nullptr || Length == 0, | ||||
|         "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); | ||||
|   } | ||||
|  | ||||
|  public: | ||||
|   /// @name Constructors | ||||
|   /// @name Constructors, all inherited from HeaderOnlyArrayRef except for | ||||
|   /// SmallVector. | ||||
|   /// @{ | ||||
|  | ||||
|   /// Construct an empty ArrayRef. | ||||
|   /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} | ||||
|   using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef; | ||||
|  | ||||
|   /// Construct an ArrayRef from a single element. | ||||
|   // TODO Make this explicit | ||||
|   constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} | ||||
|  | ||||
|   /// Construct an ArrayRef from a pointer and length. | ||||
|   constexpr ArrayRef(const T* data, size_t length) | ||||
|       : Data(data), Length(length) { | ||||
|     debugCheckNullptrInvariant(); | ||||
|   } | ||||
|  | ||||
|   /// Construct an ArrayRef from a range. | ||||
|   constexpr ArrayRef(const T* begin, const T* end) | ||||
|       : Data(begin), Length(end - begin) { | ||||
|     debugCheckNullptrInvariant(); | ||||
|   } | ||||
|   /// Construct an ArrayRef from a std::vector. | ||||
|   /// This constructor is identical to the one in HeaderOnlyArrayRef, but we | ||||
|   /// include it to help with Class Template Argument Deduction (CTAD). | ||||
|   /// Without it, CTAD can sometimes fail because the inherited | ||||
|   /// constructors are only found indirectly, so we declare this one | ||||
|   /// explicitly. | ||||
|   template <typename A> | ||||
|   /* implicit */ ArrayRef(const std::vector<T, A>& Vec) | ||||
|       : HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {} | ||||
|  | ||||
|   /// Construct an ArrayRef from a SmallVector. This is templated in order to | ||||
|   /// avoid instantiating SmallVectorTemplateCommon<T> whenever we | ||||
|   /// copy-construct an ArrayRef. | ||||
|   /// NOTE: this is the only constructor that is not inherited from | ||||
|   /// HeaderOnlyArrayRef. | ||||
|   template <typename U> | ||||
|   /* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec) | ||||
|       : Data(Vec.data()), Length(Vec.size()) { | ||||
|     debugCheckNullptrInvariant(); | ||||
|   } | ||||
|  | ||||
|   template < | ||||
|       typename Container, | ||||
|       typename U = decltype(std::declval<Container>().data()), | ||||
|       typename = std::enable_if_t< | ||||
|           (std::is_same_v<U, T*> || std::is_same_v<U, T const*>)>> | ||||
|   /* implicit */ ArrayRef(const Container& container) | ||||
|       : Data(container.data()), Length(container.size()) { | ||||
|     debugCheckNullptrInvariant(); | ||||
|   } | ||||
|  | ||||
|   /// Construct an ArrayRef from a std::vector. | ||||
|   // The enable_if stuff here makes sure that this isn't used for | ||||
|   // std::vector<bool>, because ArrayRef can't work on a std::vector<bool> | ||||
|   // bitfield. | ||||
|   template <typename A> | ||||
|   /* implicit */ ArrayRef(const std::vector<T, A>& Vec) | ||||
|       : Data(Vec.data()), Length(Vec.size()) { | ||||
|     static_assert( | ||||
|         !std::is_same_v<T, bool>, | ||||
|         "ArrayRef<bool> cannot be constructed from a std::vector<bool> bitfield."); | ||||
|   } | ||||
|  | ||||
|   /// Construct an ArrayRef from a std::array | ||||
|   template <size_t N> | ||||
|   /* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr) | ||||
|       : Data(Arr.data()), Length(N) {} | ||||
|  | ||||
|   /// Construct an ArrayRef from a C array. | ||||
|   template <size_t N> | ||||
|   // NOLINTNEXTLINE(*c-arrays*) | ||||
|   /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} | ||||
|  | ||||
|   /// Construct an ArrayRef from a std::initializer_list. | ||||
|   /* implicit */ constexpr ArrayRef(const std::initializer_list<T>& Vec) | ||||
|       : Data( | ||||
|             std::begin(Vec) == std::end(Vec) ? static_cast<T*>(nullptr) | ||||
|                                              : std::begin(Vec)), | ||||
|         Length(Vec.size()) {} | ||||
|       : HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {} | ||||
|  | ||||
|   /// @} | ||||
|   /// @name Simple Operations | ||||
|   /// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef | ||||
|   /// @{ | ||||
|  | ||||
|   constexpr iterator begin() const { | ||||
|     return Data; | ||||
|   } | ||||
|   constexpr iterator end() const { | ||||
|     return Data + Length; | ||||
|   } | ||||
|  | ||||
|   // These are actually the same as iterator, since ArrayRef only | ||||
|   // gives you const iterators. | ||||
|   constexpr const_iterator cbegin() const { | ||||
|     return Data; | ||||
|   } | ||||
|   constexpr const_iterator cend() const { | ||||
|     return Data + Length; | ||||
|   } | ||||
|  | ||||
|   constexpr reverse_iterator rbegin() const { | ||||
|     return reverse_iterator(end()); | ||||
|   } | ||||
|   constexpr reverse_iterator rend() const { | ||||
|     return reverse_iterator(begin()); | ||||
|   } | ||||
|  | ||||
|   /// Check if all elements in the array satisfy the given expression | ||||
|   constexpr bool allMatch(const std::function<bool(const T&)>& pred) const { | ||||
|     return std::all_of(cbegin(), cend(), pred); | ||||
|   } | ||||
|  | ||||
|   /// empty - Check if the array is empty. | ||||
|   constexpr bool empty() const { | ||||
|     return Length == 0; | ||||
|   } | ||||
|  | ||||
|   constexpr const T* data() const { | ||||
|     return Data; | ||||
|   } | ||||
|  | ||||
|   /// size - Get the array size. | ||||
|   constexpr size_t size() const { | ||||
|     return Length; | ||||
|   } | ||||
|  | ||||
|   /// front - Get the first element. | ||||
|   /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of | ||||
|   /// STD_TORCH_CHECK | ||||
|   constexpr const T& front() const { | ||||
|     TORCH_CHECK( | ||||
|         !empty(), "ArrayRef: attempted to access front() of empty list"); | ||||
|     return Data[0]; | ||||
|         !this->empty(), "ArrayRef: attempted to access front() of empty list"); | ||||
|     return this->Data[0]; | ||||
|   } | ||||
|  | ||||
|   /// back - Get the last element. | ||||
|   /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of | ||||
|   /// STD_TORCH_CHECK | ||||
|   constexpr const T& back() const { | ||||
|     TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); | ||||
|     return Data[Length - 1]; | ||||
|   } | ||||
|  | ||||
|   /// equals - Check for element-wise equality. | ||||
|   constexpr bool equals(ArrayRef RHS) const { | ||||
|     return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); | ||||
|     TORCH_CHECK( | ||||
|         !this->empty(), "ArrayRef: attempted to access back() of empty list"); | ||||
|     return this->Data[this->Length - 1]; | ||||
|   } | ||||
|  | ||||
|   /// slice(n, m) - Take M elements of the array starting at element N | ||||
|   /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of | ||||
|   /// STD_TORCH_CHECK | ||||
|   constexpr ArrayRef<T> slice(size_t N, size_t M) const { | ||||
|     TORCH_CHECK( | ||||
|         N + M <= size(), | ||||
|         N + M <= this->size(), | ||||
|         "ArrayRef: invalid slice, N = ", | ||||
|         N, | ||||
|         "; M = ", | ||||
|         M, | ||||
|         "; size = ", | ||||
|         size()); | ||||
|     return ArrayRef<T>(data() + N, M); | ||||
|         this->size()); | ||||
|     return ArrayRef<T>(this->data() + N, M); | ||||
|   } | ||||
|  | ||||
|   /// slice(n) - Chop off the first N elements of the array. | ||||
|   /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of | ||||
|   /// STD_TORCH_CHECK | ||||
|   constexpr ArrayRef<T> slice(size_t N) const { | ||||
|     TORCH_CHECK( | ||||
|         N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); | ||||
|     return slice(N, size() - N); | ||||
|         N <= this->size(), | ||||
|         "ArrayRef: invalid slice, N = ", | ||||
|         N, | ||||
|         "; size = ", | ||||
|         this->size()); | ||||
|     return slice(N, this->size() - N); // unqualified slice() resolves to ArrayRef::slice(N, M) above | ||||
|   } | ||||
|  | ||||
|   /// @} | ||||
|   /// @name Operator Overloads | ||||
|   /// @{ | ||||
|   constexpr const T& operator[](size_t Index) const { | ||||
|     return Data[Index]; | ||||
|   } | ||||
|  | ||||
|   /// Vector compatibility | ||||
|   /// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of | ||||
|   /// STD_TORCH_CHECK | ||||
|   constexpr const T& at(size_t Index) const { | ||||
|     TORCH_CHECK( | ||||
|         Index < Length, | ||||
|         Index < this->Length, | ||||
|         "ArrayRef: invalid index Index = ", | ||||
|         Index, | ||||
|         "; Length = ", | ||||
|         Length); | ||||
|     return Data[Index]; | ||||
|         this->Length); | ||||
|     return this->Data[Index]; | ||||
|   } | ||||
|  | ||||
|   /// Disallow accidental assignment from a temporary. | ||||
| @ -253,13 +160,6 @@ class ArrayRef final { | ||||
|   std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=( | ||||
|       std::initializer_list<U>) = delete; | ||||
|  | ||||
|   /// @} | ||||
|   /// @name Expensive Operations | ||||
|   /// @{ | ||||
|   std::vector<T> vec() const { | ||||
|     return std::vector<T>(Data, Data + Length); | ||||
|   } | ||||
|  | ||||
|   /// @} | ||||
| }; | ||||
|  | ||||
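The NOTE above relies on apparent-type (static) dispatch: `ArrayRef` shadows `HeaderOnlyArrayRef` methods rather than overriding them virtually, so the compiler selects an implementation from the static type of the object expression and `constexpr` calls pay no vtable cost. A minimal sketch of the pattern, with hypothetical `Base`/`Derived` in place of the real classes:

    #include <iostream>

    struct Base {
      constexpr int check(int i) const { return i; } // header-only variant
    };

    struct Derived : Base {
      // Shadows (does not virtually override) Base::check.
      constexpr int check(int i) const { return i >= 0 ? i : 0; }
    };

    int main() {
      Derived d;
      const Base& b = d;
      std::cout << d.check(-1) << ' '   // 0: static type is Derived
                << b.check(-1) << '\n'; // -1: static type is Base
    }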
|  | ||||
| @ -1,5 +1,7 @@ | ||||
| #include <c10/util/complex.h> | ||||
|  | ||||
| #include <cmath> | ||||
|  | ||||
| // Note [ Complex Square root in libc++] | ||||
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
| // In libc++ complex square root is computed using polar form | ||||
|  | ||||
| @ -132,15 +132,15 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) { | ||||
|   int div_base_log = 0; | ||||
|   switch (flags & std::ios::basefield) { | ||||
|     case std::ios::hex: | ||||
|       div = static_cast<uint64_t>(0x1000000000000000u); // 16^15 | ||||
|       div = (uint64_t)0x1000000000000000u; // 16^15 | ||||
|       div_base_log = 15; | ||||
|       break; | ||||
|     case std::ios::oct: | ||||
|       div = static_cast<uint64_t>(01000000000000000000000u); // 8^21 | ||||
|       div = (uint64_t)01000000000000000000000u; // 8^21 | ||||
|       div_base_log = 21; | ||||
|       break; | ||||
|     default: // std::ios::dec | ||||
|       div = static_cast<uint64_t>(10000000000000000000u); // 10^19 | ||||
|       div = (uint64_t)10000000000000000000u; // 10^19 | ||||
|       div_base_log = 19; | ||||
|       break; | ||||
|   } | ||||
|  | ||||
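The divisors above are the largest powers of each output base that fit in a `uint64_t` (16^15, 8^21, 10^19), so a 128-bit value prints as at most three 64-bit digit groups. A compact sketch of the decimal case, assuming the GCC/Clang `unsigned __int128` extension:

    #include <cstdint>
    #include <iomanip>
    #include <iostream>

    int main() {
      unsigned __int128 v = static_cast<unsigned __int128>(1) << 100;
      const uint64_t div = 10000000000000000000ull; // 10^19
      uint64_t low  = static_cast<uint64_t>(v % div);
      uint64_t mid  = static_cast<uint64_t>((v / div) % div);
      uint64_t high = static_cast<uint64_t>(v / div / div);
      // Inner groups are zero-padded to 19 digits; a real printer would
      // also suppress leading zero groups, which this sketch skips.
      std::cout << high
                << std::setfill('0') << std::setw(19) << mid
                << std::setw(19) << low << '\n';
    }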
| @ -11,6 +11,7 @@ | ||||
| #include <unistd.h> | ||||
|  | ||||
| #include <atomic> | ||||
| #include <chrono> | ||||
| #include <condition_variable> | ||||
| #include <cstdint> | ||||
| #include <cstdio> | ||||
|  | ||||
| @ -554,7 +554,7 @@ static void local_raw_delete(void* ptr); | ||||
|  | ||||
| class XPUAllocator : public DeviceAllocator { | ||||
|  private: | ||||
|   alignas(hardware_destructive_interference_size) std::mutex mutex; | ||||
|   std::mutex mutex; | ||||
|   ska::flat_hash_map<void*, Block*> allocated_blocks; | ||||
|  | ||||
|   void add_allocated_block(Block* block) { | ||||
|  | ||||
| @ -74,7 +74,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) | ||||
|         ) | ||||
|  | ||||
|     code.append("      " + OutType + "* op = &out[rangeIndex * block_size];") | ||||
|     for i in range(uf): | ||||
|     for i in range(0, uf): | ||||
|         j = 8 * i | ||||
|         code.append("      __m256 vop" + str(j) + " = _mm256_setzero_ps();") | ||||
|  | ||||
| @ -158,7 +158,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) | ||||
|         "&input[idx_pref_T0 * fused_block_size];" | ||||
|     ) | ||||
|  | ||||
|     for i in range(uf): | ||||
|     for i in range(0, uf): | ||||
|         j = 8 * i | ||||
|         cachelinesize = 64 | ||||
|         byteoffset = sizeof[InType] * j | ||||
| @ -170,7 +170,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) | ||||
|         code.append("      if (!normalize_by_lengths || length == 0) {") | ||||
|     else: | ||||
|         code.append("      if (!normalize_by_lengths || lengths[rangeIndex] == 0) {") | ||||
|     for i in range(uf): | ||||
|     for i in range(0, uf): | ||||
|         j = 8 * i | ||||
|         code.append("        _mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");") | ||||
|     code.append("      } else {") | ||||
| @ -181,7 +181,7 @@ def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets) | ||||
|         code.append( | ||||
|             "        __m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);" | ||||
|         ) | ||||
|     for i in range(uf): | ||||
|     for i in range(0, uf): | ||||
|         j = 8 * i | ||||
|         code.append( | ||||
|             "        _mm256_storeu_ps(&op[" | ||||
|  | ||||
| @ -16,7 +16,7 @@ find_path(vecLib_INCLUDE_DIR vecLib.h | ||||
|           DOC "vecLib include directory" | ||||
|           PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} | ||||
|                 /System/Library/${__veclib_include_suffix} | ||||
|                 /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ | ||||
|                 /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ | ||||
|                 ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} | ||||
|           NO_DEFAULT_PATH) | ||||
|  | ||||
|  | ||||
| @ -224,12 +224,6 @@ AMD/ROCm/HIP | ||||
| -  Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__) | ||||
| -  (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__) | ||||
|  | ||||
| XPU/Intel GPU | ||||
| ~~~~~~~~~~~~~ | ||||
|  | ||||
| - Eikan Wang (`EikanWang <https://github.com/EikanWang>`__) | ||||
| - Guangye Yu (`guangyey <https://github.com/guangyey>`__) | ||||
|  | ||||
| Build + CI | ||||
| ~~~~~~~~~~ | ||||
|  | ||||
|  | ||||
| @ -207,42 +207,6 @@ templates_path = [ | ||||
| ] | ||||
| # TODO: document these and remove them from here. | ||||
|  | ||||
| # Fixes duplicated autosummary output filenames for names that differ only in case | ||||
| autosummary_filename_map = { | ||||
|     "torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function", | ||||
|     "torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class", | ||||
|     "torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function", | ||||
|     "torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class", | ||||
|     "torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function", | ||||
|     "torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class", | ||||
|     "torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function", | ||||
|     "torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class", | ||||
|     "torch.optim.radam.radam": "torch.optim.radam.radam_function", | ||||
|     "torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class", | ||||
|     "torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function", | ||||
|     "torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class", | ||||
|     "torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function", | ||||
|     "torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class", | ||||
|     "torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function", | ||||
|     "torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class", | ||||
|     "torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function", | ||||
|     "torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class", | ||||
|     "torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function", | ||||
|     "torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class", | ||||
|     "torch.optim.adam.adam": "torch.optim.adam.adam_function", | ||||
|     "torch.optim.adam.Adam": "torch.optim.adam.Adam_class", | ||||
|     "torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function", | ||||
|     "torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class", | ||||
|     "torch.mtia.stream": "torch.mtia.stream_function", | ||||
|     "torch.mtia.Stream": "torch.mtia.Stream_class", | ||||
|     "torch.cpu.stream": "torch.cpu.stream_function", | ||||
|     "torch.cpu.Stream": "torch.cpu.Stream_class", | ||||
|     "torch.cuda.stream": "torch.cuda.stream_function", | ||||
|     "torch.cuda.Stream": "torch.cuda.Stream_class", | ||||
|     "torch.xpu.stream": "torch.xpu.stream_function", | ||||
|     "torch.xpu.Stream": "torch.xpu.Stream_class", | ||||
| } | ||||
|  | ||||
| coverage_ignore_functions = [ | ||||
|     # torch | ||||
|     "typename", | ||||
| @ -3229,11 +3193,6 @@ autodoc_type_aliases = { | ||||
| # Enable overriding of function signatures in the first line of the docstring. | ||||
| autodoc_docstring_signature = True | ||||
|  | ||||
| # Exclude inherited IntEnum methods that have RST formatting issues in their docstrings | ||||
| autodoc_default_options = { | ||||
|     "exclude-members": "from_bytes, to_bytes", | ||||
| } | ||||
|  | ||||
| # -- katex javascript in header | ||||
| # | ||||
| #    def setup(app): | ||||
|  | ||||
| @ -158,11 +158,11 @@ This format captures everything present in the Node class, with the exception of | ||||
|  | ||||
| Concretely: | ||||
|  | ||||
| - **<name>** is the name of the node as it would appear in `node.name`. | ||||
| - **<op_name>** is the `node.op` field, which must be one of these: | ||||
| - **\<name>** is the name of the node as it would appear in `node.name`. | ||||
| - **\<op_name>** is the `node.op` field, which must be one of these: | ||||
|   `<call_function>`, `<placeholder>`, | ||||
|   `<get_attr>`, or `<output>`. | ||||
| - **<target>** is the target of the node as `node.target`. The meaning of this | ||||
| - **\<target>** is the target of the node as `node.target`. The meaning of this | ||||
|   field depends on `op_name`. | ||||
| - **args1, … args 4…** are what is listed in the `node.args` tuple. If a | ||||
|   value in the list is an {class}`torch.fx.Node`, then it will be especially | ||||
|  | ||||
| @ -253,6 +253,7 @@ regular full-precision tensor. | ||||
| .. autosummary:: | ||||
|     :toctree: generated | ||||
|     :nosignatures: | ||||
|     :template: classtemplate.rst | ||||
|  | ||||
|     view | ||||
|     as_strided | ||||
|  | ||||
| @ -159,6 +159,8 @@ ignore = [ | ||||
|     "EXE001", | ||||
|     "F405", | ||||
|     "FURB122", # writelines | ||||
|     # these ignores are from flake8-logging-format; please fix! | ||||
|     "G101", | ||||
|     # these ignores are from ruff NPY; please fix! | ||||
|     "NPY002", | ||||
|     # these ignores are from ruff PERF; please fix! | ||||
| @ -202,10 +204,14 @@ select = [ | ||||
|     "NPY", | ||||
|     "PERF", | ||||
|     "PGH004", | ||||
|     "PIE", | ||||
|     "PIE790", | ||||
|     "PIE794", | ||||
|     "PIE800", | ||||
|     "PIE804", | ||||
|     "PIE807", | ||||
|     "PIE810", | ||||
|     "PLC0131", # type bivariance | ||||
|     "PLC0132", # type param mismatch | ||||
|     "PLC1802", # len({expression}) used as condition without comparison | ||||
|     "PLC0205", # string as __slots__ | ||||
|     "PLC3002", # unnecessary-direct-lambda-call | ||||
|     "PLE", | ||||
| @ -213,7 +219,6 @@ select = [ | ||||
|     "PLR0206", # property with params | ||||
|     "PLR1722", # use sys exit | ||||
|     "PLR1736", # unnecessary list index | ||||
|     "PLW0127", # Self-assignment of variable | ||||
|     "PLW0129", # assert on string literal | ||||
|     "PLW0131", # named expr without context | ||||
|     "PLW0133", # useless exception statement | ||||
|  | ||||
pyrefly.toml
							| @ -5,7 +5,6 @@ python-version = "3.12" | ||||
| project-includes = [ | ||||
|     "torch", | ||||
|     "caffe2", | ||||
|     "tools", | ||||
|     "test/test_bundled_images.py", | ||||
|     "test/test_bundled_inputs.py", | ||||
|     "test/test_complex.py", | ||||
| @ -23,15 +22,10 @@ project-includes = [ | ||||
| project-excludes = [ | ||||
|   # ==== below will be enabled directory by directory ==== | ||||
|   # ==== to test Pyrefly on a specific directory, simply comment it out ==== | ||||
|   "torch/_inductor/runtime", | ||||
|   "torch/_inductor/codegen/triton.py", | ||||
|   "tools/linter/adapters/test_device_bias_linter.py", | ||||
|   "tools/code_analyzer/gen_operators_yaml.py", | ||||
|   "torch/_inductor/runtime/triton_heuristics.py", | ||||
|   "torch/_inductor/runtime/triton_helpers.py", | ||||
|   "torch/_inductor/runtime/halide_helpers.py", | ||||
|   # formatting issues, will turn on after adjusting where suppressions can be | ||||
|   # in import statements | ||||
|   "tools/flight_recorder/components/types.py", | ||||
|   "torch/linalg/__init__.py", | ||||
|   "torch/package/importer.py", | ||||
|   "torch/package/_package_pickler.py", | ||||
| @ -46,6 +40,17 @@ project-excludes = [ | ||||
|   "torch/distributed/elastic/metrics/__init__.py", | ||||
|   "torch/_inductor/fx_passes/bucketing.py", | ||||
|   # ==== | ||||
|   "benchmarks/instruction_counts/main.py", | ||||
|   "benchmarks/instruction_counts/definitions/setup.py", | ||||
|   "benchmarks/instruction_counts/applications/ci.py", | ||||
|   "benchmarks/instruction_counts/core/api.py", | ||||
|   "benchmarks/instruction_counts/core/expand.py", | ||||
|   "benchmarks/instruction_counts/core/types.py", | ||||
|   "benchmarks/instruction_counts/core/utils.py", | ||||
|   "benchmarks/instruction_counts/definitions/standard.py", | ||||
|   "benchmarks/instruction_counts/definitions/setup.py", | ||||
|   "benchmarks/instruction_counts/execution/runner.py", | ||||
|   "benchmarks/instruction_counts/execution/work.py", | ||||
|   "torch/include/**", | ||||
|   "torch/csrc/**", | ||||
|   "torch/distributed/elastic/agent/server/api.py", | ||||
| @ -132,4 +137,3 @@ errors.bad-param-name-override = false | ||||
| errors.implicit-import = false | ||||
| permissive-ignores = true | ||||
| replace-imports-with-any = ["!sympy.printing.*", "sympy.*", "onnxscript.onnx_opset.*"] | ||||
| search-path = ["tools/experimental"] | ||||
|  | ||||
| @ -190,7 +190,7 @@ class TestActivationSparsifier(TestCase): | ||||
|                 if features is None: | ||||
|                     assert torch.all(mask * input_data == output) | ||||
|                 else: | ||||
|                     for feature_idx in range(len(features)): | ||||
|                     for feature_idx in range(0, len(features)): | ||||
|                         feature = torch.Tensor( | ||||
|                             [features[feature_idx]], device=input_data.device | ||||
|                         ).long() | ||||
| @ -378,7 +378,7 @@ class TestActivationSparsifier(TestCase): | ||||
|         # some dummy data | ||||
|         data_list = [] | ||||
|         num_data_points = 5 | ||||
|         for _ in range(num_data_points): | ||||
|         for _ in range(0, num_data_points): | ||||
|             rand_data = torch.randn(16, 1, 28, 28) | ||||
|             activation_sparsifier.model(rand_data) | ||||
|             data_list.append(rand_data) | ||||
|  | ||||
| @ -143,7 +143,7 @@ class TestBaseDataScheduler(TestCase): | ||||
|  | ||||
|         # checking step count | ||||
|         step_cnt = 5 | ||||
|         for _ in range(step_cnt): | ||||
|         for _ in range(0, step_cnt): | ||||
|             sparsifier.step() | ||||
|             scheduler.step() | ||||
|  | ||||
|  | ||||
| @ -123,7 +123,7 @@ class _BaseDataSparsiferTestCase(TestCase): | ||||
|  | ||||
|         step_count = 3 | ||||
|  | ||||
|         for _ in range(step_count): | ||||
|         for _ in range(0, step_count): | ||||
|             sparsifier.step() | ||||
|         for some_data in all_data: | ||||
|             name, data, _ = self._get_name_data_config(some_data) | ||||
|  | ||||
| @ -472,8 +472,8 @@ class TestNearlyDiagonalSparsifier(TestCase): | ||||
|         else: | ||||
|             height, width = mask.shape | ||||
|             dist_to_diagonal = nearliness // 2 | ||||
|             for row in range(height): | ||||
|                 for col in range(width): | ||||
|             for row in range(0, height): | ||||
|                 for col in range(0, width): | ||||
|                     if abs(row - col) <= dist_to_diagonal: | ||||
|                         assert mask[row, col] == 1 | ||||
|                     else: | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.