mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-31 12:15:03 +08:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			bahuang/fi
			...
			async_tp
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 83349ae64d | |||
| bf08b164dc | |||
| da0b6aea11 | 
| @ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} | ||||
|  | ||||
| # Set CUDA architecture lists to match x86 build_cuda.sh | ||||
| if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then | ||||
|     export TORCH_CUDA_ARCH_LIST="8.0;9.0" | ||||
|     export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0" | ||||
| elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then | ||||
|     export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" | ||||
|     export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0" | ||||
| elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then | ||||
|     export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" | ||||
| fi | ||||
| @ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt | ||||
| pip install auditwheel==6.2.0 wheel | ||||
| if [ "$DESIRED_CUDA" = "cpu" ]; then | ||||
|     echo "BASE_CUDA_VERSION is not set. Building cpu wheel." | ||||
|     python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn | ||||
|     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files | ||||
|     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn | ||||
| else | ||||
|     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" | ||||
|     export USE_SYSTEM_NCCL=1 | ||||
| @ -41,9 +42,13 @@ else | ||||
|         echo "Bundling CUDA libraries with wheel for aarch64." | ||||
|     else | ||||
|         echo "Using nvidia libs from pypi for aarch64." | ||||
|         # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64 | ||||
|         # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"' | ||||
|         export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}" | ||||
|         echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" | ||||
|         export USE_NVIDIA_PYPI_LIBS=1 | ||||
|     fi | ||||
|  | ||||
|     python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda | ||||
|     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files | ||||
|     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda | ||||
| fi | ||||
|  | ||||
| @ -138,8 +138,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
|     folder = os.path.dirname(wheel_path) | ||||
|     os.mkdir(f"{folder}/tmp") | ||||
|     os.system(f"unzip {wheel_path} -d {folder}/tmp") | ||||
|     # Delete original wheel since it will be repackaged | ||||
|     os.system(f"rm {wheel_path}") | ||||
|  | ||||
|     # Check if we should use PyPI NVIDIA libraries or bundle system libraries | ||||
|     use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" | ||||
| @ -213,8 +211,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
|         ] | ||||
|  | ||||
|         # CUDA version-specific libraries | ||||
|         if "13" in desired_cuda: | ||||
|             minor_version = desired_cuda[-1] | ||||
|         if "130" in desired_cuda: | ||||
|             version_specific_libs = [ | ||||
|                 "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", | ||||
|                 "/usr/local/cuda/lib64/libcublas.so.13", | ||||
| @ -224,7 +221,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
|                 "/usr/local/cuda/lib64/libcusolver.so.12", | ||||
|                 "/usr/local/cuda/lib64/libnvJitLink.so.13", | ||||
|                 "/usr/local/cuda/lib64/libnvrtc.so.13", | ||||
|                 f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}", | ||||
|                 "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0", | ||||
|             ] | ||||
|         elif "12" in desired_cuda: | ||||
|             # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") | ||||
| @ -240,8 +237,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
|                 "/usr/local/cuda/lib64/libnvrtc.so.12", | ||||
|                 f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", | ||||
|             ] | ||||
|         else: | ||||
|             raise ValueError(f"Unsupported CUDA version: {desired_cuda}.") | ||||
|  | ||||
|         # Combine all libraries | ||||
|         libs_to_copy = common_libs + version_specific_libs | ||||
| @ -280,7 +275,14 @@ def complete_wheel(folder: str) -> str: | ||||
|             f"/{folder}/dist/{repaired_wheel_name}", | ||||
|         ) | ||||
|     else: | ||||
|         repaired_wheel_name = list_dir(f"/{folder}/dist")[0] | ||||
|         repaired_wheel_name = wheel_name.replace( | ||||
|             "linux_aarch64", "manylinux_2_28_aarch64" | ||||
|         ) | ||||
|         print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") | ||||
|         os.rename( | ||||
|             f"/{folder}/dist/{wheel_name}", | ||||
|             f"/{folder}/dist/{repaired_wheel_name}", | ||||
|         ) | ||||
|  | ||||
|     print(f"Copying {repaired_wheel_name} to artifacts") | ||||
|     shutil.copy2( | ||||
| @ -317,7 +319,7 @@ if __name__ == "__main__": | ||||
|     ).decode() | ||||
|  | ||||
|     print("Building PyTorch wheel") | ||||
|     build_vars = "" | ||||
|     build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " | ||||
|     # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) | ||||
|     if enable_cuda: | ||||
|         build_vars += "MAX_JOBS=5 " | ||||
|  | ||||
| @ -214,7 +214,8 @@ case "$tag" in | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     # TODO (huydhn): Upgrade this to Python >= 3.10 | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|  | ||||
| @ -56,13 +56,9 @@ ENV INSTALLED_VISION ${VISION} | ||||
|  | ||||
| # Install rocm | ||||
| ARG ROCM_VERSION | ||||
| RUN mkdir ci_commit_pins | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt | ||||
| COPY ./common/install_rocm.sh install_rocm.sh | ||||
| RUN bash ./install_rocm.sh | ||||
| RUN rm install_rocm.sh common_utils.sh | ||||
| RUN rm -r ci_commit_pins | ||||
| RUN rm install_rocm.sh | ||||
| COPY ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} | ||||
| RUN rm install_rocm_magma.sh | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| e0dda9059d082537cee36be6c5e4fe3b18c880c0 | ||||
| 56392aa978594cc155fa8af48cd949f5b5f1823a | ||||
|  | ||||
| @ -1 +0,0 @@ | ||||
| 7fe50dc3da2069d6645d9deb8c017a876472a977 | ||||
| @ -1 +1 @@ | ||||
| 5ae38bdb0dc066c5823e34dc9797afb9de42c866 | ||||
| fccfc522864cf8bc172abe0cd58ae5581e2d44b9 | ||||
|  | ||||
| @ -42,27 +42,22 @@ install_pip_dependencies() { | ||||
|   # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current | ||||
|   # numba and scipy version used in PyTorch CI | ||||
|   conda_run pip uninstall -y numba scipy | ||||
|   # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) | ||||
|   pip_install yaspin==3.1.0 | ||||
|  | ||||
|   popd | ||||
| } | ||||
|  | ||||
| setup_executorch() { | ||||
|   pushd executorch | ||||
|  | ||||
|   export PYTHON_EXECUTABLE=python | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|  | ||||
|   as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true | ||||
|   popd | ||||
| } | ||||
|  | ||||
| if [ $# -eq 0 ]; then | ||||
|   clone_executorch | ||||
|   install_buck2 | ||||
|   install_conda_dependencies | ||||
|   install_pip_dependencies | ||||
|   pushd executorch | ||||
|   setup_executorch | ||||
|   popd | ||||
| else | ||||
|   "$@" | ||||
| fi | ||||
| clone_executorch | ||||
| install_buck2 | ||||
| install_conda_dependencies | ||||
| install_pip_dependencies | ||||
| setup_executorch | ||||
|  | ||||
| @ -2,11 +2,6 @@ | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # for pip_install function | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" | ||||
|  | ||||
| ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)" | ||||
|  | ||||
| ver() { | ||||
|     printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' '); | ||||
| } | ||||
| @ -118,8 +113,6 @@ EOF | ||||
|         rm -rf HIP clr | ||||
|     fi | ||||
|  | ||||
|     pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" | ||||
|  | ||||
|     # Cleanup | ||||
|     apt-get autoclean && apt-get clean | ||||
|     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* | ||||
| @ -183,8 +176,6 @@ install_centos() { | ||||
|       sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" | ||||
|   done | ||||
|  | ||||
|   pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION" | ||||
|  | ||||
|   # Cleanup | ||||
|   yum clean all | ||||
|   rm -rf /var/cache/yum | ||||
|  | ||||
| @ -52,13 +52,9 @@ ENV INSTALLED_VISION ${VISION} | ||||
|  | ||||
| # Install rocm | ||||
| ARG ROCM_VERSION | ||||
| RUN mkdir ci_commit_pins | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt | ||||
| COPY ./common/install_rocm.sh install_rocm.sh | ||||
| RUN bash ./install_rocm.sh | ||||
| RUN rm install_rocm.sh common_utils.sh | ||||
| RUN rm -r ci_commit_pins | ||||
| RUN rm install_rocm.sh | ||||
| COPY ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} | ||||
| RUN rm install_rocm_magma.sh | ||||
|  | ||||
| @ -7,4 +7,4 @@ set -ex | ||||
|  | ||||
| SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" | ||||
|  | ||||
| USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh | ||||
| USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh | ||||
|  | ||||
| @ -66,11 +66,6 @@ class VllmBuildParameters: | ||||
|         "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" | ||||
|     ) | ||||
|  | ||||
|     # the cleaning script to remove torch dependencies from pip | ||||
|     cleaning_script: Path = env_path_field( | ||||
|         "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py" | ||||
|     ) | ||||
|  | ||||
|     # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts | ||||
|     output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm") | ||||
|  | ||||
| @ -165,7 +160,6 @@ class VllmBuildRunner(BaseRunner): | ||||
|         logger.info("Running vllm build with inputs: %s", inputs) | ||||
|         vllm_commit = clone_vllm() | ||||
|  | ||||
|         self.cp_torch_cleaning_script(inputs) | ||||
|         self.cp_dockerfile_if_exist(inputs) | ||||
|         # cp torch wheels from root direct to vllm workspace if exist | ||||
|         self.cp_torch_whls_if_exist(inputs) | ||||
| @ -211,11 +205,6 @@ class VllmBuildRunner(BaseRunner): | ||||
|         copy(inputs.torch_whls_path, tmp_dir) | ||||
|         return tmp_dir | ||||
|  | ||||
|     def cp_torch_cleaning_script(self, inputs: VllmBuildParameters): | ||||
|         script = get_path(inputs.cleaning_script, resolve=True) | ||||
|         vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py") | ||||
|         copy(script, vllm_script) | ||||
|  | ||||
|     def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters): | ||||
|         if not inputs.use_local_dockerfile: | ||||
|             logger.info("using vllm default dockerfile.torch_nightly for build") | ||||
|  | ||||
| @ -11,7 +11,7 @@ from typing import Any | ||||
|  | ||||
| from cli.lib.common.cli_helper import BaseRunner | ||||
| from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env | ||||
| from cli.lib.common.path_helper import copy, get_path, remove_dir | ||||
| from cli.lib.common.path_helper import copy, remove_dir | ||||
| from cli.lib.common.pip_helper import ( | ||||
|     pip_install_first_match, | ||||
|     pip_install_packages, | ||||
| @ -43,10 +43,6 @@ class VllmTestParameters: | ||||
|  | ||||
|     torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") | ||||
|  | ||||
|     cleaning_script: Path = env_path_field( | ||||
|         "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py" | ||||
|     ) | ||||
|  | ||||
|     def __post_init__(self): | ||||
|         if not self.torch_whls_path.exists(): | ||||
|             raise ValueError("missing torch_whls_path") | ||||
| @ -96,13 +92,11 @@ class VllmTestRunner(BaseRunner): | ||||
|         self._set_envs(params) | ||||
|  | ||||
|         clone_vllm(dst=self.work_directory) | ||||
|         self.cp_torch_cleaning_script(params) | ||||
|         with working_directory(self.work_directory): | ||||
|             remove_dir(Path("vllm")) | ||||
|             self._install_wheels(params) | ||||
|             self._install_dependencies() | ||||
|         # verify the torches are not overridden by test dependencies | ||||
|  | ||||
|         check_versions() | ||||
|  | ||||
|     def run(self): | ||||
| @ -131,11 +125,6 @@ class VllmTestRunner(BaseRunner): | ||||
|             # double check the torches are not overridden by other packages | ||||
|             check_versions() | ||||
|  | ||||
|     def cp_torch_cleaning_script(self, params: VllmTestParameters): | ||||
|         script = get_path(params.cleaning_script, resolve=True) | ||||
|         vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py") | ||||
|         copy(script, vllm_script) | ||||
|  | ||||
|     def _install_wheels(self, params: VllmTestParameters): | ||||
|         logger.info("Running vllm test with inputs: %s", params) | ||||
|         if not pkg_exists("torch"): | ||||
|  | ||||
| @ -258,19 +258,11 @@ function install_torchrec_and_fbgemm() { | ||||
|       git clone --recursive https://github.com/pytorch/fbgemm | ||||
|       pushd fbgemm/fbgemm_gpu | ||||
|       git checkout "${fbgemm_commit}" --recurse-submodules | ||||
|       # until the fbgemm_commit includes the tbb patch | ||||
|       patch <<'EOF' | ||||
| --- a/FbgemmGpu.cmake | ||||
| +++ b/FbgemmGpu.cmake | ||||
| @@ -184,5 +184,6 @@ gpu_cpp_library( | ||||
|      fbgemm_gpu_tbe_cache | ||||
|      fbgemm_gpu_tbe_optimizers | ||||
|      fbgemm_gpu_tbe_utils | ||||
| +    tbb | ||||
|    DESTINATION | ||||
|      fbgemm_gpu) | ||||
| EOF | ||||
|       python setup.py bdist_wheel --build-variant=rocm | ||||
|       python setup.py bdist_wheel \ | ||||
|         --build-variant=rocm \ | ||||
|         -DHIP_ROOT_DIR="${ROCM_PATH}" \ | ||||
|         -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ | ||||
|         -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" | ||||
|       popd | ||||
|  | ||||
|       # Save the wheel before cleaning up | ||||
|  | ||||
							
								
								
									
										40
									
								
								.ci/pytorch/functorch_doc_push_script.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										40
									
								
								.ci/pytorch/functorch_doc_push_script.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,40 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This is where the local pytorch install in the docker image is located | ||||
| pt_checkout="/var/lib/jenkins/workspace" | ||||
| source "$pt_checkout/.ci/pytorch/common_utils.sh" | ||||
| echo "functorch_doc_push_script.sh: Invoked with $*" | ||||
|  | ||||
| set -ex -o pipefail | ||||
|  | ||||
| version=${DOCS_VERSION:-nightly} | ||||
| echo "version: $version" | ||||
|  | ||||
| # Build functorch docs | ||||
| pushd $pt_checkout/functorch/docs | ||||
| make html | ||||
| popd | ||||
|  | ||||
| git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages | ||||
| pushd functorch_ghpages | ||||
|  | ||||
| if [ "$version" == "main" ]; then | ||||
|   version=nightly | ||||
| fi | ||||
|  | ||||
| git rm -rf "$version" || true | ||||
| mv "$pt_checkout/functorch/docs/build/html" "$version" | ||||
|  | ||||
| git add "$version" || true | ||||
| git status | ||||
| git config user.email "soumith+bot@pytorch.org" | ||||
| git config user.name "pytorchbot" | ||||
| # If there aren't changes, don't make a commit; push is no-op | ||||
| git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true | ||||
| git status | ||||
|  | ||||
| if [[ "${WITH_PUSH:-}" == true ]]; then | ||||
|   git push -u origin gh-pages | ||||
| fi | ||||
|  | ||||
| popd | ||||
| @ -1,25 +0,0 @@ | ||||
| From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001 | ||||
| From: Michael Wang <13521008+isVoid@users.noreply.github.com> | ||||
| Date: Tue, 1 Apr 2025 17:28:05 -0700 | ||||
| Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage | ||||
|  (#185) | ||||
|  | ||||
| Co-authored-by: isVoid <isVoid@users.noreply.github.com> | ||||
| --- | ||||
|  numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++ | ||||
|  1 file changed, 3 insertions(+) | ||||
|  | ||||
| diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py | ||||
| index 1641bf77..233e9ed7 100644 | ||||
| --- a/numba_cuda/numba/cuda/cudadrv/driver.py | ||||
| +++ b/numba_cuda/numba/cuda/cudadrv/driver.py | ||||
| @@ -365,6 +365,9 @@ def _find_api(self, fname): | ||||
|          else: | ||||
|              variants = ('_v2', '') | ||||
|   | ||||
| +        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): | ||||
| +            return getattr(self.lib, fname) | ||||
| + | ||||
|          for variant in variants: | ||||
|              try: | ||||
|                  return getattr(self.lib, f'{fname}{variant}') | ||||
| @ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None: | ||||
|  | ||||
|  | ||||
| def smoke_test_nvshmem() -> None: | ||||
|     if not torch.cuda.is_available() or target_os == "windows": | ||||
|         print("Windows platform or CUDA is not available, skipping NVSHMEM test") | ||||
|     if not torch.cuda.is_available(): | ||||
|         print("CUDA is not available, skipping NVSHMEM test") | ||||
|         return | ||||
|  | ||||
|     # Check if NVSHMEM is compiled in current build | ||||
| @ -396,9 +396,7 @@ def smoke_test_nvshmem() -> None: | ||||
|     except ImportError: | ||||
|         # Not built with NVSHMEM support. | ||||
|         # torch is not compiled with NVSHMEM prior to 2.9 | ||||
|         from torch.torch_version import TorchVersion | ||||
|  | ||||
|         if TorchVersion(torch.__version__) < (2, 9): | ||||
|         if torch.__version__ < "2.9": | ||||
|             return | ||||
|         else: | ||||
|             # After 2.9: NVSHMEM is expected to be compiled in current build | ||||
|  | ||||
| @ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v | ||||
|   git config --global --add safe.directory /var/lib/jenkins/workspace | ||||
| fi | ||||
|  | ||||
|  | ||||
| # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 | ||||
| NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) | ||||
| if [ -n "$NUMBA_CUDA_DIR" ]; then | ||||
|   NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" | ||||
|   pushd "$NUMBA_CUDA_DIR" | ||||
|   patch -p4 <"$NUMBA_PATCH" | ||||
|   popd | ||||
| fi | ||||
|  | ||||
| echo "Environment variables:" | ||||
| env | ||||
|  | ||||
| @ -1550,10 +1540,14 @@ test_executorch() { | ||||
|   install_torchvision | ||||
|   install_torchaudio | ||||
|  | ||||
|   INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" | ||||
|  | ||||
|   pushd /executorch | ||||
|   "${INSTALL_SCRIPT}" setup_executorch | ||||
|  | ||||
|   export PYTHON_EXECUTABLE=python | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|  | ||||
|   # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch | ||||
|   # from the PR | ||||
|   bash .ci/scripts/setup-linux.sh --build-tool cmake | ||||
|  | ||||
|   echo "Run ExecuTorch unit tests" | ||||
|   pytest -v -n auto | ||||
| @ -1567,6 +1561,10 @@ test_executorch() { | ||||
|  | ||||
|   popd | ||||
|  | ||||
|   # Test torchgen generated code for Executorch. | ||||
|   echo "Testing ExecuTorch op registration" | ||||
|   "$BUILD_BIN_DIR"/test_edge_op_registration | ||||
|  | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| @ -1574,7 +1572,6 @@ test_linux_aarch64() { | ||||
|   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ | ||||
|         test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ | ||||
|         test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ | ||||
|         distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ | ||||
|         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
|  | ||||
|   # Dynamo tests | ||||
| @ -1724,6 +1721,11 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then | ||||
| elif [[ "${TEST_CONFIG}" == *inductor* ]]; then | ||||
|   install_torchvision | ||||
|   test_inductor_shard "${SHARD_NUMBER}" | ||||
|   if [[ "${SHARD_NUMBER}" == 1 ]]; then | ||||
|     if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then | ||||
|       test_inductor_distributed | ||||
|     fi | ||||
|   fi | ||||
| elif [[ "${TEST_CONFIG}" == *einops* ]]; then | ||||
|   test_einops | ||||
| elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then | ||||
|  | ||||
| @ -1,9 +1,9 @@ | ||||
| set WIN_DRIVER_VN=580.88 | ||||
| set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore | ||||
| curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe | ||||
| set WIN_DRIVER_VN=528.89 | ||||
| set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore | ||||
| curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe | ||||
| if errorlevel 1 exit /b 1 | ||||
|  | ||||
| start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot | ||||
| start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot | ||||
| if errorlevel 1 exit /b 1 | ||||
|  | ||||
| del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL | ||||
| del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL | ||||
|  | ||||
| @ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true | ||||
| # Create an isolated directory to store this builds pytorch checkout and conda | ||||
| # installation | ||||
| if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then | ||||
|     MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)" | ||||
|     MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)" | ||||
| fi | ||||
| mkdir -p "$MAC_PACKAGE_WORK_DIR" || true | ||||
| if [[ -n ${GITHUB_ACTIONS} ]]; then | ||||
| @ -96,11 +96,11 @@ fi | ||||
| whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist" | ||||
| mkdir -p "$whl_tmp_dir" | ||||
|  | ||||
| mac_version='macosx-11_0-arm64' | ||||
| mac_version='macosx_11_0_arm64' | ||||
| libtorch_arch='arm64' | ||||
|  | ||||
| # Create a consistent wheel package name to rename the wheel to | ||||
| wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl" | ||||
| wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl" | ||||
|  | ||||
| ########################################################### | ||||
|  | ||||
| @ -125,6 +125,7 @@ popd | ||||
| export TH_BINARY_BUILD=1 | ||||
| export INSTALL_TEST=0 # dont install test binaries into site-packages | ||||
| export MACOSX_DEPLOYMENT_TARGET=11.0 | ||||
| export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} | ||||
|  | ||||
| EXTRA_CONDA_INSTALL_FLAGS="" | ||||
| CONDA_ENV_CREATE_FLAGS="" | ||||
| @ -132,19 +133,25 @@ RENAME_WHEEL=true | ||||
| case $desired_python in | ||||
|     3.14t) | ||||
|         echo "Using 3.14 deps" | ||||
|         mac_version='macosx-11.0-arm64' | ||||
|         NUMPY_PINNED_VERSION="==2.1.0" | ||||
|         CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|         EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|         desired_python="3.14.0rc1" | ||||
|         RENAME_WHEEL=false | ||||
|         ;; | ||||
|     3.14) | ||||
|         echo "Using 3.14t deps" | ||||
|         mac_version='macosx-11.0-arm64' | ||||
|         NUMPY_PINNED_VERSION="==2.1.0" | ||||
|         EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|         desired_python="3.14.0rc1" | ||||
|         RENAME_WHEEL=false | ||||
|         ;; | ||||
|     3.13t) | ||||
|         echo "Using 3.13 deps" | ||||
|         NUMPY_PINNED_VERSION="==2.1.0" | ||||
|         CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|         EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|         desired_python="3.13" | ||||
|         RENAME_WHEEL=false | ||||
|         ;; | ||||
|     3.13) | ||||
| @ -169,12 +176,17 @@ case $desired_python in | ||||
|         ;; | ||||
| esac | ||||
|  | ||||
| # Install into a fresh env | ||||
| tmp_env_name="wheel_py$python_nodot" | ||||
| conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} | ||||
| source activate "$tmp_env_name" | ||||
|  | ||||
| PINNED_PACKAGES=( | ||||
|     "numpy${NUMPY_PINNED_VERSION}" | ||||
| ) | ||||
| python -mvenv ~/${desired_python}-build | ||||
| source ~/${desired_python}-build/bin/activate | ||||
| retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" | ||||
| retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" | ||||
| pip install requests ninja typing-extensions | ||||
| retry pip install -r "${pytorch_rootdir}/requirements.txt" || true | ||||
| retry brew install libomp | ||||
|  | ||||
| # For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which | ||||
| @ -188,7 +200,7 @@ export BUILD_TEST=OFF | ||||
| pushd "$pytorch_rootdir" | ||||
| echo "Calling setup.py bdist_wheel at $(date)" | ||||
|  | ||||
| _PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}" | ||||
| python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} | ||||
|  | ||||
| echo "Finished setup.py bdist_wheel at $(date)" | ||||
|  | ||||
|  | ||||
							
								
								
									
										2
									
								
								.flake8
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								.flake8
									
									
									
									
									
								
							| @ -73,7 +73,7 @@ exclude = | ||||
|     ./docs/src, | ||||
|     ./functorch/docs, | ||||
|     ./functorch/examples, | ||||
|     ./functorch/docs/source/tutorials, | ||||
|     ./functorch/notebooks, | ||||
|     ./scripts, | ||||
|     ./test/generated_type_hints_smoketest.py, | ||||
|     ./third_party, | ||||
|  | ||||
							
								
								
									
										1
									
								
								.github/actionlint.yaml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.github/actionlint.yaml
									
									
									
									
										vendored
									
									
								
							| @ -21,7 +21,6 @@ self-hosted-runner: | ||||
|     - linux.arm64.2xlarge.ephemeral | ||||
|     - linux.arm64.m7g.4xlarge | ||||
|     - linux.arm64.m7g.4xlarge.ephemeral | ||||
|     - linux.arm64.r7g.12xlarge.memory | ||||
|     - linux.4xlarge.nvidia.gpu | ||||
|     - linux.8xlarge.nvidia.gpu | ||||
|     - linux.16xlarge.nvidia.gpu | ||||
|  | ||||
| @ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None: | ||||
|         change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") | ||||
|  | ||||
|         for file in Path(f"artifacts/dist/{old_stem}").glob( | ||||
|             "*.dist-info/*", | ||||
|             "*.dist-info/**", | ||||
|         ): | ||||
|             change_content_to_new_version(file) | ||||
|  | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/ci_commit_pins/audio.txt
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ci_commit_pins/audio.txt
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | ||||
| 87ff22e49ed0e92576c4935ccb8c143daac4a3cd | ||||
| 27fc2493d383354a008106f22f3be232badee9a1 | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/ci_commit_pins/fbgemm_rocm.txt
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ci_commit_pins/fbgemm_rocm.txt
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | ||||
| 08ae0af1395c8d8471f4025deb6af9aef90b342f | ||||
| 7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8 | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/ci_commit_pins/vllm.txt
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ci_commit_pins/vllm.txt
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | ||||
| 5963b98b465007e3cfb0d39447e4459a8afa96dc | ||||
| e10fef08838612b4560e9c72e5cb1414a5edfa13 | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/ci_commit_pins/xla.txt
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ci_commit_pins/xla.txt
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | ||||
| c77852e117bdf056c8e9a087e51d6f65cf6ba53d | ||||
| 6c5478ff7c3d50dd1e3047d72ec5909bea474073 | ||||
|  | ||||
							
								
								
									
										29
									
								
								.github/ci_configs/vllm/Dockerfile.tmp_vllm
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										29
									
								
								.github/ci_configs/vllm/Dockerfile.tmp_vllm
									
									
									
									
										vendored
									
									
								
							| @ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \ | ||||
|         apt-get update -y \ | ||||
|         && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ | ||||
|     else \ | ||||
|         dnf install -y git curl wget sudo; \ | ||||
|         dnf install -y git curl wget sudo vim; \ | ||||
|     fi \ | ||||
|     && python3 --version && python3 -m pip --version | ||||
|  | ||||
| # Workaround for https://github.com/openai/triton/issues/2507 and | ||||
| # https://github.com/pytorch/pytorch/issues/107960 -- hopefully | ||||
| # this won't be needed for future versions of this docker image | ||||
| # or future versions of triton. | ||||
| RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ | ||||
|  | ||||
| # Install uv for faster pip installs if not existed | ||||
| RUN --mount=type=cache,target=/root/.cache/uv \ | ||||
|     if ! python3 -m uv --version >/dev/null 2>&1; then \ | ||||
| @ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 | ||||
| RUN --mount=type=cache,target=/root/.cache/uv \ | ||||
|     --mount=type=bind,source=.git,target=.git \ | ||||
|     if [ "$USE_SCCACHE" = "1" ]; then \ | ||||
|         echo "Installing sccache..."; \ | ||||
|         if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ | ||||
|             SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \ | ||||
|         else \ | ||||
|             SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \ | ||||
|         fi; \ | ||||
|         curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \ | ||||
|         echo "Installing sccache..." \ | ||||
|         && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ | ||||
|         && tar -xzf sccache.tar.gz \ | ||||
|         && sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \ | ||||
|         && rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \ | ||||
|         && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ | ||||
|         && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ | ||||
|         && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ | ||||
|         && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ | ||||
|         && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ | ||||
| @ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \ | ||||
|         && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ | ||||
|         && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ | ||||
|     else \ | ||||
|         dnf install -y git curl wget sudo; \ | ||||
|         dnf install -y git curl wget sudo vim; \ | ||||
|     fi \ | ||||
|     && python3 --version && python3 -m pip --version | ||||
|  | ||||
| @ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \ | ||||
|     echo "[INFO] Showing torch_build_versions.txt content:" && \ | ||||
|     cat torch_build_versions.txt | ||||
|  | ||||
| # Workaround for https://github.com/openai/triton/issues/2507 and | ||||
| # https://github.com/pytorch/pytorch/issues/107960 -- hopefully | ||||
| # this won't be needed for future versions of this docker image | ||||
| # or future versions of triton. | ||||
| RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ | ||||
|  | ||||
| # Install uv for faster pip installs if not existed | ||||
| RUN --mount=type=cache,target=/root/.cache/uv \ | ||||
|     if ! python3 -m uv --version > /dev/null 2>&1; then \ | ||||
|  | ||||
							
								
								
									
										17
									
								
								.github/ci_configs/vllm/use_existing_torch.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										17
									
								
								.github/ci_configs/vllm/use_existing_torch.py
									
									
									
									
										vendored
									
									
								
							| @ -1,17 +0,0 @@ | ||||
| import glob | ||||
|  | ||||
|  | ||||
| requires_files = glob.glob("requirements/*.txt") | ||||
| requires_files += ["pyproject.toml"] | ||||
| for file in requires_files: | ||||
|     print(f">>> cleaning {file}") | ||||
|     with open(file) as f: | ||||
|         lines = f.readlines() | ||||
|     if "torch" in "".join(lines).lower(): | ||||
|         print("removed:") | ||||
|         with open(file, "w") as f: | ||||
|             for line in lines: | ||||
|                 if "torch" not in line.lower(): | ||||
|                     f.write(line) | ||||
|     print(f"<<< done cleaning {file}") | ||||
|     print() | ||||
| @ -15,7 +15,7 @@ optree==0.13.0 | ||||
| packaging==23.1 | ||||
| parameterized==0.8.1 | ||||
| pillow==10.3.0 | ||||
| protobuf==5.29.5 | ||||
| protobuf==5.29.4 | ||||
| psutil==5.9.8 | ||||
| pygments==2.15.0 | ||||
| pytest-cpp==2.3.0 | ||||
| @ -26,7 +26,7 @@ pytest-xdist==3.3.1 | ||||
| pytest==7.3.2 | ||||
| pyyaml==6.0.2 | ||||
| scipy==1.12.0 | ||||
| setuptools==78.1.1 | ||||
| setuptools==72.1.0 | ||||
| sympy==1.13.3 | ||||
| tlparse==0.4.0 | ||||
| tensorboard==2.13.0 | ||||
|  | ||||
							
								
								
									
										4
									
								
								.github/scripts/docathon-label-sync.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/scripts/docathon-label-sync.py
									
									
									
									
										vendored
									
									
								
							| @ -39,9 +39,7 @@ def main() -> None: | ||||
|     pull_request_label_names = [label.name for label in pull_request_labels] | ||||
|     issue_label_names = [label.name for label in issue_labels] | ||||
|     labels_to_add = [ | ||||
|         label | ||||
|         for label in issue_label_names | ||||
|         if label not in pull_request_label_names and label != "actionable" | ||||
|         label for label in issue_label_names if label not in pull_request_label_names | ||||
|     ] | ||||
|     if not labels_to_add: | ||||
|         print("The pull request already has the same labels.") | ||||
|  | ||||
							
								
								
									
										90
									
								
								.github/scripts/generate_binary_build_matrix.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										90
									
								
								.github/scripts/generate_binary_build_matrix.py
									
									
									
									
										vendored
									
									
								
							| @ -43,55 +43,55 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] | ||||
|  | ||||
| PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | ||||
|     "12.6": ( | ||||
|         "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " | ||||
|         "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | " | ||||
|         "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " | ||||
|         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " | ||||
|         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " | ||||
|         "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'" | ||||
|         "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" | ||||
|     ), | ||||
|     "12.8": ( | ||||
|         "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | " | ||||
|         "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | " | ||||
|         "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " | ||||
|         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " | ||||
|         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " | ||||
|         "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" | ||||
|         "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" | ||||
|     ), | ||||
|     "13.0": ( | ||||
|         "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " | ||||
|         "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " | ||||
|         "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " | ||||
|         "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " | ||||
|         "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " | ||||
|         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " | ||||
|         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " | ||||
|         "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " | ||||
|         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " | ||||
|         "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " | ||||
|         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " | ||||
|         "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" | ||||
|         "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" | ||||
|     ), | ||||
|     "xpu": ( | ||||
|         "intel-cmplr-lib-rt==2025.2.1 | " | ||||
|  | ||||
							
								
								
									
										94
									
								
								.github/scripts/prepare_vllm_wheels.sh
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										94
									
								
								.github/scripts/prepare_vllm_wheels.sh
									
									
									
									
										vendored
									
									
								
							| @ -1,94 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -eux | ||||
|  | ||||
| torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) | ||||
| nightly=$(echo ${torch_version} | cut -d'.' -f4) | ||||
|  | ||||
| # Copied from .ci/manywheel/build_common.sh | ||||
| make_wheel_record() { | ||||
|   fpath=$1 | ||||
|   if echo $fpath | grep RECORD >/dev/null 2>&1; then | ||||
|     echo "$fpath,," | ||||
|   else | ||||
|     fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g') | ||||
|     fsize=$(ls -nl $fpath | awk '{print $5}') | ||||
|     echo "$fpath,sha256=$fhash,$fsize" | ||||
|   fi | ||||
| } | ||||
|  | ||||
| change_wheel_version() { | ||||
|   local package=$1 | ||||
|   local wheel=$2 | ||||
|   local f_version=$3 | ||||
|   local t_version=$4 | ||||
|  | ||||
|   # Extract the wheel | ||||
|   ${PYTHON_EXECUTABLE} -mwheel unpack $wheel | ||||
|  | ||||
|   mv "${package}-${f_version}" "${package}-${t_version}" | ||||
|   # Change the version from f_version to t_version in the dist-info dir | ||||
|   pushd "${package}-${t_version}" | ||||
|   mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info" | ||||
|  | ||||
|   pushd "${package}-${t_version}.dist-info" | ||||
|   sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD | ||||
|  | ||||
|   # Update the version in METADATA and its SHA256 hash | ||||
|   sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA | ||||
|   # then add PyTorch nightly dependency of vLLM | ||||
|   if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then | ||||
|     sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA | ||||
|   fi | ||||
|   sed -i '/METADATA,sha256/d' RECORD | ||||
|   popd | ||||
|  | ||||
|   make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD" | ||||
|   popd | ||||
|  | ||||
|   # Repack the wheel | ||||
|   ${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}" | ||||
|  | ||||
|   # Clean up | ||||
|   rm -rf "${package}-${t_version}" | ||||
| } | ||||
|  | ||||
| repackage_wheel() { | ||||
|   local package=$1 | ||||
|   pushd $package | ||||
|  | ||||
|   local orig_wheel=$(find . -name *${package//-/_}*) | ||||
|   local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) | ||||
|  | ||||
|   local version="" | ||||
|   if [[ "${package}" == vllm ]]; then | ||||
|     # Copied from vllm/.buildkite/scripts/upload-wheels.sh | ||||
|     version=1.0.0 | ||||
|   else | ||||
|     version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3) | ||||
|   fi | ||||
|   local nightly_version=$version.$nightly | ||||
|  | ||||
|   # Use nightly version | ||||
|   change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version | ||||
|   # Clean up | ||||
|   rm "${orig_wheel}" | ||||
|  | ||||
|   auditwheel repair --plat $PLATFORM *.whl \ | ||||
|     --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* | ||||
|   local repair_wheel=$(find wheelhouse -name *${PLATFORM}*) | ||||
|   local repair_wheel=$(basename ${repair_wheel}) | ||||
|   popd | ||||
|  | ||||
|   cp ${package}/wheelhouse/${repair_wheel} . | ||||
|   rm -rf $package | ||||
| } | ||||
|  | ||||
| # Require to re-package the wheel | ||||
| ${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1 | ||||
|  | ||||
| pushd externals/vllm/wheels | ||||
| for package in xformers flashinfer-python vllm; do | ||||
|   repackage_wheel $package | ||||
| done | ||||
| popd | ||||
| @ -22,16 +22,6 @@ name: !{{ build_environment }} | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
| {%- endmacro %} | ||||
|  | ||||
| {%- macro setup_python(py_ver) -%} | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}" | ||||
|           freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} | ||||
| {%- endmacro %} | ||||
|  | ||||
| on: | ||||
| # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 | ||||
|   push: | ||||
| @ -71,13 +61,23 @@ jobs: | ||||
|     {%- endif %} | ||||
|     steps: | ||||
|       !{{ set_runner_specific_vars() }} | ||||
|       !{{ setup_python(config.get("python_version", "3.10")) }} | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -94,6 +94,8 @@ jobs: | ||||
| {%- if config["package_type"] == "wheel" %} | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -104,9 +106,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|  | ||||
							
								
								
									
										14
									
								
								.github/workflows/_docs.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										14
									
								
								.github/workflows/_docs.yml
									
									
									
									
										vendored
									
									
								
							| @ -75,6 +75,10 @@ jobs: | ||||
|             runner: ${{ inputs.runner_prefix }}linux.2xlarge | ||||
|             # It takes less than 30m to finish python docs unless there are issues | ||||
|             timeout-minutes: 30 | ||||
|           - docs_type: functorch | ||||
|             runner: ${{ inputs.runner_prefix }}linux.2xlarge | ||||
|             # It takes less than 15m to finish functorch docs unless there are issues | ||||
|             timeout-minutes: 15 | ||||
|     # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180) | ||||
|     # The current name requires updating the database last docs push query from test-infra every time the matrix is updated | ||||
|     name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} | ||||
| @ -207,6 +211,16 @@ jobs: | ||||
|           path: cppdocs/ | ||||
|           s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs | ||||
|  | ||||
|       - name: Upload functorch Docs Preview | ||||
|         uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 | ||||
|         if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} | ||||
|         with: | ||||
|           retention-days: 14 | ||||
|           s3-bucket: doc-previews | ||||
|           if-no-files-found: error | ||||
|           path: functorch_ghpages/nightly/ | ||||
|           s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         if: always() | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/_linux-test.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/_linux-test.yml
									
									
									
									
										vendored
									
									
								
							| @ -169,7 +169,7 @@ jobs: | ||||
|         id: install-nvidia-driver | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         with: | ||||
|           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} | ||||
|           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} | ||||
|  | ||||
|       - name: Setup GPU_FLAG for docker run | ||||
|  | ||||
							
								
								
									
										33
									
								
								.github/workflows/_rocm-test.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										33
									
								
								.github/workflows/_rocm-test.yml
									
									
									
									
										vendored
									
									
								
							| @ -62,11 +62,6 @@ on: | ||||
|         required: false | ||||
|         type: number | ||||
|         default: 1 | ||||
|     secrets: | ||||
|       HUGGING_FACE_HUB_TOKEN: | ||||
|         required: false | ||||
|         description: | | ||||
|           HF Auth token to avoid rate limits when downloading models or datasets from hub | ||||
| env: | ||||
|   GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} | ||||
|  | ||||
| @ -81,9 +76,10 @@ jobs: | ||||
|     strategy: | ||||
|       matrix: ${{ fromJSON(inputs.test-matrix) }} | ||||
|       fail-fast: false | ||||
|     runs-on: ${{ matrix.runner }} | ||||
|     timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} | ||||
|     runs-on: ${{ matrix.runner }} | ||||
|     steps: | ||||
|       # [see note: pytorch repo ref] | ||||
|       - name: Checkout PyTorch | ||||
|         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main | ||||
|         with: | ||||
| @ -135,9 +131,6 @@ jobs: | ||||
|  | ||||
|       - name: Start monitoring script | ||||
|         id: monitor-script | ||||
|         if: ${{ !inputs.disable-monitor }} | ||||
|         shell: bash | ||||
|         continue-on-error: true | ||||
|         env: | ||||
|           JOB_ID: ${{ steps.get-job-id.outputs.job-id }} | ||||
|           JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} | ||||
| @ -145,6 +138,9 @@ jobs: | ||||
|           WORKFLOW_RUN_ID: ${{github.run_id}} | ||||
|           MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} | ||||
|           MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} | ||||
|         if: ${{ !inputs.disable-monitor }} | ||||
|         shell: bash | ||||
|         continue-on-error: true | ||||
|         run: | | ||||
|           python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7 | ||||
|           python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & | ||||
| @ -182,12 +178,6 @@ jobs: | ||||
|         run: | | ||||
|           echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" | ||||
|  | ||||
|       - name: Preserve github env variables for use in docker | ||||
|         shell: bash | ||||
|         run: | | ||||
|           env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" | ||||
|           env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" | ||||
|  | ||||
|       - name: Test | ||||
|         id: test | ||||
|         env: | ||||
| @ -203,22 +193,20 @@ jobs: | ||||
|           JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} | ||||
|           BRANCH: ${{ steps.parse-ref.outputs.branch }} | ||||
|           SHA1: ${{ github.event.pull_request.head.sha || github.sha }} | ||||
|           BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} | ||||
|           TEST_CONFIG: ${{ matrix.config }} | ||||
|           SHARD_NUMBER: ${{ matrix.shard }} | ||||
|           NUM_TEST_SHARDS: ${{ matrix.num_shards }} | ||||
|           REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} | ||||
|           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} | ||||
|           VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} | ||||
|           TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} | ||||
|           NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} | ||||
|           NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} | ||||
|           TEST_CONFIG: ${{ matrix.config }} | ||||
|           SHARD_NUMBER: ${{ matrix.shard }} | ||||
|           NUM_TEST_SHARDS: ${{ matrix.num_shards }} | ||||
|           REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} | ||||
|           DOCKER_IMAGE: ${{ inputs.docker-image }} | ||||
|           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} | ||||
|           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} | ||||
|           TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }} | ||||
|           DASHBOARD_TAG: ${{ inputs.dashboard-tag }} | ||||
|           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | ||||
|         timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} | ||||
|         run: | | ||||
|           set -x | ||||
| @ -248,7 +236,6 @@ jobs: | ||||
|             -e GITHUB_RUN_ATTEMPT \ | ||||
|             -e JOB_ID \ | ||||
|             -e JOB_NAME \ | ||||
|             -e BASE_SHA \ | ||||
|             -e BRANCH \ | ||||
|             -e SHA1 \ | ||||
|             -e AWS_DEFAULT_REGION \ | ||||
| @ -266,12 +253,10 @@ jobs: | ||||
|             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ | ||||
|             -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ | ||||
|             -e TESTS_TO_INCLUDE \ | ||||
|             -e HUGGING_FACE_HUB_TOKEN \ | ||||
|             -e DASHBOARD_TAG \ | ||||
|             --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \ | ||||
|             --ulimit stack=10485760:83886080 \ | ||||
|             --ulimit core=0 \ | ||||
|             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ | ||||
|             --security-opt seccomp=unconfined \ | ||||
|             --cap-add=SYS_PTRACE \ | ||||
|             --shm-size="8g" \ | ||||
|  | ||||
							
								
								
									
										106
									
								
								.github/workflows/build-vllm-wheel.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										106
									
								
								.github/workflows/build-vllm-wheel.yml
									
									
									
									
										vendored
									
									
								
							| @ -12,9 +12,6 @@ on: | ||||
|     paths: | ||||
|       - .github/workflows/build-vllm-wheel.yml | ||||
|       - .github/ci_commit_pins/vllm.txt | ||||
|   schedule: | ||||
|     # every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST | ||||
|     - cron: 30 13 * * * | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} | ||||
| @ -27,33 +24,21 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         python-version: [ '3.12' ] | ||||
|         # TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved | ||||
|         platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] | ||||
|         # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 | ||||
|         device: [ 'cu128', 'cu129' ] | ||||
|         runner: [ 'linux.12xlarge.memory' ] | ||||
|         include: | ||||
|           - platform: manylinux_2_28_x86_64 | ||||
|             device: cu128 | ||||
|           - device: cu128 | ||||
|             manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' | ||||
|             runner: linux.12xlarge.memory | ||||
|           - platform: manylinux_2_28_x86_64 | ||||
|             device: cu129 | ||||
|           - device: cu129 | ||||
|             manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' | ||||
|             runner: linux.12xlarge.memory | ||||
|           - platform: manylinux_2_28_aarch64 | ||||
|             device: cu128 | ||||
|             manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' | ||||
|             runner: linux.arm64.r7g.12xlarge.memory | ||||
|           - platform: manylinux_2_28_aarch64 | ||||
|             device: cu129 | ||||
|             manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9' | ||||
|             runner: linux.arm64.r7g.12xlarge.memory | ||||
|     name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" | ||||
|     name: "Build ${{ matrix.device }} vLLM wheel" | ||||
|     runs-on: ${{ matrix.runner }} | ||||
|     timeout-minutes: 480 | ||||
|     env: | ||||
|       PY_VERS: ${{ matrix.python-version }} | ||||
|       MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} | ||||
|       PLATFORM: ${{ matrix.platform }} | ||||
|       PLATFORM: 'manylinux_2_28_x86_64' | ||||
|       BUILD_DEVICE: ${{ matrix.device }} | ||||
|     steps: | ||||
|       - name: Setup SSH (Click me for login details) | ||||
| @ -74,6 +59,20 @@ jobs: | ||||
|         run: | | ||||
|           set -eux | ||||
|  | ||||
|           # Keep PyTorch nightly wheel here so that we can install it later during | ||||
|           # vLLM build process | ||||
|           mkdir -p "${RUNNER_TEMP}/artifacts/" | ||||
|  | ||||
|           container_name=$(docker run \ | ||||
|             --tty \ | ||||
|             --detach \ | ||||
|             -e PLATFORM \ | ||||
|             -v "${GITHUB_WORKSPACE}:/pytorch" \ | ||||
|             -v "${RUNNER_TEMP}/artifacts:/artifacts" \ | ||||
|             -w /artifacts/ \ | ||||
|             "${MANYLINUX_IMAGE}" | ||||
|           ) | ||||
|  | ||||
|           # Determine python executable for given version (copied from build-triton-wheel) | ||||
|           case $PY_VERS in | ||||
|           3.10) | ||||
| @ -103,21 +102,6 @@ jobs: | ||||
|             ;; | ||||
|           esac | ||||
|  | ||||
|           # Keep PyTorch nightly wheel here so that we can install it later during | ||||
|           # vLLM build process | ||||
|           mkdir -p "${RUNNER_TEMP}/artifacts/" | ||||
|  | ||||
|           container_name=$(docker run \ | ||||
|             --tty \ | ||||
|             --detach \ | ||||
|             -e PLATFORM \ | ||||
|             -e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ | ||||
|             -v "${GITHUB_WORKSPACE}:/pytorch" \ | ||||
|             -v "${RUNNER_TEMP}/artifacts:/artifacts" \ | ||||
|             -w /artifacts/ \ | ||||
|             "${MANYLINUX_IMAGE}" | ||||
|           ) | ||||
|  | ||||
|           docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ | ||||
|             --pre torch torchvision torchaudio \ | ||||
|             --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" | ||||
| @ -129,6 +113,7 @@ jobs: | ||||
|             --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" | ||||
|  | ||||
|           # Save this for later | ||||
|           echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" | ||||
|           echo "container_name=${container_name}" >> "$GITHUB_ENV" | ||||
|  | ||||
|       - name: Build vLLM wheel | ||||
| @ -146,12 +131,41 @@ jobs: | ||||
|           set -eux | ||||
|  | ||||
|           # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh | ||||
|           docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh | ||||
|           docker exec -t "${container_name}" bash -c " | ||||
|             set -eux | ||||
|  | ||||
|             nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) | ||||
|  | ||||
|             pushd externals/vllm/wheels | ||||
|             for package in xformers flashinfer-python vllm; do | ||||
|               pushd \$package | ||||
|               auditwheel repair --plat \$PLATFORM *.whl \ | ||||
|                 --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* | ||||
|               repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) | ||||
|               repair_wheel=\$(basename \${repair_wheel}) | ||||
|               popd | ||||
|  | ||||
|               cp \${package}/wheelhouse/\${repair_wheel} . | ||||
|               version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) | ||||
|  | ||||
|               if [[ \$package == vllm ]]; then | ||||
|                 new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} | ||||
|               else | ||||
|                 major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3) | ||||
|                 new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} | ||||
|               fi | ||||
|  | ||||
|               mv -- \$repair_wheel \$new_wheel | ||||
|               rm -rf \$package | ||||
|             done | ||||
|             popd | ||||
|           " | ||||
|  | ||||
|           docker exec -t "${container_name}" chown -R 1000:1000 /artifacts | ||||
|  | ||||
|       - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 | ||||
|         with: | ||||
|           name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }} | ||||
|           name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} | ||||
|           if-no-files-found: error | ||||
|           path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl | ||||
|  | ||||
| @ -161,29 +175,27 @@ jobs: | ||||
|  | ||||
|   # Copied from build-triton-wheel workflow (mostly) | ||||
|   upload-wheel: | ||||
|     name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" | ||||
|     name: "Upload ${{ matrix.device }} vLLM wheel" | ||||
|     needs: | ||||
|       - build-wheel | ||||
|     runs-on: ubuntu-latest | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] | ||||
|         device: [ 'cu128', 'cu129' ] | ||||
|     env: | ||||
|       PLATFORM: ${{ matrix.platform }} | ||||
|       BUILD_DEVICE: ${{ matrix.device }} | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|     container: | ||||
|       image: continuumio/miniconda3:4.12.0 | ||||
|     environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }} | ||||
|     environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} | ||||
|     steps: | ||||
|       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | ||||
|  | ||||
|       - name: Configure AWS credentials(PyTorch account) for main | ||||
|         if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} | ||||
|         if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} | ||||
|         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels | ||||
| @ -207,15 +219,15 @@ jobs: | ||||
|         run: | | ||||
|           set -eux | ||||
|           mkdir -p "${RUNNER_TEMP}/artifacts/" | ||||
|           mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/" | ||||
|           mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" | ||||
|  | ||||
|       - name: Set DRY_RUN | ||||
|         if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} | ||||
|       - name: Set DRY_RUN (only for tagged pushes) | ||||
|         if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "DRY_RUN=disabled" >> "$GITHUB_ENV" | ||||
|  | ||||
|       - name: Set UPLOAD_CHANNEL | ||||
|       - name: Set UPLOAD_CHANNEL (only for tagged pushes) | ||||
|         if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} | ||||
|         shell: bash | ||||
|         run: | | ||||
|  | ||||
							
								
								
									
										3
									
								
								.github/workflows/docker-builds.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/docker-builds.yml
									
									
									
									
										vendored
									
									
								
							| @ -71,7 +71,8 @@ jobs: | ||||
|           pytorch-linux-jammy-py3-clang12-onnx, | ||||
|           pytorch-linux-jammy-linter, | ||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, | ||||
|           pytorch-linux-jammy-py3-clang12-executorch, | ||||
|           # Executorch pin needs update | ||||
|           # pytorch-linux-jammy-py3-clang12-executorch, | ||||
|           pytorch-linux-jammy-py3.12-triton-cpu, | ||||
|           pytorch-linux-noble-riscv64-py3.12-gcc14 | ||||
|         ] | ||||
|  | ||||
							
								
								
									
										42
									
								
								.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							
							
						
						
									
										42
									
								
								.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							| @ -132,7 +132,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_10-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -178,7 +178,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_10-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -224,7 +224,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_10-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -335,7 +335,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_11-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -381,7 +381,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_11-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -427,7 +427,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_11-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -538,7 +538,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_12-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -584,7 +584,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_12-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -630,7 +630,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_12-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -741,7 +741,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -787,7 +787,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -833,7 +833,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -944,7 +944,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13t-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -990,7 +990,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13t-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1036,7 +1036,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_13t-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1147,7 +1147,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1193,7 +1193,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1239,7 +1239,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1350,7 +1350,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14t-cuda-aarch64-12_6 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1396,7 +1396,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14t-cuda-aarch64-12_8 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
| @ -1442,7 +1442,7 @@ jobs: | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       build_name: manywheel-py3_14t-cuda-aarch64-13_0 | ||||
|       build_environment: linux-aarch64-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|       timeout-minutes: 420 | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/generated-linux-binary-manywheel-main.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/generated-linux-binary-manywheel-main.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							| @ -60,7 +60,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_12-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_12-cuda12_8-test:  # Testing | ||||
|  | ||||
							
								
								
									
										42
									
								
								.github/workflows/generated-linux-binary-manywheel-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							
							
						
						
									
										42
									
								
								.github/workflows/generated-linux-binary-manywheel-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							| @ -127,7 +127,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_10-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_10-cuda12_6-test:  # Testing | ||||
| @ -193,7 +193,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_10-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_10-cuda12_8-test:  # Testing | ||||
| @ -259,7 +259,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_10-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_10-cuda13_0-test:  # Testing | ||||
| @ -719,7 +719,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_11-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_11-cuda12_6-test:  # Testing | ||||
| @ -785,7 +785,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_11-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_11-cuda12_8-test:  # Testing | ||||
| @ -851,7 +851,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_11-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_11-cuda13_0-test:  # Testing | ||||
| @ -1311,7 +1311,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_12-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_12-cuda12_6-test:  # Testing | ||||
| @ -1377,7 +1377,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_12-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_12-cuda12_8-test:  # Testing | ||||
| @ -1443,7 +1443,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_12-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_12-cuda13_0-test:  # Testing | ||||
| @ -1903,7 +1903,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13-cuda12_6-test:  # Testing | ||||
| @ -1969,7 +1969,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13-cuda12_8-test:  # Testing | ||||
| @ -2035,7 +2035,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13-cuda13_0-test:  # Testing | ||||
| @ -2495,7 +2495,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13t-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13t-cuda12_6-test:  # Testing | ||||
| @ -2561,7 +2561,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13t-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13t-cuda12_8-test:  # Testing | ||||
| @ -2627,7 +2627,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_13t-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_13t-cuda13_0-test:  # Testing | ||||
| @ -3087,7 +3087,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14-cuda12_6-test:  # Testing | ||||
| @ -3153,7 +3153,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14-cuda12_8-test:  # Testing | ||||
| @ -3219,7 +3219,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14-cuda13_0-test:  # Testing | ||||
| @ -3679,7 +3679,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14t-cuda12_6 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14t-cuda12_6-test:  # Testing | ||||
| @ -3745,7 +3745,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14t-cuda12_8 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14t-cuda12_8-test:  # Testing | ||||
| @ -3811,7 +3811,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_14t-cuda13_0 | ||||
|       build_environment: linux-binary-manywheel | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_14t-cuda13_0-test:  # Testing | ||||
|  | ||||
							
								
								
									
										18
									
								
								.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							
							
						
						
									
										18
									
								
								.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							| @ -60,13 +60,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.10.4" | ||||
|           freethreaded: false | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -81,9 +81,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
|  | ||||
							
								
								
									
										336
									
								
								.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							
							
						
						
									
										336
									
								
								.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
									
									
									
										generated
									
									
										vendored
									
									
								
							| @ -56,13 +56,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.10.4" | ||||
|           freethreaded: false | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -77,9 +77,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -95,6 +99,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -105,9 +111,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
| @ -166,13 +196,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.11.4" | ||||
|           freethreaded: false | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -187,9 +217,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -205,6 +239,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -215,9 +251,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
| @ -276,13 +336,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.12.4" | ||||
|           freethreaded: false | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -297,9 +357,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -315,6 +379,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -325,9 +391,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
| @ -386,13 +476,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.13.4" | ||||
|           freethreaded: false | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -407,9 +497,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -425,6 +519,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -435,9 +531,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
| @ -496,13 +616,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.13.4" | ||||
|           freethreaded: true | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -517,9 +637,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -535,6 +659,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -545,9 +671,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
| @ -606,13 +756,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.14.0-rc.2" | ||||
|           freethreaded: false | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -627,9 +777,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -645,6 +799,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -655,9 +811,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
| @ -716,13 +896,13 @@ jobs: | ||||
|           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" | ||||
|           # shellcheck disable=SC2129 | ||||
|           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" | ||||
|       - name: Setup Python | ||||
|         uses: actions/setup-python@v6 | ||||
|         with: | ||||
|           # TODO: Removeme once 3.14 is out | ||||
|           # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 | ||||
|           python-version: "3.14.0-rc.2" | ||||
|           freethreaded: true | ||||
|       - name: Install conda and dependencies | ||||
|         run: | | ||||
|           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on | ||||
|           curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" | ||||
|           chmod +x "${RUNNER_TEMP}/conda.sh" | ||||
|           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" | ||||
|           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
| @ -737,9 +917,13 @@ jobs: | ||||
|         working-directory: pytorch | ||||
|       - name: Populate binary env | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" | ||||
|       - name: Build PyTorch binary | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -755,6 +939,8 @@ jobs: | ||||
|           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" | ||||
|       - name: Test PyTorch wheel | ||||
|         run: | | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           set -eux -o pipefail | ||||
|           # shellcheck disable=SC1090 | ||||
|           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" | ||||
| @ -765,9 +951,33 @@ jobs: | ||||
|  | ||||
|           SMOKE_TEST_PARAMS="" | ||||
|  | ||||
|           EXTRA_CONDA_INSTALL_FLAGS="" | ||||
|           CONDA_ENV_CREATE_FLAGS="" | ||||
|           # shellcheck disable=SC2153 | ||||
|           case $DESIRED_PYTHON in | ||||
|             3.14t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.14) | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" | ||||
|               desired_python="3.14.0rc1" | ||||
|               ;; | ||||
|             3.13t) | ||||
|               CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
|               EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" | ||||
|               desired_python="3.13" | ||||
|               ;; | ||||
|             *) | ||||
|               # shellcheck disable=SC2153 | ||||
|               desired_python=${DESIRED_PYTHON} | ||||
|               ;; | ||||
|           esac | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|           python -mvenv test_venv | ||||
|           source test_venv/bin/activate | ||||
|           conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} | ||||
|           conda activate test_conda_env | ||||
|           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v | ||||
|  | ||||
|           # shellcheck disable=SC2086 | ||||
|  | ||||
							
								
								
									
										4
									
								
								.github/workflows/inductor-nightly.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/inductor-nightly.yml
									
									
									
									
										vendored
									
									
								
							| @ -37,7 +37,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-default-label-prefix | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" | ||||
|       test-matrix: | | ||||
| @ -56,7 +56,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: nightly-dynamo-benchmarks-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} | ||||
|       timeout-minutes: 720 | ||||
|  | ||||
| @ -43,11 +43,6 @@ on: | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       freezing: | ||||
|         description: Run freezing? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: true | ||||
|       benchmark_configs: | ||||
|         description: The list of configs used the benchmark | ||||
|         required: false | ||||
| @ -80,7 +75,7 @@ jobs: | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
| @ -106,8 +101,8 @@ jobs: | ||||
|     needs: inductor-build | ||||
|     if: github.event.schedule == '0 7 * * *' | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true | ||||
|       docker-image: ${{ needs.inductor-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} | ||||
|       timeout-minutes: 720 | ||||
| @ -121,9 +116,10 @@ jobs: | ||||
|     name: inductor-test | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: inductor-build | ||||
|     if: github.event_name == 'workflow_dispatch' | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }} | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} | ||||
|       docker-image: ${{ needs.inductor-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} | ||||
|       timeout-minutes: 720 | ||||
|  | ||||
| @ -80,7 +80,7 @@ jobs: | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
| @ -107,7 +107,7 @@ jobs: | ||||
|     needs: inductor-build | ||||
|     if: github.event.schedule == '0 7 * * *' | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true | ||||
|       docker-image: ${{ needs.inductor-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} | ||||
| @ -124,7 +124,7 @@ jobs: | ||||
|     needs: inductor-build | ||||
|     if: github.event_name == 'workflow_dispatch' | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} | ||||
|       docker-image: ${{ needs.inductor-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} | ||||
|  | ||||
							
								
								
									
										8
									
								
								.github/workflows/inductor-periodic.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										8
									
								
								.github/workflows/inductor-periodic.yml
									
									
									
									
										vendored
									
									
								
							| @ -39,7 +39,7 @@ jobs: | ||||
|       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks | ||||
|       cuda-arch-list: '8.0;8.6' | ||||
|       cuda-arch-list: '8.6' | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
| @ -62,7 +62,7 @@ jobs: | ||||
|           { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
|           { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
|           { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
|           { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, | ||||
|           { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
|           { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
|           { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
|           { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, | ||||
| @ -154,7 +154,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-default-label-prefix | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" | ||||
|       test-matrix: | | ||||
| @ -200,7 +200,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: periodic-dynamo-benchmarks-cpu-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										4
									
								
								.github/workflows/inductor-unittest.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/inductor-unittest.yml
									
									
									
									
										vendored
									
									
								
							| @ -110,7 +110,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       test-matrix: | | ||||
| @ -127,7 +127,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: inductor-cpu-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										4
									
								
								.github/workflows/inductor.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/inductor.yml
									
									
									
									
										vendored
									
									
								
							| @ -79,7 +79,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       test-matrix: | | ||||
| @ -101,7 +101,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: inductor-cpu-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/nightly.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/nightly.yml
									
									
									
									
										vendored
									
									
								
							| @ -54,7 +54,7 @@ jobs: | ||||
|       - get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-gcc11 | ||||
|       build-environment: linux-jammy-py3.9-gcc11 | ||||
|       docker-image: ${{ needs.docs-build.outputs.docker-image }} | ||||
|       push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} | ||||
|       run-doxygen: true | ||||
|  | ||||
							
								
								
									
										10
									
								
								.github/workflows/operator_benchmark.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										10
									
								
								.github/workflows/operator_benchmark.yml
									
									
									
									
										vendored
									
									
								
							| @ -14,10 +14,6 @@ on: | ||||
|   schedule: | ||||
|     # Run at 07:00 UTC every Sunday | ||||
|     - cron: 0 7 * * 0 | ||||
|   pull_request: | ||||
|     paths: | ||||
|       - benchmarks/operator_benchmark/** | ||||
|       - .github/workflows/operator_benchmark.yml | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} | ||||
| @ -33,7 +29,7 @@ jobs: | ||||
|     name: opbenchmark-build | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
| @ -46,7 +42,7 @@ jobs: | ||||
|     name: opbenchmark-on-demand-build | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
| @ -59,7 +55,7 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: opbenchmark-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11-build | ||||
|       build-environment: linux-jammy-py3.9-gcc11-build | ||||
|       docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										28
									
								
								.github/workflows/pull.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										28
									
								
								.github/workflows/pull.yml
									
									
									
									
										vendored
									
									
								
							| @ -127,8 +127,6 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # More memory is needed to build with asan | ||||
|       runner: linux.2xlarge.memory | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-clang18-asan | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||
| @ -318,6 +316,32 @@ jobs: | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-build: | ||||
|     if: false  # Docker build needs pin update | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-test: | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: linux-jammy-py3-clang12-executorch-build | ||||
|     if: false # Has been broken for a while | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: | ||||
|     name: cuda12.8-py3.10-gcc9-sm75 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/slow.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/slow.yml
									
									
									
									
										vendored
									
									
								
							| @ -140,8 +140,6 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # More memory is needed to build with asan | ||||
|       runner: linux.2xlarge.memory | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-clang18-asan | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||
|  | ||||
							
								
								
									
										28
									
								
								.github/workflows/trunk.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										28
									
								
								.github/workflows/trunk.yml
									
									
									
									
										vendored
									
									
								
							| @ -240,7 +240,7 @@ jobs: | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.10-gcc11 | ||||
|       build-environment: linux-jammy-py3.9-gcc11 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
| @ -255,31 +255,7 @@ jobs: | ||||
|       - verify-cachebench-cpu-build | ||||
|       - target-determination | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.10-gcc11 | ||||
|       build-environment: linux-jammy-py3.9-gcc11 | ||||
|       docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-build: | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-test: | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: linux-jammy-py3-clang12-executorch-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/vllm.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/vllm.yml
									
									
									
									
										vendored
									
									
								
							| @ -36,8 +36,6 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata | ||||
|       allow-reuse-old-whl: false | ||||
|       build-additional-packages: "vision audio" | ||||
|       build-external-packages: "vllm" | ||||
|       build-environment: linux-jammy-cuda12.8-py3.12-gcc11 | ||||
|  | ||||
							
								
								
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -259,9 +259,6 @@ gen | ||||
| .pytest_cache | ||||
| aten/build/* | ||||
|  | ||||
| # Linker scripts for prioritized text optimization | ||||
| cmake/linker_script.ld | ||||
|  | ||||
| # Bram | ||||
| plsdontbreak | ||||
|  | ||||
| @ -392,5 +389,3 @@ android/pytorch_android_torchvision/.cxx | ||||
|  | ||||
| # Claude Code local configuration | ||||
| CLAUDE.local.md | ||||
| /test_*.py | ||||
| /debug_*.py | ||||
|  | ||||
| @ -13,7 +13,7 @@ exclude_patterns = [ | ||||
|     '**/fb/**', | ||||
|     'functorch/docs/**', | ||||
|     'functorch/examples/**', | ||||
|     'functorch/docs/source/tutorials/**', | ||||
|     'functorch/notebooks/**', | ||||
|     'torch/_inductor/fx_passes/serialized_patterns/**', | ||||
|     'torch/_inductor/autoheuristic/artifacts/**', | ||||
|     'scripts/**', | ||||
| @ -964,6 +964,7 @@ exclude_patterns = [ | ||||
|     'test/jit/**',  # should be run through test/test_jit.py | ||||
|     'test/ao/sparsity/**',  # should be run through test/test_ao_sparsity.py | ||||
|     'test/fx/**',  # should be run through test/test_fx.py | ||||
|     'test/bottleneck_test/**',  # excluded by test/run_test.py | ||||
|     'test/package/**',  # excluded by test/run_test.py | ||||
|     'test/distributed/argparse_util_test.py', | ||||
|     'test/distributed/bin/test_script.py', | ||||
| @ -1409,6 +1410,8 @@ exclude_patterns = [ | ||||
|     'torch/utils/benchmark/utils/timer.py', | ||||
|     'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', | ||||
|     'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', | ||||
|     'torch/utils/bottleneck/__init__.py', | ||||
|     'torch/utils/bottleneck/__main__.py', | ||||
|     'torch/utils/bundled_inputs.py', | ||||
|     'torch/utils/checkpoint.py', | ||||
|     'torch/utils/collect_env.py', | ||||
| @ -1565,6 +1568,7 @@ include_patterns = [ | ||||
| exclude_patterns = [ | ||||
|     'caffe2/**', | ||||
|     'functorch/docs/**', | ||||
|     'functorch/notebooks/**', | ||||
|     'torch/_inductor/fx_passes/serialized_patterns/**', | ||||
|     'torch/_inductor/autoheuristic/artifacts/**', | ||||
|     'test/dynamo/cpython/**', | ||||
|  | ||||
| @ -810,7 +810,7 @@ cc_library( | ||||
|     name = "torch_python", | ||||
|     srcs = libtorch_python_core_sources | ||||
|         + if_cuda(libtorch_python_cuda_sources) | ||||
|         + libtorch_python_distributed_sources | ||||
|         + if_cuda(libtorch_python_distributed_sources) | ||||
|         + GENERATED_AUTOGRAD_PYTHON, | ||||
|     hdrs = glob([ | ||||
|         "torch/csrc/generic/*.cpp", | ||||
|  | ||||
| @ -234,7 +234,6 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on" | ||||
| option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF) | ||||
| option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) | ||||
| option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) | ||||
| option(USE_LSAN "Use Leak Sanitizer" OFF) | ||||
| option(USE_TSAN "Use Thread Sanitizer" OFF) | ||||
| option(USE_CUDA "Use CUDA" ON) | ||||
| option(USE_XPU "Use XPU" ON) | ||||
| @ -380,13 +379,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" | ||||
|                        OFF "USE_CUDA" OFF) | ||||
| cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON | ||||
|                         "CPU_AARCH64" OFF) | ||||
| # prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. | ||||
| set(USE_PRIORITIZED_TEXT_DEFAULT OFF) | ||||
| if(LINUX AND CPU_AARCH64) | ||||
|   set(USE_PRIORITIZED_TEXT_DEFAULT ON) | ||||
| endif() | ||||
| cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." | ||||
|   "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) | ||||
|  | ||||
| option(USE_MIMALLOC "Use mimalloc" OFF) | ||||
| # Enable third party mimalloc library to improve memory allocation performance | ||||
| @ -664,11 +656,6 @@ endif(MSVC) | ||||
|  | ||||
| string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") | ||||
|  | ||||
| # Set linker max-page-size to 64KiB on AArch64 Linux | ||||
| if(LINUX AND CPU_AARCH64) | ||||
|   add_link_options_if_supported("-z,max-page-size=0x10000") | ||||
| endif() | ||||
|  | ||||
| # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not | ||||
| # applicable to mobile are disabled by this variable. Setting | ||||
| # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it | ||||
| @ -886,7 +873,7 @@ cmake_dependent_option( | ||||
|   "Whether to build the flash_attention kernel for scaled dot product attention.\ | ||||
|   Will be disabled if not supported by the platform" | ||||
|   ON | ||||
|   "(USE_CUDA AND NOT MSVC) OR USE_ROCM" | ||||
|   "USE_CUDA OR USE_ROCM;NOT MSVC" | ||||
|   OFF) | ||||
|  | ||||
| cmake_dependent_option( | ||||
| @ -902,9 +889,9 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) | ||||
|   set(USE_FBGEMM_GENAI off) | ||||
| endif() | ||||
|  | ||||
| # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. | ||||
| if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) | ||||
|   message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") | ||||
| # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100 | ||||
| if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a") | ||||
|   message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100") | ||||
|   set(USE_FBGEMM_GENAI ON) | ||||
| endif() | ||||
|  | ||||
| @ -921,7 +908,7 @@ cmake_dependent_option( | ||||
| # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake | ||||
| # | ||||
| if(USE_ROCM) | ||||
|   if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION) | ||||
|   if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) | ||||
|     include(cmake/External/aotriton.cmake) | ||||
|   endif() | ||||
| endif() | ||||
| @ -1433,57 +1420,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) | ||||
|   install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" | ||||
|           DESTINATION "${CMAKE_INSTALL_BINDIR}") | ||||
| endif() | ||||
|  | ||||
| if(USE_PRIORITIZED_TEXT_FOR_LD) | ||||
|   add_compile_options( | ||||
|     $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections> | ||||
|     $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections> | ||||
|   ) | ||||
|   set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") | ||||
|   set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") | ||||
|  | ||||
|   add_custom_command( | ||||
|     OUTPUT "${LINKER_SCRIPT_FILE_OUT}" | ||||
|     COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" | ||||
|     DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" | ||||
|     COMMENT "Generating prioritized text linker files" | ||||
|     VERBATIM | ||||
|   ) | ||||
|  | ||||
|   add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") | ||||
|  | ||||
|   if(BUILD_PYTHON) | ||||
|     set(LINKER_OPT_TARGETS torch_python) | ||||
|   endif() | ||||
|  | ||||
|   if(NOT BUILD_LIBTORCHLESS) | ||||
|     list(APPEND LINKER_OPT_TARGETS torch_cpu c10) | ||||
|     if(USE_CUDA) | ||||
|       list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) | ||||
|     endif() | ||||
|     if(USE_XPU) | ||||
|       list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) | ||||
|     endif() | ||||
|     if(USE_ROCM) | ||||
|       list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   foreach(tgt IN LISTS LINKER_OPT_TARGETS) | ||||
|     if(TARGET ${tgt}) | ||||
|       add_dependencies("${tgt}" generate_linker_script) | ||||
|       target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") | ||||
|       set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") | ||||
|     else() | ||||
|        message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") | ||||
|     endif() | ||||
|   endforeach() | ||||
|  | ||||
| else() | ||||
|   if(LINUX AND CPU_AARCH64) | ||||
|     message(WARNING [[ | ||||
|     It is strongly recommend to enable linker script optimization for all AArch64 Linux builds. | ||||
|     To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 | ||||
|     ]]) | ||||
|   endif() | ||||
| endif() | ||||
| @ -50,7 +50,6 @@ Following is the Release Compatibility Matrix for PyTorch releases: | ||||
|  | ||||
| | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm | | ||||
| | --- | --- | --- | --- | --- | --- | | ||||
| | 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 | | ||||
| | 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 | | ||||
| | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 | | ||||
| | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 | | ||||
|  | ||||
| @ -16,8 +16,6 @@ However, if you believe you have found a security vulnerability in PyTorch, we e | ||||
|  | ||||
| Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new | ||||
|  | ||||
| All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework. | ||||
|  | ||||
| Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: | ||||
|  | ||||
| https://www.facebook.com/whitehat | ||||
|  | ||||
| @ -265,14 +265,6 @@ IF(USE_FBGEMM_GENAI) | ||||
|       "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") | ||||
|     list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) | ||||
|  | ||||
|     # PyTorch is not built for 10.0a in CI, due to lack of portability, | ||||
|     # so we need to explicitly build these files for 10.0a. | ||||
|     foreach(cu_file ${fbgemm_genai_native_cuda_cu}) | ||||
|       _BUILD_FOR_ADDITIONAL_ARCHS( | ||||
|         "${cu_file}" | ||||
|         "100a") | ||||
|     endforeach() | ||||
|  | ||||
|     file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp | ||||
|       "${FBGEMM_GENAI_SRCS}/common/*.cpp" | ||||
|     ) | ||||
|  | ||||
| @ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) { | ||||
| } | ||||
|  | ||||
| bool Context::allowTF32CuDNN(const std::string& op) const { | ||||
|   if (op.empty()){ | ||||
|   if (op.size() == 0){ | ||||
|     bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; | ||||
|     bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; | ||||
|     TORCH_CHECK( | ||||
| @ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const { | ||||
|  | ||||
| static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; | ||||
| static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"}; | ||||
| #ifdef USE_ROCM | ||||
| static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; | ||||
| #endif | ||||
|  | ||||
| bool Context::checkCuBLASConfigDeterministic() { | ||||
|   // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config | ||||
| @ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) { | ||||
| } | ||||
|  | ||||
| bool Context::allowTF32CuBLAS() const { | ||||
| #ifdef USE_ROCM | ||||
|     const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); | ||||
|     if (allow_tf32 != true) { | ||||
|       return false; | ||||
|     } | ||||
| #endif | ||||
|   bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; | ||||
|   bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; | ||||
|   TORCH_CHECK( | ||||
| @ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const { | ||||
| } | ||||
|  | ||||
| void Context::setAllowTF32CuBLAS(bool b) { | ||||
| #ifdef USE_ROCM | ||||
|   const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); | ||||
|   if (allow_tf32 != true) { | ||||
|     C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. " | ||||
|                               << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; | ||||
|     return; | ||||
|   } | ||||
| #endif | ||||
|   float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; | ||||
|   setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); | ||||
| } | ||||
| @ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string& | ||||
|     std::string msg; | ||||
|     auto iterp = _fp32_precisions.find(backend); | ||||
|     TORCH_CHECK(iterp != _fp32_precisions.end()); | ||||
|     for (const auto& p : iterp->second) { | ||||
|     for (auto p : iterp->second) { | ||||
|       msg += p; | ||||
|       msg += " "; | ||||
|     } | ||||
|  | ||||
| @ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) { | ||||
|       break; | ||||
|     // TODO(#146647): use macro here instead of spelling out each shell dtype | ||||
|     case ScalarType::Float8_e5m2: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e5m2; | ||||
|       break; | ||||
|     case ScalarType::Float8_e5m2fnuz: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz; | ||||
|       break; | ||||
|     case ScalarType::Float8_e4m3fn: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn; | ||||
|       break; | ||||
|     case ScalarType::Float8_e4m3fnuz: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz; | ||||
|       break; | ||||
|     case ScalarType::Float8_e8m0fnu: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu; | ||||
|       TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack"); | ||||
|       break; | ||||
|     case ScalarType::Float4_e2m1fn_x2: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; | ||||
|       dtype.lanes = 2; | ||||
|       dtype.bits = 4; | ||||
|       TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack"); | ||||
|       break; | ||||
|     case ScalarType::QInt8: | ||||
|     case ScalarType::QUInt8: | ||||
| @ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat | ||||
|  | ||||
| ScalarType toScalarType(const DLDataType& dtype) { | ||||
|   ScalarType stype = ScalarType::Undefined; | ||||
|   if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { | ||||
|     TORCH_CHECK_BUFFER( | ||||
|         dtype.lanes == 1, | ||||
|         "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code)); | ||||
|   } | ||||
|   TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1"); | ||||
|   switch (dtype.code) { | ||||
|     case DLDataTypeCode::kDLUInt: | ||||
|       switch (dtype.bits) { | ||||
| @ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) { | ||||
|               false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e5m2: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e5m2; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e5m2fnuz: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e5m2fnuz; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e4m3fn: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e4m3fn; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e4m3fnuz: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e4m3fnuz; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e8m0fnu: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e8m0fnu; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat4_e2m1fn: | ||||
|       switch (dtype.bits) { | ||||
|         case 4: | ||||
|           switch (dtype.lanes) { | ||||
|             case 2: | ||||
|               stype = ScalarType::Float4_e2m1fn_x2; | ||||
|               break; | ||||
|             default: | ||||
|               TORCH_CHECK_BUFFER( | ||||
|                 false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes)); | ||||
|           } | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     default: | ||||
|       TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); | ||||
|   } | ||||
| @ -435,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) { | ||||
|   atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); | ||||
|   atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim()); | ||||
|   atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); | ||||
|   atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data()); | ||||
|   atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data()); | ||||
|   atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); | ||||
|   atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); | ||||
|   atDLMTensor->tensor.dl_tensor.byte_offset = 0; | ||||
|   fillVersion(&atDLMTensor->tensor); | ||||
|  | ||||
|  | ||||
| @ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) | ||||
|   // SparseTensorImpl has no storage, so we cannot query its nbytes. | ||||
|   // (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse) | ||||
|   // Same for XLA | ||||
|   if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) { | ||||
|   if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) { | ||||
|     original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes(); | ||||
|   } else { | ||||
|     original_storage_size_ = -1; | ||||
|  | ||||
| @ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { | ||||
|         "resize_ called on tensor with symbolic shape") | ||||
|     TORCH_CHECK( | ||||
|         sparse_dim + dense_dim == static_cast<int64_t>(size.size()), | ||||
|         "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", | ||||
|         size.size(), | ||||
|         ", sparse_dim = ", | ||||
|         "number of dimensions must be sparse_dim (", | ||||
|         sparse_dim, | ||||
|         ", dense_dim = ", | ||||
|         dense_dim); | ||||
|         ") + dense_dim (", | ||||
|         dense_dim, | ||||
|         "), but got ", | ||||
|         size.size()); | ||||
|     if (nnz() > 0) { | ||||
|       [[maybe_unused]] auto constexpr alt_options_msg = | ||||
|           "You could try the following options:\n\ | ||||
| @ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { | ||||
|         "resize_and_clear_ called on tensor with symbolic shape") | ||||
|     TORCH_CHECK( | ||||
|         sparse_dim + dense_dim == static_cast<int64_t>(size.size()), | ||||
|         "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ", | ||||
|         size.size(), | ||||
|         ", sparse_dim = ", | ||||
|         "number of dimensions must be sparse_dim (", | ||||
|         sparse_dim, | ||||
|         ", dense_dim = ", | ||||
|         dense_dim); | ||||
|         ") + dense_dim (", | ||||
|         dense_dim, | ||||
|         "), but got ", | ||||
|         size.size()); | ||||
|  | ||||
|     set_sizes_and_strides(size, std::vector<int64_t>(size.size())); | ||||
|     sparse_dim_ = sparse_dim; | ||||
|  | ||||
| @ -644,8 +644,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP | ||||
|   void * beta_ptr = &fbeta; | ||||
| #ifdef USE_ROCM | ||||
|   int flag = 0; | ||||
|   rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; | ||||
|   rocblas_datatype d_type = c_type; | ||||
| #if USE_GEMM_FLAGS_FP16_ALT_IMPL | ||||
|   flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; | ||||
| #endif | ||||
| @ -654,8 +652,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP | ||||
|                                    hipOperationToRocOperation(opb), (int)m, (int)n, (int)k, | ||||
|                                    (void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea, | ||||
|                                    b, rocblas_datatype_f16_r, (int)ldb, strideb, | ||||
|                                    (void*)beta_ptr, c, c_type, (int)ldc, stridec, | ||||
|                                    c, d_type, (int)ldc, stridec, | ||||
|                                    (void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec, | ||||
|                                    c, rocblas_datatype_f16_r, (int)ldc, stridec, | ||||
|                                    (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, | ||||
|                                    0, flag))); | ||||
| #else | ||||
| @ -1098,8 +1096,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( | ||||
|   GEMM_CHECK_ARGVALUES(at::Half); | ||||
| #ifdef USE_ROCM | ||||
|   int flag = 0; | ||||
|   rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r; | ||||
|   rocblas_datatype d_type = c_type; | ||||
| #if USE_GEMM_FLAGS_FP16_ALT_IMPL | ||||
|   flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; | ||||
| #endif | ||||
| @ -1119,10 +1115,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( | ||||
|       ldb, | ||||
|       beta_ptr, | ||||
|       c, | ||||
|       c_type, | ||||
|       rocblas_datatype_f16_r, | ||||
|       ldc, | ||||
|       c, | ||||
|       d_type, | ||||
|       rocblas_datatype_f16_r, | ||||
|       ldc, | ||||
|       rocblas_datatype_f32_r, | ||||
|       rocblas_gemm_algo_standard, | ||||
|  | ||||
| @ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl( | ||||
|  * See Note [Acquire lock when using random generators] | ||||
|  */ | ||||
| void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { | ||||
|   if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { | ||||
|     state_->seed_ = seed; | ||||
|     state_->philox_offset_per_thread_ = 0; | ||||
|     no_reset_rnn_state_.clear(); | ||||
|   } else { | ||||
|     TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed."); | ||||
|     // no-op case | ||||
|   } | ||||
|   at::cuda::assertNotCapturing( | ||||
|       "Cannot call CUDAGeneratorImpl::set_current_seed"); | ||||
|   state_->seed_ = seed; | ||||
|   state_->philox_offset_per_thread_ = 0; | ||||
|   no_reset_rnn_state_.clear(); | ||||
| } | ||||
|  | ||||
| /** | ||||
| @ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const { | ||||
|  * Gets the current seed of CUDAGeneratorImpl. | ||||
|  */ | ||||
| uint64_t CUDAGeneratorImpl::current_seed() const { | ||||
|   // Debatable if current_seed() should be allowed in captured regions. | ||||
|   // Conservatively disallow it for now. | ||||
|   at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); | ||||
|   return state_->seed_; | ||||
| } | ||||
|  | ||||
| @ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const { | ||||
|  * and size of the internal state. | ||||
|  */ | ||||
| void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { | ||||
|   at::cuda::assertNotCapturing( | ||||
|       "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); | ||||
|   static const size_t seed_size = sizeof(uint64_t); | ||||
|   static const size_t offset_size = sizeof(int64_t); | ||||
|   static const size_t total_size = seed_size + offset_size; | ||||
| @ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state() | ||||
|  */ | ||||
| void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { | ||||
|   // see Note [Why enforce RNG offset % 4 == 0?] | ||||
|  | ||||
|   // Note: If you use CUDNN RNN's, calling | ||||
|   // set_philox_offset_per_thread instead of set_offset will cause the | ||||
|   // cudnn RNN rng state to become stale. | ||||
|   TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); | ||||
|   if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { | ||||
|     state_->philox_offset_per_thread_ = offset; | ||||
|   } else { | ||||
|     state_->offset_intragraph_ = offset; | ||||
|   } | ||||
|   state_->philox_offset_per_thread_ = offset; | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. | ||||
|  */ | ||||
| uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { | ||||
|   if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { | ||||
|     return state_->philox_offset_per_thread_; | ||||
|   } else { | ||||
|     return state_->offset_intragraph_; | ||||
|   } | ||||
|   return state_->philox_offset_per_thread_; | ||||
| } | ||||
|  | ||||
| /** | ||||
|  | ||||
| @ -45,24 +45,6 @@ struct OffsetCalculator { | ||||
|  | ||||
|   C10_HOST_DEVICE offset_type get(index_t linear_idx) const { | ||||
|     offset_type offsets; | ||||
|  | ||||
| #if defined(USE_ROCM) | ||||
|     if ((dims > 0) && (dims <= 2)) { | ||||
|       auto divmod = sizes_[0].divmod(linear_idx); | ||||
| #pragma unroll | ||||
|       for (int arg = 0; arg < NARGS; arg++) | ||||
|         offsets[arg] = divmod.mod * strides_[0][arg]; | ||||
|       if (dims >= 2) { | ||||
|         divmod = sizes_[1].divmod(divmod.div); | ||||
| #pragma unroll | ||||
|         for (int arg = 0; arg < NARGS; arg++) | ||||
|           offsets[arg] += divmod.mod * strides_[1][arg]; | ||||
|       } | ||||
|       // [...] | ||||
|       return offsets; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     #pragma unroll | ||||
|     for (int arg = 0; arg < NARGS; arg++) { | ||||
|       offsets[arg] = 0; | ||||
|  | ||||
| @ -19,7 +19,7 @@ | ||||
| #define DLPACK_MAJOR_VERSION 1 | ||||
|  | ||||
| /*! \brief The current minor version of dlpack */ | ||||
| #define DLPACK_MINOR_VERSION 1 | ||||
| #define DLPACK_MINOR_VERSION 0 | ||||
|  | ||||
| /*! \brief DLPACK_DLL prefix for windows */ | ||||
| #ifdef _WIN32 | ||||
| @ -32,7 +32,9 @@ | ||||
| #define DLPACK_DLL | ||||
| #endif | ||||
|  | ||||
| // NOLINTNEXTLINE(modernize-deprecated-headers) | ||||
| #include <stdint.h> | ||||
| // NOLINTNEXTLINE(modernize-deprecated-headers) | ||||
| #include <stddef.h> | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| @ -157,26 +159,6 @@ typedef enum { | ||||
|   kDLComplex = 5U, | ||||
|   /*! \brief boolean */ | ||||
|   kDLBool = 6U, | ||||
|   /*! \brief FP8 data types */ | ||||
|   kDLFloat8_e3m4 = 7U, | ||||
|   kDLFloat8_e4m3 = 8U, | ||||
|   kDLFloat8_e4m3b11fnuz = 9U, | ||||
|   kDLFloat8_e4m3fn = 10U, | ||||
|   kDLFloat8_e4m3fnuz = 11U, | ||||
|   kDLFloat8_e5m2 = 12U, | ||||
|   kDLFloat8_e5m2fnuz = 13U, | ||||
|   kDLFloat8_e8m0fnu = 14U, | ||||
|   /*! \brief FP6 data types | ||||
|    * Setting bits != 6 is currently unspecified, and the producer must ensure it is set | ||||
|    * while the consumer must stop importing if the value is unexpected. | ||||
|    */ | ||||
|   kDLFloat6_e2m3fn = 15U, | ||||
|   kDLFloat6_e3m2fn = 16U, | ||||
|   /*! \brief FP4 data types | ||||
|    * Setting bits != 4 is currently unspecified, and the producer must ensure it is set | ||||
|    * while the consumer must stop importing if the value is unexpected. | ||||
|    */ | ||||
|   kDLFloat4_e2m1fn = 17U, | ||||
| } DLDataTypeCode; | ||||
|  | ||||
| /*! | ||||
| @ -190,12 +172,6 @@ typedef enum { | ||||
|  *   - int8: type_code = 0, bits = 8, lanes = 1 | ||||
|  *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1 | ||||
|  *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) | ||||
|  *   - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) | ||||
|  *   - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) | ||||
|  *   - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) | ||||
|  * | ||||
|  *  When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., | ||||
|  *  for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. | ||||
|  */ | ||||
| typedef struct { | ||||
|   /*! | ||||
| @ -253,12 +229,12 @@ typedef struct { | ||||
|   /*! \brief The data type of the pointer*/ | ||||
|   DLDataType dtype; | ||||
|   /*! \brief The shape of the tensor */ | ||||
|   int64_t* shape; | ||||
|   const int64_t* shape; | ||||
|   /*! | ||||
|    * \brief strides of the tensor (in number of elements, not bytes) | ||||
|    *  can be NULL, indicating tensor is compact and row-majored. | ||||
|    */ | ||||
|   int64_t* strides; | ||||
|   const int64_t* strides; | ||||
|   /*! \brief The offset in bytes to the beginning pointer to data */ | ||||
|   uint64_t byte_offset; | ||||
| } DLTensor; | ||||
| @ -293,7 +269,7 @@ typedef struct DLManagedTensor { | ||||
|   void (*deleter)(struct DLManagedTensor * self); | ||||
| } DLManagedTensor; | ||||
|  | ||||
| // bit masks used in the DLManagedTensorVersioned | ||||
| // bit masks used in in the DLManagedTensorVersioned | ||||
|  | ||||
| /*! \brief bit mask to indicate that the tensor is read only. */ | ||||
| #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) | ||||
| @ -306,14 +282,6 @@ typedef struct DLManagedTensor { | ||||
|  */ | ||||
| #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) | ||||
|  | ||||
| /* | ||||
|  * \brief bit mask to indicate that whether a sub-byte type is packed or padded. | ||||
|  * | ||||
|  * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can | ||||
|  * be set by the producer to signal that a tensor of sub-byte type is padded. | ||||
|  */ | ||||
| #define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) | ||||
|  | ||||
| /*! | ||||
|  * \brief A versioned and managed C Tensor object, manage memory of DLTensor. | ||||
|  * | ||||
|  | ||||
| @ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { | ||||
|  | ||||
|   POINTWISE_BOXED(fill_.Scalar); | ||||
|   POINTWISE_BOXED(zero_); | ||||
|   // This is special because this op doesn't return anything | ||||
|   m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); | ||||
|  | ||||
| #undef UNARY_POINTWISE | ||||
| #undef UNARY_POINTWISE_ALL | ||||
|  | ||||
| @ -457,9 +457,24 @@ void gemm( | ||||
|     return; | ||||
|   } | ||||
| #endif | ||||
|   // for the fallback path, first compute gemm with beta = 0, | ||||
|   // and then add c in full precision. | ||||
|   int64_t c_size = n * m; | ||||
|   std::vector<float> float_c(c_size, 0.f); | ||||
|   gemm_no_downcast_stub( | ||||
|       at::kCPU, at::kBFloat16, | ||||
|       transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); | ||||
|       transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); | ||||
|   for (const auto j : c10::irange(n)) { | ||||
|     for (const auto i : c10::irange(m)) { | ||||
|       auto offset = j * ldc + i; | ||||
|       // beta == 0 won't propagate NaN from C | ||||
|       if (beta == 0.f) { | ||||
|         c[offset] = float_c[j * m + i]; | ||||
|       } else { | ||||
|         c[offset] = beta * c[offset] + float_c[j * m + i]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| void gemm( | ||||
| @ -478,9 +493,24 @@ void gemm( | ||||
|     return; | ||||
|   } | ||||
| #endif | ||||
|   // for the fallback path, first compute gemm with beta = 0, | ||||
|   // and then add c in full precision. | ||||
|   int64_t c_size = n * m; | ||||
|   std::vector<float> float_c(c_size, 0.f); | ||||
|   gemm_no_downcast_stub( | ||||
|       at::kCPU, at::kHalf, | ||||
|       transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); | ||||
|       transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); | ||||
|   for (const auto j : c10::irange(n)) { | ||||
|     for (const auto i : c10::irange(m)) { | ||||
|       auto offset = j * ldc + i; | ||||
|       // beta == 0 won't propagate NaN from C | ||||
|       if (beta == 0.f) { | ||||
|         c[offset] = float_c[j * m + i]; | ||||
|       } else { | ||||
|         c[offset] = beta * c[offset] + float_c[j * m + i]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| void gemm( | ||||
|  | ||||
| @ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { | ||||
|   // TODO: contiguous can be made to preserve the memory format | ||||
|   // of the input. However since the above reshape clobbers h and w | ||||
|   // it may not be safe to do that, since channels_last contiguous | ||||
|   // may think oc and the last dim correspond to h,w? | ||||
|   // may think oc and and the last dim correspond to h,w? | ||||
|   // It is not clear, however from initial looking around it feels that | ||||
|   // this may not be correct. | ||||
|   // In this case channels last will likely require custom implementation | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| #pragma once | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/Config.h> | ||||
| #include <cstdint> | ||||
|  | ||||
| @ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)( | ||||
|   int64_t inputH = input_.size(heightDim); | ||||
|   int64_t inputW = input_.size(widthDim); | ||||
|  | ||||
|   TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT), | ||||
|   TORCH_CHECK(outputT + poolSizeT - 1 < inputT, | ||||
|            "fractional_max_pool3d_out(): pool time ", poolSizeT, | ||||
|            " too large relative to input time ", inputT); | ||||
|   TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW), | ||||
|   TORCH_CHECK(outputW + poolSizeW - 1 < inputW, | ||||
|            "fractional_max_pool3d_out(): pool width ", poolSizeW, | ||||
|            " too large relative to input width ", inputW); | ||||
|   TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH), | ||||
|   TORCH_CHECK(outputH + poolSizeH - 1 < inputH, | ||||
|            "fractional_max_pool3d_out(): pool height ", poolSizeH, | ||||
|            " too large relative to input height ", inputH); | ||||
|  | ||||
|  | ||||
| @ -1360,8 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { | ||||
| #endif | ||||
|  | ||||
|  | ||||
| #if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() | ||||
| // Used by default on x86 platforms and on AArch64+ACL | ||||
| #if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() | ||||
| static inline int64_t get_mkldnn_matmul_min_dim() { | ||||
|   static auto value = [&] { | ||||
|     const int64_t default_min_dim = [&] { | ||||
| @ -1396,6 +1395,8 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { | ||||
|   return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; | ||||
| } | ||||
| #endif | ||||
|  | ||||
|  | ||||
| static void addmm_impl_cpu_( | ||||
|     Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { | ||||
|   TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); | ||||
| @ -1771,8 +1772,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens | ||||
|     return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || | ||||
|         (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); | ||||
|   }; | ||||
| #if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() | ||||
|   // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL | ||||
|  | ||||
| #if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() | ||||
|   bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); | ||||
|   if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { | ||||
|     try { | ||||
| @ -1784,6 +1785,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   if (contraction_size * res_rows * res_cols < 400) { | ||||
|     if (is_bmm_out) { | ||||
|       AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { | ||||
|  | ||||
| @ -624,9 +624,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index( | ||||
|   if (backend == BatchNormBackend::Miopen) { | ||||
|     return std::tuple_cat( | ||||
|              at::miopen_batch_norm( | ||||
|                input.contiguous(input.suggest_memory_format()), | ||||
|                weight.contiguous(), | ||||
|                bias.contiguous(), | ||||
|                input.contiguous(), weight.contiguous(), bias.contiguous(), | ||||
|                running_mean.defined() ? running_mean.contiguous() : running_mean, | ||||
|                running_var.defined() ? running_var.contiguous() : running_var, | ||||
|                training, momentum, eps), | ||||
|  | ||||
| @ -2174,7 +2174,7 @@ static void _scatter_via_index_put( | ||||
|   if (self.dim() == 1 || broadcast_index) { | ||||
|     Tensor squeezed = index; | ||||
|     if (broadcast_index && index.dim() > 1) { | ||||
|       for (int64_t d = index.dim() - 1; d >= 0; --d) { | ||||
|       for (const auto d : c10::irange(index.dim())) { | ||||
|         if (d == dim) { | ||||
|           continue; | ||||
|         } | ||||
|  | ||||
| @ -52,7 +52,6 @@ void apply_triu_tril_single( | ||||
|     int64_t self_col_stride, | ||||
|     bool upper) { | ||||
|   constexpr int64_t zero = 0; | ||||
|   k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN. | ||||
|  | ||||
|   if (upper) { | ||||
|     parallel_for(0, n, 0, [&](int64_t start, int64_t end) { | ||||
|  | ||||
| @ -85,11 +85,11 @@ void cpu_max_unpool( | ||||
|     if constexpr (is_3d) { | ||||
|       TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), | ||||
|           " (output volumes are of size ", output_depth, | ||||
|           "x", output_height, "x", output_width, ")"); | ||||
|           "x", output_height, "x", output_width); | ||||
|     } else { | ||||
|       TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), | ||||
|           " (output volumes are of size ", output_height, | ||||
|           "x", output_width, ")"); | ||||
|           "x", output_width); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { | ||||
|             [zero, one_sixth, three, six] GPU_LAMBDA( | ||||
|                 scalar_t self_val) -> scalar_t { | ||||
|               opmath_t x = static_cast<opmath_t>(self_val); | ||||
|               return std::min<opmath_t>(std::max<opmath_t>(x + three, zero), six) * one_sixth; | ||||
|               return std::min(std::max(x + three, zero), six) * one_sixth; | ||||
|             }); | ||||
|       }); | ||||
| } | ||||
|  | ||||
| @ -1080,6 +1080,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static bool _grouped_mm_allowed_device() { | ||||
| #ifdef USE_ROCM | ||||
|     return false; | ||||
| #else | ||||
|     auto dprops = at::cuda::getCurrentDeviceProperties(); | ||||
|     // CUDA capability 8.0 and greater | ||||
|     return dprops->major >= 8; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
| static bool _scaled_mm_is_fnuz() { | ||||
|     return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); | ||||
| @ -1776,19 +1786,14 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, | ||||
| const std::optional<at::Tensor>& offs, | ||||
| const std::optional<at::Tensor>& bias, | ||||
| std::optional<c10::ScalarType> out_dtype) { | ||||
| #ifndef USE_ROCM | ||||
|   _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); | ||||
|   bool a_b_and_out_are_bf16 = ( | ||||
|     mat_a.dtype() == at::kBFloat16 && | ||||
|     mat_b.dtype() == at::kBFloat16 && | ||||
|     out_dtype.value_or(at::kBFloat16) == at::kBFloat16 | ||||
|   ); | ||||
| #ifndef USE_ROCM | ||||
|   bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; | ||||
| #else | ||||
|   // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. | ||||
|   // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm | ||||
|   bool use_fast_path = false; | ||||
| #endif | ||||
|   const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); | ||||
|   Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); | ||||
|   if (use_fast_path) { | ||||
| @ -1798,6 +1803,9 @@ std::optional<c10::ScalarType> out_dtype) { | ||||
|     _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); | ||||
|   } | ||||
|   return out; | ||||
| #else | ||||
|   TORCH_CHECK(false, "grouped gemm is not supported on ROCM") | ||||
| #endif | ||||
| } | ||||
|  | ||||
| Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { | ||||
|  | ||||
| @ -317,17 +317,6 @@ void nonzero_static_cuda_out_impl( | ||||
|     out_temp = | ||||
|         Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t(); | ||||
|   } | ||||
|   // If input has zero elements, avoid kernel grid calculations (which can | ||||
|   // produce zero divisors) and just fill the output with fill_value. | ||||
|   if (self.numel() == 0) { | ||||
|     if (need_to_copy) { | ||||
|       out_temp.fill_(fill_value); | ||||
|       out.copy_(out_temp); | ||||
|     } else { | ||||
|       out.fill_(fill_value); | ||||
|     } | ||||
|     return; | ||||
|   } | ||||
|   int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>() | ||||
|                                        : out.mutable_data_ptr<int64_t>(); | ||||
|  | ||||
|  | ||||
| @ -416,7 +416,6 @@ struct ReduceOp { | ||||
|     if (config.should_block_y_reduce()) { | ||||
|       value = block_y_reduce<output_vec_size>(value, shared_memory); | ||||
|     } | ||||
|     __syncthreads(); | ||||
|     if (config.should_block_x_reduce()) { | ||||
|       value = block_x_reduce<output_vec_size>(value, shared_memory); | ||||
|     } | ||||
|  | ||||
| @ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel( | ||||
|     index_t* result_ptr, | ||||
|     int64_t size, | ||||
|     int64_t result_size) { | ||||
|   CUDA_KERNEL_ASSERT_PRINTF( | ||||
|       result_size == cumsum_ptr[size - 1], | ||||
|   if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) { | ||||
|     printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] " | ||||
|       "Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n", | ||||
|       result_size, | ||||
|       cumsum_ptr[size - 1]); | ||||
|       __FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]); | ||||
|     CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1]) | ||||
|   } | ||||
|  | ||||
|   int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x; | ||||
|   int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE; | ||||
|  | ||||
| @ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig( | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec> | ||||
| __global__ void CatArrayBatchedCopy_vectorized( | ||||
|     char* output, | ||||
|     CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs, | ||||
|     TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os, | ||||
|     const int concatDim, | ||||
|     IndexType trailingSize) { | ||||
|  | ||||
|     IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|     IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec; | ||||
|  | ||||
|     if(tid >= nElements) return; | ||||
|  | ||||
|     const char * data = (char*)inputs.input[blockIdx.y]; | ||||
|     IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec; | ||||
|     IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec; | ||||
|     int64_t dataOffset = (int64_t)offset  * alignment; // in bytes | ||||
|  | ||||
|     IndexType stride = gridDim.x * blockDim.x; | ||||
|  | ||||
|     while( tid < nElements){ | ||||
|       int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute( | ||||
|                     os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes | ||||
|       auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid); | ||||
|       at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec); | ||||
|       tid += stride; | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| /* | ||||
|   Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads | ||||
|   to improve memory bandwidth throughput. | ||||
| @ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|   scalar_t *data = (scalar_t *)(out.mutable_data_ptr()); | ||||
|   CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData; | ||||
|   TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam; | ||||
|   // If all batches are contiguous we can call a specialized implementation | ||||
|   // which requires the input tensor addresses to be aligned to a | ||||
|   // 16 Byte boundary. | ||||
|  | ||||
|   constexpr bool isContig = stride_size == 1; | ||||
|   bool isAligned = true; | ||||
|   constexpr int alignment = 16; | ||||
|  | ||||
|   // Next, let's initialize the size, stride arrays for the output Tensor. | ||||
|   // for contig case, we'll canonicalize output strides, so that | ||||
|   // we don't have arbitrary strides for dims of size 0 | ||||
|   size_t stride0 = 1; | ||||
|   if (memory_format == c10::MemoryFormat::Contiguous) { | ||||
|     for (int i = nDims - 1; i >= 0; --i) { | ||||
|     for (int i = 0; i < nDims; ++i) { | ||||
|       outputParam.tensorSize[i] = out.size(i); | ||||
|       if (isContig) { | ||||
|         outputParam.tensorStride[i] = stride0; | ||||
|         stride0 *= out.size(i); | ||||
|       } else { | ||||
|         outputParam.tensorStride[i] = out.stride(i); | ||||
|       } | ||||
|       outputParam.tensorStride[i] = out.stride(i); | ||||
|     } | ||||
|   } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { | ||||
|     // permute the semantics of dims from NCHW to NHWC so that the input | ||||
| @ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|  | ||||
|   at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   // If all batches are contiguous we can call a specialized implementation | ||||
|   // which requires the input tensor addresses to be aligned to a | ||||
|   // 16 Byte boundary. | ||||
|  | ||||
|   // for channels last computing slice size correctly is much more involved, so we never send it | ||||
|   // on the fully vectorized path | ||||
|   // we need output stride in cat dimension to be multiple of alignment, | ||||
|   // if we ever use it to compute offsets | ||||
|   // for catting in 0th dimension it doesn't matter | ||||
|   bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment && | ||||
|                         memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 || | ||||
|                         outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0); | ||||
|   bool isContig = true; | ||||
|   bool isAligned = true; | ||||
|   unsigned int max_elements_per_tensor = 0; | ||||
|  | ||||
|   // Now we loop | ||||
| @ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|       // high-dimensional tensor | ||||
|       if (inputs[i+batchCounter].get().numel() > 0) { | ||||
|         dimSize = inputs[i+batchCounter].get().size(dimension); | ||||
|         if (isInOutAligned) { | ||||
|           auto t = inputs[i+batchCounter].get(); | ||||
|           // similarly to output stride, we cannot trust stride value to | ||||
|           // determine slice size if the corresponding dimension is 1 | ||||
|           // we have to multiply all the subsequent sizes | ||||
|           int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ? | ||||
|              t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end()); | ||||
|           slice_size *= sizeof(scalar_t); | ||||
|           isInOutAligned &= (slice_size % alignment == 0); | ||||
|         } | ||||
|       } | ||||
|  | ||||
|       catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr()); | ||||
| @ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
| #ifdef USE_ROCM | ||||
|       // On ROCm, CatArrayBatchedCopy_contig is faster | ||||
|       isAligned = false; | ||||
|       isInOutAligned = false; | ||||
| #else | ||||
|       // If at least one of the inputs is not aligned, we can't call the | ||||
|       // CatArrayBatchedCopy_alignedK_contig | ||||
|       isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]); | ||||
|       isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment; | ||||
| #endif | ||||
|  | ||||
|       if (stride_size > 1) { | ||||
| @ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|           catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j]; | ||||
|         } | ||||
|         catMetaData.isContiguous[batchCounter] = false; | ||||
|         isContig = false; | ||||
|       } else { | ||||
|         catMetaData.isContiguous[batchCounter] = true; | ||||
|       } | ||||
| @ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|           max_elements_per_tensor, batchCounter); | ||||
| #else | ||||
|     dim3 applyBlock, catGrid; | ||||
|     if (isInOutAligned) { | ||||
|       std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>( | ||||
|         max_elements_per_tensor, batchCounter); | ||||
|     } else if (isContig && isAligned && sizeof(scalar_t) > 2) { | ||||
|     if (isContig && sizeof(scalar_t) > 2) { | ||||
|       std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>( | ||||
|           max_elements_per_tensor, batchCounter); | ||||
|     } else if (isContig && isAligned && sizeof(scalar_t) == 2) { | ||||
|     } else if (isContig && sizeof(scalar_t) == 2) { | ||||
|       std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>( | ||||
|           max_elements_per_tensor, batchCounter); | ||||
|     } else { | ||||
| @ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|       getCatGrid(batchCounter, catGrid); | ||||
|     } | ||||
| #endif | ||||
|     int32_t trailingSize; | ||||
|     TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam; | ||||
|     if (isInOutAligned) { | ||||
|       // in this case we can and should flatten the tensors after the cat dim | ||||
|       // we want to view the tensors as if consisting of `alignment`-sized elements | ||||
|       // however, we might not be able to cleanly divide just the last dim - | ||||
|       // it might not be the multiple of alignment. | ||||
|       // however, we know that the full concatted slice is multiple of alignment, | ||||
|       // so if we flatten all the dims after and including concat dim, | ||||
|       // it will be divisible by alignment | ||||
|       // then we need to divide last out size by elems_per_vec, | ||||
|       // and divide all strides except last by elems_per_vec (last stride is 1 always) | ||||
|       // for input, we will fix up the sizes and strides in the kernel directly | ||||
|       kernelOutputParam = outputParam; | ||||
|       nDims = dimension + 1; | ||||
|       constexpr auto elems_per_vec = alignment / sizeof(scalar_t); | ||||
|       auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1]; | ||||
|       kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec; | ||||
|       trailingSize = outputParam.tensorStride[dimension]; | ||||
|       kernelOutputParam.tensorStride[dimension] = 1; | ||||
|       for (int i = 0; i < dimension; ++i) { | ||||
|         kernelOutputParam.tensorStride[i] /= elems_per_vec; | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     if (memory_format != c10::MemoryFormat::Contiguous) { | ||||
|       switch (dimension) { | ||||
| @ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|     } | ||||
|     // Template Declarations for dim = 1, 2, 3, 4 | ||||
| #define HANDLE_CASE(DIMS) \ | ||||
|     if (isInOutAligned) {\ | ||||
|       constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \ | ||||
|       CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\ | ||||
|       catGrid, applyBlock, 0, stream.stream()>>>(\ | ||||
|         (char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\ | ||||
|     } else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\ | ||||
|     if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\ | ||||
|       CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\ | ||||
|           catGrid, applyBlock, 0, stream.stream()>>>(\ | ||||
|               data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ | ||||
|  | ||||
| @ -5,20 +5,12 @@ | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| __global__ void weight_int8pack_mm_kernel( | ||||
|     const float* x, | ||||
|     const int8_t* w, | ||||
|     const float* scale, | ||||
|     float* out, | ||||
|     int B, | ||||
|     int K, | ||||
|     int N) { | ||||
| __global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) { | ||||
|   // one thread per output element: [B, N] | ||||
|   int b = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|   int n = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|  | ||||
|   if (b >= B || n >= N) | ||||
|     return; | ||||
|   if (b >= B || n >= N) return; | ||||
|  | ||||
|   float acc = 0.0f; | ||||
|   for (int k = 0; k < K; ++k) { | ||||
| @ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel( | ||||
|   out[b * N + n] = acc * scale[n]; | ||||
| } | ||||
|  | ||||
| void launch_weight_int8pack_mm_cuda_kernel( | ||||
|     const Tensor& x, | ||||
|     const Tensor& w_int8, | ||||
|     const Tensor& scale, | ||||
|     Tensor& out) { | ||||
| void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) { | ||||
|   const int B = x.size(0); | ||||
|   const int K = x.size(1); | ||||
|   const int N = w_int8.size(0); | ||||
| @ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel( | ||||
|       w_int8.data_ptr<int8_t>(), | ||||
|       scale.data_ptr<float>(), | ||||
|       out.data_ptr<float>(), | ||||
|       B, | ||||
|       K, | ||||
|       N); | ||||
|       B, K, N); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Main GPU entry point | ||||
| at::Tensor _weight_int8pack_mm_cuda( | ||||
|     const at::Tensor& x, | ||||
|     const at::Tensor& w_int8, | ||||
|     const at::Tensor& scale) { | ||||
| at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) { | ||||
|   // --- Check inputs --- | ||||
|   TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor"); | ||||
|   TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor"); | ||||
| @ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda( | ||||
|   TORCH_CHECK(w_int8.dim() == 2, "w must be 2D"); | ||||
|   TORCH_CHECK(scale.dim() == 1, "scale must be 1D"); | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       x.size(1) == w_int8.size(1), | ||||
|       "K dimension mismatch: x.size(1) != w.size(1)"); | ||||
|   TORCH_CHECK( | ||||
|       w_int8.size(0) == scale.size(0), | ||||
|       "Output dim mismatch: w.size(0) != scale.size(0)"); | ||||
|   TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)"); | ||||
|   TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)"); | ||||
|  | ||||
|   // --- Determine shapes --- | ||||
|   auto B = x.size(0); // batch size | ||||
|   auto N = w_int8.size(0); // output dim | ||||
|   auto B = x.size(0);  // batch size | ||||
|   auto N = w_int8.size(0);  // output dim | ||||
|  | ||||
|   // Ensure inputs are in the correct types for the kernel | ||||
|   auto x_f32 = x.to(at::kFloat); | ||||
| @ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda( | ||||
|   auto scale_f32 = scale.to(at::kFloat); | ||||
|  | ||||
|   // --- Allocate output --- | ||||
|   auto out = at::empty({B, N}, x_f32.options()); | ||||
|   auto out = at::empty({B, N}, x.options().dtype(at::kFloat)); | ||||
|  | ||||
|   // --- Launch kernel --- | ||||
|   launch_weight_int8pack_mm_cuda_kernel( | ||||
|       x_f32, w_int8_contiguous, scale_f32, out); | ||||
|   launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out); | ||||
|  | ||||
|   return out.to(x.dtype()); | ||||
|   return out; | ||||
| } | ||||
|  | ||||
| } // namespace at::native | ||||
|  | ||||
| @ -482,9 +482,7 @@ auto build_graph( | ||||
|   auto scaled_dot_product_flash_attention_options = | ||||
|       fe::graph::SDPA_attributes() | ||||
|           .set_name("CUDNN_SDPA") | ||||
|           .set_is_inference(return_softmaxstats == false) | ||||
|           // TODO(eqy): switch to this API once cuDNN FE is upgraded | ||||
|           // .set_generate_stats(return_softmaxstats) | ||||
|           .set_generate_stats(return_softmaxstats) | ||||
|           .set_causal_mask(is_causal) | ||||
|           .set_attn_scale(attn_scale); | ||||
|   if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { | ||||
| @ -704,9 +702,7 @@ auto build_graph_nestedtensor( | ||||
|   auto scaled_dot_product_flash_attention_options = | ||||
|       fe::graph::SDPA_attributes() | ||||
|           .set_name("CUDNN_SDPA_NESTEDTENSOR") | ||||
|           .set_is_inference(return_softmaxstats == false) | ||||
|           // TODO(eqy): switch to this API once cuDNN FE is upgraded | ||||
|           // .set_generate_stats(return_softmaxstats) | ||||
|           .set_generate_stats(return_softmaxstats) | ||||
|           .set_causal_mask(is_causal) | ||||
|           .set_attn_scale(attn_scale) | ||||
|           .set_seq_len_q(SEQ_LEN_Q_) | ||||
|  | ||||
| @ -7,7 +7,6 @@ | ||||
| #include <ATen/NativeFunctions.h> | ||||
| #else | ||||
| #include <ATen/ops/empty.h> | ||||
| #include <ATen/ops/empty_like.h> | ||||
| #include <ATen/ops/miopen_batch_norm_native.h> | ||||
| #include <ATen/ops/miopen_batch_norm_backward_native.h> | ||||
| #endif | ||||
| @ -103,7 +102,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm( | ||||
|     mode = miopenBNSpatial; | ||||
|   } | ||||
|  | ||||
|   auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format()); | ||||
|   auto output_t = at::empty(input->sizes(), input->options()); | ||||
|   TensorArg output{ output_t, "output", 0 }; | ||||
|  | ||||
|   auto handle = getMiopenHandle(); | ||||
| @ -171,15 +170,20 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward( | ||||
|     const std::optional<Tensor>& save_var_t_opt, | ||||
|     double epsilon) { | ||||
|   // See [Note: hacky wrapper removal for optional tensor] | ||||
|   const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor()); | ||||
|   const Tensor& save_var_t = save_var_t_opt.value_or(Tensor()); | ||||
|   const Tensor& running_mean = | ||||
|       running_mean_opt.value_or(Tensor()); | ||||
|   const Tensor& running_var = | ||||
|       running_var_opt.value_or(Tensor()); | ||||
|   const Tensor& save_mean_t = | ||||
|       save_mean_t_opt.value_or(Tensor()); | ||||
|   const Tensor& save_var_t = | ||||
|       save_var_t_opt.value_or(Tensor()); | ||||
|  | ||||
|   auto grad_output_contig = | ||||
|       grad_output_t.contiguous(input_t.suggest_memory_format()); | ||||
|   TensorArg input{input_t, "input", 1}, | ||||
|       grad_output{grad_output_contig, "grad_output", 2}, | ||||
|       weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4}, | ||||
|       save_var{save_var_t, "save_var", 5}; | ||||
|   TensorArg input{ input_t, "input", 1 }, | ||||
|             grad_output{ grad_output_t, "grad_output", 2 }, | ||||
|             weight{ weight_t, "weight", 3 }, | ||||
|             save_mean{ save_mean_t, "save_mean", 4 }, | ||||
|             save_var{ save_var_t, "save_var", 5 }; | ||||
|   CheckedFrom c = "miopen_batch_norm_backward"; | ||||
|  | ||||
|   checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); | ||||
| @ -191,11 +195,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward( | ||||
|   } | ||||
|   checkAllSameType(c, {input, grad_output}); | ||||
|   checkAllSameType(c, {weight, save_mean, save_var}); | ||||
|   // TODO: is weight required to be contiguous? | ||||
|   checkAllContiguous(c, {save_mean, save_var}); | ||||
|   // TODO: TensorArg check should start handle memory format | ||||
|   TORCH_CHECK(input->is_contiguous(input->suggest_memory_format())); | ||||
|   TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format())); | ||||
|   checkAllContiguous(c, {input, grad_output, save_mean, save_var}); | ||||
|   checkDimRange(c, input, 2, 6 /* exclusive */); | ||||
|   checkSameSize(c, input, grad_output); | ||||
|   auto num_features = input->size(1); | ||||
| @ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward( | ||||
|     mode = miopenBNSpatial; | ||||
|   } | ||||
|  | ||||
|   auto grad_input_t  = at::empty(input->sizes(), input->options(), input->suggest_memory_format()); | ||||
|   auto grad_input_t  = at::empty(input->sizes(), input->options()); | ||||
|   auto grad_weight_t = at::empty(weight->sizes(), weight->options()); | ||||
|   auto grad_bias_t   = at::empty(weight->sizes(), weight->options()); | ||||
|  | ||||
|  | ||||
| @ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back | ||||
| // fusions | ||||
| // --------------------------------------------------------------------- | ||||
|  | ||||
| void raw_miopen_convolution_add_relu_out( | ||||
| void raw_miopen_convolution_relu_out( | ||||
|     const Tensor& output, | ||||
|     const Tensor& input, | ||||
|     const Tensor& weight, | ||||
|     const Tensor& z, | ||||
|     float alpha, | ||||
|     const Tensor& bias, | ||||
|     IntArrayRef stride, | ||||
|     IntArrayRef padding, | ||||
| @ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out( | ||||
|     int64_t groups, | ||||
|     bool benchmark, | ||||
|     bool deterministic) { | ||||
|   raw_miopen_convolution_forward_out( | ||||
|       output, | ||||
|   auto dataType = getMiopenDataType(input); | ||||
|   miopenConvolutionMode_t c_mode = miopenConvolution; | ||||
|   ConvolutionArgs args{ input, output, weight }; | ||||
|   args.handle = getMiopenHandle(); | ||||
|   at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); | ||||
|   setConvolutionParams( | ||||
|       &args.params, | ||||
|       args.handle, | ||||
|       input, | ||||
|       weight, | ||||
|       padding, | ||||
|       stride, | ||||
|       dilation, | ||||
|       groups, | ||||
|       deterministic, | ||||
|       memory_format); | ||||
|   args.idesc.set(input, memory_format); | ||||
|   args.wdesc.set(weight, memory_format, 0); | ||||
|   args.odesc.set(output, memory_format); | ||||
|   args.cdesc.set( | ||||
|       dataType, | ||||
|       c_mode, | ||||
|       input.dim() - 2, | ||||
|       args.params.padding, | ||||
|       args.params.stride, | ||||
|       args.params.dilation, | ||||
|       args.params.groups, | ||||
|       benchmark, | ||||
|       deterministic); | ||||
|   at::Tensor alpha_mul_z_add_bias = | ||||
|       at::native::reshape_bias(input.dim(), bias).add(z, alpha); | ||||
|   output.add_(alpha_mul_z_add_bias); | ||||
|   output.relu_(); | ||||
|  | ||||
|   TensorDescriptor bdesc; | ||||
|   bdesc.set(bias.expand({1, bias.size(0)}), output.dim()); | ||||
|  | ||||
|   // Create the fusion plan | ||||
|   miopenFusionPlanDescriptor_t fusePlanDesc; | ||||
|   miopenFusionOpDescriptor_t convoOp; | ||||
|   miopenFusionOpDescriptor_t biasOp; | ||||
|   miopenFusionOpDescriptor_t activOp; | ||||
|   MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc())); | ||||
|   MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc())); | ||||
|   MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc())); | ||||
|   MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU)); | ||||
|  | ||||
|   // compile fusion plan | ||||
|   MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc)); | ||||
|  | ||||
|   // Set the Args | ||||
|   float alpha = static_cast<float>(1); | ||||
|   float beta = static_cast<float>(0); | ||||
|   float activ_alpha = static_cast<float>(0); | ||||
|   float activ_beta = static_cast<float>(0); | ||||
|   float activ_gamma = static_cast<float>(0); | ||||
|   miopenOperatorArgs_t fusionArgs; | ||||
|   MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs)); | ||||
|   MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr())); | ||||
|   MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr())); | ||||
|   MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma)); | ||||
|  | ||||
|   miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs); | ||||
|  | ||||
|   // Cleanup | ||||
|   miopenDestroyFusionPlan(fusePlanDesc); | ||||
| } | ||||
|  | ||||
| static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) { | ||||
| @ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m | ||||
| Tensor miopen_convolution_add_relu( | ||||
|     const Tensor& input_t, | ||||
|     const Tensor& weight_t, | ||||
|     const Tensor& z_t, | ||||
|     const Tensor& z, | ||||
|     const std::optional<Scalar>& alpha, | ||||
|     const std::optional<Tensor>& bias_t, | ||||
|     const std::optional<Tensor>& bias, | ||||
|     IntArrayRef stride, | ||||
|     IntArrayRef padding, | ||||
|     IntArrayRef dilation, | ||||
|     int64_t groups) { | ||||
|   auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); | ||||
|   const Tensor input = input_t.contiguous(memory_format); | ||||
|   const Tensor weight = weight_t.contiguous(memory_format); | ||||
|   Tensor z = z_t; | ||||
|   if (z.suggest_memory_format() != memory_format) { | ||||
|     z = z.to(memory_format); | ||||
|   } | ||||
|   z = z.contiguous(memory_format); | ||||
|  | ||||
|   // FuseFrozenConvAddRelu performs some tensor shape checking | ||||
|   Tensor output_t = at::detail::empty_cuda( | ||||
|       conv_output_size( | ||||
|           input.sizes(), weight.sizes(), padding, stride, dilation), | ||||
|       input.options().memory_format(memory_format)); | ||||
|   if (output_t.numel() == 0) { | ||||
|     return output_t; | ||||
|   } | ||||
|   // MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function: | ||||
|   // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) | ||||
|  | ||||
|   auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); | ||||
|  | ||||
|   auto& ctx = at::globalContext(); | ||||
|   bool benchmark = ctx.benchmarkCuDNN(); | ||||
|   auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0; | ||||
|   auto _bias = bias_t.has_value() | ||||
|       ? bias_t.value() | ||||
|       : at::zeros( | ||||
|             {output_t.size(1)}, | ||||
|             optTypeMetaToScalarType(output_t.options().dtype_opt()), | ||||
|             output_t.options().layout_opt(), | ||||
|             output_t.options().device_opt(), | ||||
|             output_t.options().pinned_memory_opt()); | ||||
|  | ||||
|   raw_miopen_convolution_add_relu_out( | ||||
|       output_t, | ||||
|   TensorArg input  { input_t,  "input",  1 }, | ||||
|             weight { weight_t, "weight", 2 }; | ||||
|  | ||||
|   Tensor output_t = at::detail::empty_cuda( | ||||
|       conv_output_size( | ||||
|         input_t.sizes(), weight_t.sizes(), padding, stride, dilation), | ||||
|       input_t.options().memory_format(memory_format)); | ||||
|   if (output_t.numel() == 0){ | ||||
|     return output_t; | ||||
|   } | ||||
|   // Avoid ambiguity of "output" when this is being used as backwards | ||||
|   TensorArg output{output_t, "result", 0}; | ||||
|   miopen_convolution_forward_out( | ||||
|       output, | ||||
|       "miopen_convolution_add_relu", | ||||
|       input, | ||||
|       weight, | ||||
|       z, | ||||
|       _alpha, | ||||
|       _bias, | ||||
|       stride, | ||||
|       padding, | ||||
|       stride, | ||||
|       dilation, | ||||
|       groups, | ||||
|       benchmark, | ||||
|       true); // deterministic | ||||
|       false // deterministic | ||||
|   ); | ||||
|  | ||||
|   return output_t; | ||||
|   auto contig_output_t = self_or_new_memory_format(output_t, memory_format); | ||||
|  | ||||
|   if (!output_t.is_same(contig_output_t)) { | ||||
|     contig_output_t.copy_(output_t); | ||||
|   } | ||||
|  | ||||
|   auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0; | ||||
|   auto _bias = bias.has_value() | ||||
|           ? bias.value() | ||||
|           : at::zeros( | ||||
|                 {contig_output_t.size(1)}, | ||||
|                 optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), | ||||
|                 contig_output_t.options().layout_opt(), | ||||
|                 contig_output_t.options().device_opt(), | ||||
|                 contig_output_t.options().pinned_memory_opt()); | ||||
|  | ||||
|   at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha); | ||||
|   contig_output_t.add_(alpha_mul_z_add_bias); | ||||
|   contig_output_t.relu_(); | ||||
|  | ||||
|   return contig_output_t; | ||||
| } | ||||
|  | ||||
| Tensor miopen_convolution_relu( | ||||
|     const Tensor& input_t, | ||||
|     const Tensor& weight_t, | ||||
|     const std::optional<Tensor>& bias_t, | ||||
|     const std::optional<Tensor>& bias, | ||||
|     IntArrayRef stride, | ||||
|     IntArrayRef padding, | ||||
|     IntArrayRef dilation, | ||||
|     int64_t groups) { | ||||
|   auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); | ||||
|   const Tensor input = input_t.contiguous(memory_format); | ||||
|   const Tensor weight = weight_t.contiguous(memory_format); | ||||
|  | ||||
|   // FuseFrozenConvAddRelu performs some tensor shape checking | ||||
|   Tensor output_t = at::detail::empty_cuda( | ||||
|       conv_output_size( | ||||
|           input.sizes(), weight.sizes(), padding, stride, dilation), | ||||
|       input.options().memory_format(memory_format)); | ||||
|   if (output_t.numel() == 0) { | ||||
|     return output_t; | ||||
|   } | ||||
|  | ||||
|   auto& ctx = at::globalContext(); | ||||
|   bool benchmark = ctx.benchmarkCuDNN(); | ||||
|   auto _bias = bias_t.has_value() | ||||
|       ? bias_t.value() | ||||
|       : at::zeros( | ||||
|             {output_t.size(1)}, | ||||
|             optTypeMetaToScalarType(output_t.options().dtype_opt()), | ||||
|             output_t.options().layout_opt(), | ||||
|             output_t.options().device_opt(), | ||||
|             output_t.options().pinned_memory_opt()); | ||||
|  | ||||
|   raw_miopen_convolution_add_relu_out( | ||||
|       output_t, | ||||
|       input, | ||||
|       weight, | ||||
|       output_t, // use output_t as z to satisfy MIOpen API | ||||
|       0, // alpha | ||||
|       _bias, | ||||
|       stride, | ||||
|       padding, | ||||
|       dilation, | ||||
|       groups, | ||||
|       benchmark, // benchmark | ||||
|       true); // deterministic | ||||
|   // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d | ||||
|   if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous | ||||
|           && input_t.scalar_type() == at::kFloat | ||||
|           && input_t.ndimension() == 4) { | ||||
|  | ||||
|   return output_t; | ||||
|     // FuseFrozenConvAddRelu performs some tensor shape checking | ||||
|     Tensor output_t = at::detail::empty_cuda( | ||||
|         conv_output_size( | ||||
|             input_t.sizes(), weight_t.sizes(), padding, stride, dilation), | ||||
|         input_t.options().memory_format(input_t.suggest_memory_format())); | ||||
|     if (output_t.numel() == 0) { | ||||
|       return output_t; | ||||
|     } | ||||
|  | ||||
|     auto _bias = bias.has_value() | ||||
|             ? bias.value() | ||||
|             : at::zeros( | ||||
|                   {output_t.size(1)}, | ||||
|                   optTypeMetaToScalarType(output_t.options().dtype_opt()), | ||||
|                   output_t.options().layout_opt(), | ||||
|                   output_t.options().device_opt(), | ||||
|                   output_t.options().pinned_memory_opt()); | ||||
|  | ||||
|     raw_miopen_convolution_relu_out( | ||||
|         output_t, | ||||
|         input_t, | ||||
|         weight_t, | ||||
|         _bias, | ||||
|         stride, | ||||
|         padding, | ||||
|         dilation, | ||||
|         groups, | ||||
|         benchmark, // benchmark | ||||
|         false // deterministic | ||||
|     ); | ||||
|  | ||||
|     return output_t; | ||||
|   } | ||||
|   else { | ||||
|     // fallback | ||||
|  | ||||
|     auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); | ||||
|  | ||||
|     TensorArg input  { input_t,  "input",  1 }, | ||||
|               weight { weight_t, "weight", 2 }; | ||||
|  | ||||
|     Tensor output_t = at::detail::empty_cuda( | ||||
|         conv_output_size( | ||||
|           input_t.sizes(), weight_t.sizes(), padding, stride, dilation), | ||||
|         input->options().memory_format(memory_format)); | ||||
|     if (output_t.numel() == 0){ | ||||
|       return output_t; | ||||
|     } | ||||
|     // Avoid ambiguity of "output" when this is being used as backwards | ||||
|     TensorArg output{output_t, "result", 0}; | ||||
|     miopen_convolution_forward_out( | ||||
|         output, | ||||
|         "miopen_convolution_relu", | ||||
|         input, | ||||
|         weight, | ||||
|         padding, | ||||
|         stride, | ||||
|         dilation, | ||||
|         groups, | ||||
|         benchmark, | ||||
|         false // deterministic | ||||
|     ); | ||||
|  | ||||
|     auto contig_output_t = self_or_new_memory_format(output_t, memory_format); | ||||
|  | ||||
|     if (!output_t.is_same(contig_output_t)) { | ||||
|       contig_output_t.copy_(output_t); | ||||
|     } | ||||
|  | ||||
|     auto _bias = bias.has_value() | ||||
|             ? bias.value() | ||||
|             : at::zeros( | ||||
|                   {contig_output_t.size(1)}, | ||||
|                   optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), | ||||
|                   contig_output_t.options().layout_opt(), | ||||
|                   contig_output_t.options().device_opt(), | ||||
|                   contig_output_t.options().pinned_memory_opt()); | ||||
|  | ||||
|     at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias); | ||||
|     contig_output_t.add_(reshaped_bias); | ||||
|     contig_output_t.relu_(); | ||||
|  | ||||
|     return contig_output_t; | ||||
|   } | ||||
| } | ||||
|  | ||||
| REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward) | ||||
|  | ||||
							
								
								
									
										48
									
								
								aten/src/ATen/native/mps/MPSGraphSonomaOps.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								aten/src/ATen/native/mps/MPSGraphSonomaOps.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,48 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h> | ||||
|  | ||||
| #if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) | ||||
|  | ||||
| // Fallback declaration of the FFT scaling-mode enum. It is only compiled when | ||||
| // the enclosing preprocessor guard finds no macOS 14 SDK definitions. | ||||
| // NOTE(review): the numeric values presumably mirror the macOS 14 SDK enum - confirm. | ||||
| typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) { | ||||
|   MPSGraphFFTScalingModeNone = 0L, | ||||
|   MPSGraphFFTScalingModeSize = 1L, | ||||
|   MPSGraphFFTScalingModeUnitary = 2L, | ||||
| }; | ||||
|  | ||||
| // Stand-in interface for the FFT descriptor, used when the SDK guard above is | ||||
| // active. Only the declaration lives here; no @implementation is provided in | ||||
| // this header. The @compatibility_alias that follows maps the name | ||||
| // MPSGraphFFTDescriptor onto this class. | ||||
| @interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying> | ||||
| @property(readwrite, nonatomic) BOOL inverse; | ||||
| @property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode; | ||||
| @property(readwrite, nonatomic) BOOL roundToOddHermitean; | ||||
| // Factory method; declared nullable, so callers should handle a nil result. | ||||
| + (nullable instancetype)descriptor; | ||||
| @end | ||||
|  | ||||
| @compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor; | ||||
|  | ||||
| // Declaration-only category on MPSGraph covering complex-number helpers | ||||
| // (conjugate, real part) and the FFT family (complex FFT, real-to-Hermitean, | ||||
| // Hermitean-to-real). No implementations are supplied in this header. | ||||
| // NOTE(review): presumably the selectors resolve against the OS framework at | ||||
| // runtime, so callers must gate on the OS version - verify against call sites. | ||||
| @interface MPSGraph (SonomaOps) | ||||
| - (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                                       axes:(NSArray<NSNumber*>* _Nonnull)axes | ||||
|                                                 descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor | ||||
|                                                       name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                                     axes:(NSArray<NSNumber*>* _Nonnull)axes | ||||
|                                               descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor | ||||
|                                                     name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                                     axes:(NSArray<NSNumber*>* _Nonnull)axes | ||||
|                                               descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor | ||||
|                                                     name:(NSString* _Nullable)name; | ||||
| @end | ||||
|  | ||||
| // Define the BFloat16 MPSDataType for SDKs older than macOS 14 (e.g. macOS 13) | ||||
| #define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16)) | ||||
|  | ||||
| // Define the Metal 3.1 language-version constant for SDKs that do not provide it | ||||
| #define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1)) | ||||
| #endif | ||||
							
								
								
									
										196
									
								
								aten/src/ATen/native/mps/MPSGraphVenturaOps.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										196
									
								
								aten/src/ATen/native/mps/MPSGraphVenturaOps.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,196 @@ | ||||
| #pragma once | ||||
| #include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h> | ||||
|  | ||||
| // TODO: Remove me once the minimum supported SDK is macOS 13.2 | ||||
| #if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2)) | ||||
|  | ||||
| // Stand-in interface for the 3D convolution descriptor, compiled only when the | ||||
| // enclosing guard finds no macOS 13.2 SDK definitions. It declares properties | ||||
| // only; no @implementation is provided in this header. The @compatibility_alias | ||||
| // after this block maps MPSGraphConvolution3DOpDescriptor onto this class. | ||||
| @interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying> | ||||
|  | ||||
| // Stride and dilation, one value per spatial axis (X, Y, Z). | ||||
| @property(readwrite, nonatomic) NSUInteger strideInX; | ||||
| @property(readwrite, nonatomic) NSUInteger strideInY; | ||||
| @property(readwrite, nonatomic) NSUInteger strideInZ; | ||||
| @property(readwrite, nonatomic) NSUInteger dilationRateInX; | ||||
| @property(readwrite, nonatomic) NSUInteger dilationRateInY; | ||||
| @property(readwrite, nonatomic) NSUInteger dilationRateInZ; | ||||
|  | ||||
| // Explicit padding amounts for each of the six faces of the volume. | ||||
| @property(readwrite, nonatomic) NSUInteger paddingLeft; | ||||
| @property(readwrite, nonatomic) NSUInteger paddingRight; | ||||
| @property(readwrite, nonatomic) NSUInteger paddingTop; | ||||
| @property(readwrite, nonatomic) NSUInteger paddingBottom; | ||||
| @property(readwrite, nonatomic) NSUInteger paddingFront; | ||||
| @property(readwrite, nonatomic) NSUInteger paddingBack; | ||||
|  | ||||
| // Padding policy and the named layouts of the data and weights tensors. | ||||
| @property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle; | ||||
| @property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout; | ||||
| @property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout; | ||||
|  | ||||
| @property(readwrite, nonatomic) NSUInteger groups; | ||||
|  | ||||
| @end | ||||
|  | ||||
| @compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor; | ||||
|  | ||||
| #endif | ||||
|  | ||||
| // Declaration-only category on MPSGraph. It declares selectors for 3D | ||||
| // convolution (forward, data gradient, weights gradient), cumulative sum, | ||||
| // sort/argSort, matrix inverse, nearest/bilinear resize and their gradients, | ||||
| // grid sampling and truncation. No implementations are supplied in this header. | ||||
| // NOTE(review): presumably the selectors resolve against the OS framework at | ||||
| // runtime, so callers must gate on the OS version - verify against call sites. | ||||
| @interface MPSGraph (VenturaOps) | ||||
|  | ||||
| // The guarded section below is nested inside the @interface. It supplies the | ||||
| // nearest-rounding-mode enum and the complex data-type macros when no macOS | ||||
| // 13.0 SDK definitions are detected. | ||||
| #if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0)) | ||||
|  | ||||
| typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) { | ||||
|   MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L, | ||||
|   MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L, | ||||
|   MPSGraphResizeNearestRoundingModeCeil = 2L, | ||||
|   MPSGraphResizeNearestRoundingModeFloor = 3L, | ||||
|   MPSGraphResizeNearestRoundingModeRoundToEven = 4L, | ||||
|   MPSGraphResizeNearestRoundingModeRoundToOdd = 5L, | ||||
| }; | ||||
|  | ||||
| // Define complex enums for MacOS 12 | ||||
| #define MPSDataTypeComplexBit 0x01000000 | ||||
| #define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64)) | ||||
| #define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32)) | ||||
| #endif | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source | ||||
|                                             weightsTensor:(MPSGraphTensor* _Nonnull)weights | ||||
|                                                descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor | ||||
|                                                      name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull) | ||||
|     convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient | ||||
|                                           weightsTensor:(MPSGraphTensor* _Nonnull)weights | ||||
|                                             outputShape:(MPSShape* _Nonnull)outputShape | ||||
|                            forwardConvolutionDescriptor: | ||||
|                                (MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor | ||||
|                                                    name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull) | ||||
|     convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient | ||||
|                                               sourceTensor:(MPSGraphTensor* _Nonnull)source | ||||
|                                                outputShape:(MPSShape* _Nonnull)outputShape | ||||
|                               forwardConvolutionDescriptor: | ||||
|                                   (MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor | ||||
|                                                       name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                                axis:(NSInteger)axis | ||||
|                                                name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                       axis:(NSInteger)axis | ||||
|                                       name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                       axis:(NSInteger)axis | ||||
|                                 descending:(BOOL)descending | ||||
|                                       name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                 axisTensor:(MPSGraphTensor* _Nonnull)axisTensor | ||||
|                                 descending:(BOOL)descending | ||||
|                                       name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                 axisTensor:(MPSGraphTensor* _Nonnull)axisTensor | ||||
|                                       name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                          axis:(NSInteger)axis | ||||
|                                          name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                          axis:(NSInteger)axis | ||||
|                                    descending:(BOOL)descending | ||||
|                                          name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                    axisTensor:(MPSGraphTensor* _Nonnull)axisTensor | ||||
|                                    descending:(BOOL)descending | ||||
|                                          name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor | ||||
|                                    axisTensor:(MPSGraphTensor* _Nonnull)axisTensor | ||||
|                                          name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor | ||||
|                                          sizeTensor:(MPSGraphTensor* _Nonnull)size | ||||
|                                 nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode | ||||
|                                        centerResult:(BOOL)centerResult | ||||
|                                        alignCorners:(BOOL)alignCorners | ||||
|                                              layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor | ||||
|                                          sizeTensor:(MPSGraphTensor* _Nonnull)size | ||||
|                                   scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset | ||||
|                                 nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode | ||||
|                                              layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor | ||||
|                                           sizeTensor:(MPSGraphTensor* _Nonnull)size | ||||
|                                         centerResult:(BOOL)centerResult | ||||
|                                         alignCorners:(BOOL)alignCorners | ||||
|                                               layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                 name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor | ||||
|                                           sizeTensor:(MPSGraphTensor* _Nonnull)size | ||||
|                                    scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset | ||||
|                                               layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                 name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient | ||||
|                                                       input:(MPSGraphTensor* _Nonnull)input | ||||
|                                         nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode | ||||
|                                                centerResult:(BOOL)centerResult | ||||
|                                                alignCorners:(BOOL)alignCorners | ||||
|                                                      layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                        name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient | ||||
|                                                       input:(MPSGraphTensor* _Nonnull)input | ||||
|                                           scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset | ||||
|                                         nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode | ||||
|                                                      layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                        name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient | ||||
|                                                        input:(MPSGraphTensor* _Nonnull)input | ||||
|                                                 centerResult:(BOOL)centerResult | ||||
|                                                 alignCorners:(BOOL)alignCorners | ||||
|                                                       layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                         name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient | ||||
|                                                        input:(MPSGraphTensor* _Nonnull)input | ||||
|                                            scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset | ||||
|                                                       layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                                         name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source | ||||
|                                       coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates | ||||
|                                                 layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                   normalizeCoordinates:(BOOL)normalizeCoordinates | ||||
|                                    relativeCoordinates:(BOOL)relativeCoordinates | ||||
|                                           alignCorners:(BOOL)alignCorners | ||||
|                                            paddingMode:(MPSGraphPaddingMode)paddingMode | ||||
|                                           samplingMode:(MPSGraphResizeMode)samplingMode | ||||
|                                          constantValue:(double)constantValue | ||||
|                                                   name:(NSString* _Nullable)name; | ||||
|  | ||||
| - (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source | ||||
|                                       coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates | ||||
|                                                 layout:(MPSGraphTensorNamedDataLayout)layout | ||||
|                                   normalizeCoordinates:(BOOL)normalizeCoordinates | ||||
|                                    relativeCoordinates:(BOOL)relativeCoordinates | ||||
|                                           alignCorners:(BOOL)alignCorners | ||||
|                                            paddingMode:(MPSGraphPaddingMode)paddingMode | ||||
|                                    nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode | ||||
|                                          constantValue:(double)constantValue | ||||
|                                                   name:(NSString* _Nullable)name; | ||||
| - (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name; | ||||
|  | ||||
| @end | ||||
| @ -9,6 +9,8 @@ | ||||
| #include <ATen/mps/MPSAllocatorInterface.h> | ||||
| #include <ATen/mps/MPSProfiler.h> | ||||
| #include <ATen/native/mps/MPSGraphSequoiaOps.h> | ||||
| #include <ATen/native/mps/MPSGraphSonomaOps.h> | ||||
| #include <ATen/native/mps/MPSGraphVenturaOps.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <fmt/format.h> | ||||
| #include <fmt/ranges.h> | ||||
| @ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, | ||||
|     MPSShape* mpsStrides = getMPSShape(_tensor.strides()); | ||||
|     check_mps_shape(mpsShape); | ||||
|  | ||||
|     auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset(); | ||||
|     auto storage_numel = src.storage().nbytes() / src.element_size(); | ||||
|     TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(), | ||||
|                 "MPSGaph does not support tensor dims larger than INT_MAX"); | ||||
|     MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType | ||||
|  | ||||
| @ -1,25 +0,0 @@ | ||||
| #pragma once | ||||
| #include <c10/metal/common.h> | ||||
|  | ||||
| // This header is shared between shader and host code. When compiled as Metal | ||||
| // (__METAL__ defined) the mode enum is declared locally; otherwise the existing | ||||
| // at::native::EmbeddingBagMode from ATen is reused. | ||||
| // NOTE(review): the enumerator values here presumably have to match the ATen | ||||
| // definition because the mode crosses the host/shader boundary - confirm. | ||||
| #ifdef __METAL__ | ||||
| enum class EmbeddingBagMode { SUM = 0, MEAN, MAX }; | ||||
| #else | ||||
| #include <ATen/native/EmbeddingBag.h> | ||||
| using at::native::EmbeddingBagMode; | ||||
| #endif | ||||
|  | ||||
| // Parameter block passed to the embedding_bag kernel. | ||||
| // idx_type_t is the integer type used for strides and sizes (uint32_t by default). | ||||
| template <typename idx_type_t = uint32_t> | ||||
| struct EmbeddingBagParams { | ||||
|   // Two-element strides. The kernel multiplies element [0] by the row/bag index | ||||
|   // and element [1] by the feature index for weight and output. | ||||
|   ::c10::metal::array<idx_type_t, 2> weight_strides; | ||||
|   ::c10::metal::array<idx_type_t, 2> output_strides; | ||||
|   ::c10::metal::array<idx_type_t, 2> max_indices_strides; | ||||
|  | ||||
|   // Stride of per_sample_weights. The SUM reduction treats a value of 0 as | ||||
|   // "no per-sample weights". | ||||
|   idx_type_t per_sample_weights_strides; | ||||
|  | ||||
|   idx_type_t num_indices; | ||||
|   idx_type_t num_bags; | ||||
|   idx_type_t feature_size; | ||||
|  | ||||
|   // Reduction mode the kernel dispatches on. | ||||
|   EmbeddingBagMode mode; | ||||
|   // Entries of indices equal to this value are excluded from the reduction. | ||||
|   int64_t padding_idx; | ||||
| }; | ||||
| @ -1,212 +0,0 @@ | ||||
| #include <ATen/native/mps/kernels/EmbeddingBag.h> | ||||
| #include <c10/metal/utils.h> | ||||
| #include <metal_array> | ||||
| #include <metal_stdlib> | ||||
|  | ||||
| using namespace metal; | ||||
| using namespace c10::metal; | ||||
|  | ||||
| // Starting value of the per-bag accumulator. Every mode starts from zero | ||||
| // except MAX, which is specialized below. | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct ReductionOpInit { | ||||
|   inline opmath_t<T> operator()() { | ||||
|     return static_cast<opmath_t<T>>(0); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // MAX starts from negative infinity so that any weight value replaces it. | ||||
| template <typename T> | ||||
| struct ReductionOpInit<EmbeddingBagMode::MAX, T> { | ||||
|   inline opmath_t<T> operator()() { | ||||
|     const opmath_t<T> lowest = static_cast<opmath_t<T>>(-INFINITY); | ||||
|     return lowest; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // Single accumulation step of the per-bag reduction. The primary template | ||||
| // only declares the call operator; each mode provides its own definition. | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct ReductionOp { | ||||
|   inline opmath_t<T> operator()( | ||||
|       T weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       uint32_t per_sample_weights_index, | ||||
|       constant T* per_sample_weights, | ||||
|       uint32_t per_sample_weights_strides); | ||||
| }; | ||||
|  | ||||
| // SUM: adds the weight value to the accumulator, scaled by the matching | ||||
| // per-sample weight when a non-zero stride is supplied. | ||||
| template <typename T> | ||||
| struct ReductionOp<EmbeddingBagMode::SUM, T> { | ||||
|   inline opmath_t<T> operator()( | ||||
|       T weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       uint32_t per_sample_weights_index, | ||||
|       constant T* per_sample_weights, | ||||
|       uint32_t per_sample_weights_strides) { | ||||
|     // A zero stride means there are no per-sample weights to apply. | ||||
|     if (per_sample_weights_strides == 0) { | ||||
|       return static_cast<opmath_t<T>>(weight_val) + out_val; | ||||
|     } | ||||
|     const T scale = per_sample_weights | ||||
|         [per_sample_weights_strides * per_sample_weights_index]; | ||||
|     return static_cast<opmath_t<T>>(scale) * | ||||
|         static_cast<opmath_t<T>>(weight_val) + | ||||
|         out_val; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // MEAN: plain running sum; the division happens in ReductionOpFinal. | ||||
| template <typename T> | ||||
| struct ReductionOp<EmbeddingBagMode::MEAN, T> { | ||||
|   inline opmath_t<T> operator()( | ||||
|       T weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       uint32_t, | ||||
|       constant T*, | ||||
|       uint32_t) { | ||||
|     const opmath_t<T> widened = static_cast<opmath_t<T>>(weight_val); | ||||
|     return widened + out_val; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // MAX: keeps the larger of the accumulator and the new weight value. | ||||
| template <typename T> | ||||
| struct ReductionOp<EmbeddingBagMode::MAX, T> { | ||||
|   inline opmath_t<T> operator()( | ||||
|       T weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       uint32_t, | ||||
|       constant T*, | ||||
|       uint32_t) { | ||||
|     const opmath_t<T> widened = static_cast<opmath_t<T>>(weight_val); | ||||
|     return max(widened, out_val); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // Converts the accumulator into the stored output value. By default this is | ||||
| // a plain cast back to T; the element count is ignored. | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct ReductionOpFinal { | ||||
|   inline T operator()(opmath_t<T> val, uint32_t) { | ||||
|     return static_cast<T>(val); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // MEAN: divides the sum by the number of counted entries; an empty bag gives 0. | ||||
| template <typename T> | ||||
| struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> { | ||||
|   inline T operator()(opmath_t<T> val, uint32_t count) { | ||||
|     if (count == 0) { | ||||
|       return static_cast<T>(0); | ||||
|     } | ||||
|     return static_cast<T>(val / count); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // MAX: an empty bag gives 0 rather than the negative-infinity seed. | ||||
| template <typename T> | ||||
| struct ReductionOpFinal<EmbeddingBagMode::MAX, T> { | ||||
|   inline T operator()(opmath_t<T> val, uint32_t count) { | ||||
|     if (count == 0) { | ||||
|       return static_cast<T>(0); | ||||
|     } | ||||
|     return static_cast<T>(val); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // Computes one output element of embedding_bag for reduction mode M. | ||||
| // Each thread handles a single (bag, feature) pair derived from tid, reduces | ||||
| // the weight rows selected by that bag's slice of indices, and writes the | ||||
| // result to output. Entries equal to params.padding_idx are not counted and | ||||
| // do not contribute to the result. | ||||
| // offset2bag, bag_size and max_indices are accepted but never written here. | ||||
| template <EmbeddingBagMode M, typename T, typename I> | ||||
| void embedding_bag_impl( | ||||
|     constant T* weight, | ||||
|     constant I* indices, | ||||
|     constant I* offsets, | ||||
|     constant T* per_sample_weights, | ||||
|     device T* output, | ||||
|     device I* offset2bag, | ||||
|     device I* bag_size, | ||||
|     device I* max_indices, | ||||
|     constant EmbeddingBagParams<uint32_t>& params, | ||||
|     uint tid) { | ||||
|   constant auto& weight_strides = params.weight_strides; | ||||
|   constant auto& output_strides = params.output_strides; | ||||
|   auto feature_size = params.feature_size; | ||||
|   auto num_bags = params.num_bags; | ||||
|   auto num_indices = params.num_indices; | ||||
|   auto padding_idx = params.padding_idx; | ||||
|   auto per_sample_weights_strides = params.per_sample_weights_strides; | ||||
|  | ||||
|   // Split the flat thread id into its bag and feature coordinates. | ||||
|   auto bag = tid / feature_size; | ||||
|   auto feature = tid % feature_size; | ||||
|  | ||||
|   // A bag's slice of indices runs from its own offset to the next bag's | ||||
|   // offset; the last bag extends to num_indices. The offsets lookup is | ||||
|   // clamped so it stays inside the offsets array for the last bag. | ||||
|   uint32_t next_bag = min(bag + 1, num_bags - 1); | ||||
|   uint32_t first = static_cast<uint32_t>(offsets[bag]); | ||||
|   uint32_t next_start = static_cast<uint32_t>(offsets[next_bag]); | ||||
|   bool last = (bag + 1 == num_bags); | ||||
|   uint32_t stop = last ? num_indices : next_start; | ||||
|  | ||||
|   auto acc = ReductionOpInit<M, T>()(); | ||||
|   uint32_t counted = 0; | ||||
|  | ||||
|   for (uint32_t pos = first; pos < stop; ++pos) { | ||||
|     I row = indices[pos]; | ||||
|     bool keep = !(row == padding_idx); | ||||
|     T row_val = weight | ||||
|         [static_cast<uint32_t>(row) * weight_strides[0] + | ||||
|          feature * weight_strides[1]]; | ||||
|  | ||||
|     // The step is always evaluated; the result is discarded for padding | ||||
|     // entries so the accumulator is left unchanged. | ||||
|     auto candidate = ReductionOp<M, T>()( | ||||
|         row_val, acc, pos, per_sample_weights, per_sample_weights_strides); | ||||
|  | ||||
|     counted += static_cast<uint32_t>(keep); | ||||
|     acc = keep ? candidate : acc; | ||||
|   } | ||||
|  | ||||
|   output[bag * output_strides[0] + feature * output_strides[1]] = | ||||
|       ReductionOpFinal<M, T>()(acc, counted); | ||||
| } | ||||
|  | ||||
| // Forwards all kernel arguments to the embedding_bag_impl specialization for | ||||
| // MODE and returns from the enclosing function. | ||||
| #define DISPATCH_IMPL(MODE)                                 \ | ||||
|   return embedding_bag_impl<MODE>(                          \ | ||||
|       weight, indices, offsets, per_sample_weights, output, \ | ||||
|       offset2bag, bag_size, max_indices, params, tid) | ||||
|  | ||||
| // Kernel entry point: picks the embedding_bag_impl specialization that | ||||
| // matches the runtime value of params.mode and forwards every argument to it. | ||||
| template <typename T, typename I> | ||||
| kernel void embedding_bag( | ||||
|     constant T* weight [[buffer(0)]], | ||||
|     constant I* indices [[buffer(1)]], | ||||
|     constant I* offsets [[buffer(2)]], | ||||
|     constant T* per_sample_weights [[buffer(3)]], | ||||
|     device T* output [[buffer(4)]], | ||||
|     device I* offset2bag [[buffer(5)]], | ||||
|     device I* bag_size [[buffer(6)]], | ||||
|     device I* max_indices [[buffer(7)]], | ||||
|     constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]], | ||||
|     uint tid [[thread_position_in_grid]]) { | ||||
|   // DISPATCH_IMPL expands to a `return`, so no case falls through. | ||||
|   switch (params.mode) { | ||||
|     case EmbeddingBagMode::SUM: { | ||||
|       DISPATCH_IMPL(EmbeddingBagMode::SUM); | ||||
|     } | ||||
|     case EmbeddingBagMode::MEAN: { | ||||
|       DISPATCH_IMPL(EmbeddingBagMode::MEAN); | ||||
|     } | ||||
|     case EmbeddingBagMode::MAX: { | ||||
|       DISPATCH_IMPL(EmbeddingBagMode::MAX); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Explicitly instantiates embedding_bag<T, I> and exposes it under the host | ||||
| // name "embedding_bag_<T>_<I>", built by stringifying both type arguments. | ||||
| // The parameter list must mirror the embedding_bag template declaration. | ||||
| #define REGISTER_EMBEDDING_BAG_OP(T, I)                             \ | ||||
|   template [[host_name("embedding_bag_" #T "_" #I)]]                \ | ||||
|   kernel void embedding_bag<T, I>(                                  \ | ||||
|       constant T * weight [[buffer(0)]],                            \ | ||||
|       constant I * indices [[buffer(1)]],                           \ | ||||
|       constant I * offsets [[buffer(2)]],                           \ | ||||
|       constant T * per_sample_weights [[buffer(3)]],                \ | ||||
|       device T * output [[buffer(4)]],                              \ | ||||
|       device I * offset2bag [[buffer(5)]],                          \ | ||||
|       device I * bag_size [[buffer(6)]],                            \ | ||||
|       device I * max_indices [[buffer(7)]],                         \ | ||||
|       constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \ | ||||
|       uint tid [[thread_position_in_grid]]); | ||||
|  | ||||
| // One instantiation per (value type, index type) pair: float/half/bfloat | ||||
| // values combined with int/long indices. | ||||
| REGISTER_EMBEDDING_BAG_OP(float, int); | ||||
| REGISTER_EMBEDDING_BAG_OP(float, long); | ||||
| REGISTER_EMBEDDING_BAG_OP(half, int); | ||||
| REGISTER_EMBEDDING_BAG_OP(half, long); | ||||
| REGISTER_EMBEDDING_BAG_OP(bfloat, int); | ||||
| REGISTER_EMBEDDING_BAG_OP(bfloat, long); | ||||
| @ -8,6 +8,8 @@ | ||||
| #include <ATen/native/TensorIterator.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <ATen/native/mps/operations/BinaryKernel.h> | ||||
| // For MTLLanguageVersion_3_1 | ||||
| #include <ATen/native/mps/MPSGraphSonomaOps.h> | ||||
| #include <fmt/format.h> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
|  | ||||
| @ -1,12 +1,23 @@ | ||||
| //  Copyright © 2022 Apple Inc. | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/native/ConvUtils.h> | ||||
| #include <ATen/native/mps/MPSGraphVenturaOps.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <ATen/ops/_mps_convolution_native.h> | ||||
| #include <ATen/ops/_mps_convolution_transpose_native.h> | ||||
| #include <ATen/ops/mps_convolution_backward_native.h> | ||||
| #include <ATen/ops/mps_convolution_transpose_backward_native.h> | ||||
| #include <fmt/format.h> | ||||
|  | ||||
| #if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2)) | ||||
|  | ||||
| // Implementation of FakeMPSGraphConvolution3DOpDescriptor; it is only compiled | ||||
| // when the SDK-version guard directly above holds. | ||||
| // NOTE(review): presumably a stand-in for MPSGraphConvolution3DOpDescriptor on | ||||
| // SDKs that predate it - confirm against the header that declares the class. | ||||
| @implementation FakeMPSGraphConvolution3DOpDescriptor | ||||
| // Returns the receiver itself instead of allocating a separate copy. | ||||
| - (nonnull id)copyWithZone:(nullable NSZone*)zone { | ||||
|   return self; | ||||
| } | ||||
|  | ||||
| @end | ||||
|  | ||||
| #endif | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| @ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_, | ||||
|   descriptor_.paddingFront = paddingDepth; | ||||
|   descriptor_.paddingBack = paddingDepth; | ||||
|  | ||||
|   descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW; | ||||
|   // PyTorch always uses NCDHW memory layout for 3D tensors | ||||
|   descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW; | ||||
|  | ||||
|   descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW; | ||||
|   // PyTorch always uses OIDHW memory layout for 3D weights | ||||
|   descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW; | ||||
|  | ||||
|   descriptor_.groups = groups; // not yet tested in Xcode/C++ | ||||
| } | ||||
| @ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, | ||||
|     if (bias_defined) | ||||
|       bias_shape = bias_opt.value().sizes(); | ||||
|  | ||||
|     std::string mem_format_key; | ||||
|     switch (memory_format) { | ||||
|       case at::MemoryFormat::Contiguous: | ||||
|         mem_format_key = "Contiguous"; | ||||
|         break; | ||||
|       case at::MemoryFormat::ChannelsLast: | ||||
|         mem_format_key = "ChannelsLast"; | ||||
|         break; | ||||
|       default: | ||||
|         assert(0 && "Check should have been done earlier\n"); | ||||
|     } | ||||
|  | ||||
|     std::string bias_shape_key; | ||||
|     if (bias_defined) { | ||||
|       bias_shape_key = std::to_string(bias_shape[0]); | ||||
| @ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, | ||||
|       bias_shape_key = "nobias"; | ||||
|     } | ||||
|  | ||||
|     std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}", | ||||
|                                   is3DConv ? "3d_" : "", | ||||
|                                   getArrayRefString(stride), | ||||
|                                   getArrayRefString(dilation), | ||||
|                                   getArrayRefString(padding), | ||||
|                                   groups, | ||||
|                                   is_channels_last, | ||||
|                                   mps::getTensorsStringKey({input_t, weight_t}), | ||||
|                                   bias_defined, | ||||
|                                   bias_shape_key); | ||||
|     std::string key; | ||||
|     if (is3DConv) { | ||||
|       key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" + | ||||
|           std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + | ||||
|           std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" + | ||||
|           std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key + | ||||
|           mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key; | ||||
|  | ||||
|     } else { | ||||
|       key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" + | ||||
|           std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" + | ||||
|           std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key + | ||||
|           mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key; | ||||
|     } | ||||
|  | ||||
|     MPSShape* inputShape = mps::getMPSShape(input_t, memory_format); | ||||
|     MPSShape* outputShape = mps::getMPSShape(output_t, memory_format); | ||||
| @ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, | ||||
|   @autoreleasepool { | ||||
|     MPSStream* stream = getCurrentMPSStream(); | ||||
|  | ||||
|     std::string mem_format_key; | ||||
|     switch (memory_format) { | ||||
|       case at::MemoryFormat::Contiguous: | ||||
|         mem_format_key = "Contiguous"; | ||||
|         break; | ||||
|       case at::MemoryFormat::ChannelsLast: | ||||
|         mem_format_key = "ChannelsLast"; | ||||
|         break; | ||||
|       default: | ||||
|         assert(0 && "Check should have been done earlier\n"); | ||||
|     } | ||||
|  | ||||
|     MPSShape* mps_input_shape = getMPSShape(input_size); | ||||
|     std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}", | ||||
|                                   is3DConv ? "3d_" : "", | ||||
|                                   getArrayRefString(stride), | ||||
|                                   getArrayRefString(dilation), | ||||
|                                   getArrayRefString(padding), | ||||
|                                   groups, | ||||
|                                   is_channels_last, | ||||
|                                   getTensorsStringKey({grad_output_t, weight_t})); | ||||
|     std::string key; | ||||
|     if (is3DConv) { | ||||
|       // Graph-cache key for the 3D backward-input graph. Each stride, dilation | ||||
|       // and padding component is separated by ':' so adjacent numbers cannot | ||||
|       // run together (e.g. stride[2]=1, dilation[0]=11 vs. stride[2]=11, | ||||
|       // dilation[0]=1) and make two different configurations share one key. | ||||
|       key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" + | ||||
|           std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + | ||||
|           std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" + | ||||
|           std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key + | ||||
|           getTensorsStringKey({grad_output_t, weight_t}); | ||||
|  | ||||
|     } else { | ||||
|       key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" + | ||||
|           std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" + | ||||
|           std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key + | ||||
|           getTensorsStringKey({grad_output_t, weight_t}); | ||||
|     } | ||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||
|       auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t); | ||||
|       auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t); | ||||
| @ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | ||||
|     MPSStream* stream = getCurrentMPSStream(); | ||||
|  | ||||
|     MPSShape* mps_weight_shape = getMPSShape(weight_size); | ||||
|     std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}", | ||||
|                                   is3DConv ? "3d_" : "", | ||||
|                                   getArrayRefString(stride), | ||||
|                                   getArrayRefString(dilation), | ||||
|                                   getArrayRefString(padding), | ||||
|                                   groups, | ||||
|                                   getTensorsStringKey({grad_output_t, input_t, grad_weight_t})); | ||||
|     std::string key; | ||||
|     if (is3DConv) { | ||||
|       key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" + | ||||
|           std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + | ||||
|           std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" + | ||||
|           std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + | ||||
|           getTensorsStringKey({grad_output_t, input_t, grad_weight_t}); | ||||
|     } else { | ||||
|       key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" + | ||||
|           std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" + | ||||
|           std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + | ||||
|           getTensorsStringKey({grad_output_t, input_t, grad_weight_t}); | ||||
|     } | ||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||
|       MPSShape* inputShape = getMPSShape(input_t); | ||||
|       bool isDepthwiseConv = | ||||
|  | ||||
| @ -2,6 +2,7 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/mps/MPSProfiler.h> | ||||
| #include <ATen/native/mps/Copy.h> | ||||
| #include <ATen/native/mps/MPSGraphSonomaOps.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <ATen/ops/_copy_from_and_resize_native.h> | ||||
| #include <ATen/ops/_copy_from_native.h> | ||||
|  | ||||
| @ -5,6 +5,8 @@ | ||||
| #include <ATen/native/DistributionTemplates.h> | ||||
| #include <ATen/native/Distributions.h> | ||||
| #include <ATen/native/TensorFactories.h> | ||||
| #include <ATen/native/mps/MPSGraphSonomaOps.h> | ||||
| #include <ATen/native/mps/MPSGraphVenturaOps.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user
	