mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 13:44:15 +08:00
Compare commits
237 Commits
v2.9.0-rc4
...
remove_pyi
Author | SHA1 | Date | |
---|---|---|---|
a3795cfaea | |||
a9d5c00727 | |||
c1102ca308 | |||
7856f8d7f4 | |||
973c3b531a | |||
d5a496e7f1 | |||
051e544ef6 | |||
0925c644ed | |||
b2553a6ec4 | |||
a749c40342 | |||
595e13feb7 | |||
ddc5107601 | |||
a94ddd9b00 | |||
29f84b0f61 | |||
27daa6af6a | |||
9b429846e8 | |||
cdfa298a3b | |||
d25c35d2b2 | |||
ee53ad2dd0 | |||
0dcd9304aa | |||
25f1a5d8d1 | |||
269c9907a0 | |||
a326ef37e6 | |||
cdb2d1838a | |||
f7ea4975ab | |||
65d642d6db | |||
fa4d5e76ea | |||
38afeb2ba2 | |||
53b8bdb977 | |||
cad052423b | |||
b5f4a7dc14 | |||
d89189f289 | |||
d71a6497b7 | |||
a0dca0fc60 | |||
e15686b40d | |||
1e9ddf510f | |||
7357eb66c5 | |||
03798b0f91 | |||
6c334885d4 | |||
a7bbc5fea7 | |||
98e9440f30 | |||
66c0f14ecc | |||
972e409829 | |||
52af91e4c1 | |||
179f10621b | |||
195ac549d7 | |||
636a511084 | |||
75de5b65b4 | |||
6b59a19242 | |||
5f66902ecf | |||
00e9ba75cd | |||
333e546c02 | |||
f7e8321961 | |||
30e16d6389 | |||
28e8531032 | |||
0babdfad63 | |||
561430edcd | |||
79d2418b5a | |||
5dd84559a5 | |||
5dd14f0b65 | |||
95191522e0 | |||
da954f10d6 | |||
d959eb02cb | |||
62f044e260 | |||
2335f90414 | |||
6e8f17c580 | |||
31345fb4f7 | |||
872ed60679 | |||
e8eeb06034 | |||
3cd734584d | |||
222ec8d28e | |||
c140bf217f | |||
7eb92b076f | |||
ccb450b190 | |||
ae97eb86f7 | |||
7a9c4d794c | |||
501e19137a | |||
4a757e1e17 | |||
563921619b | |||
84d8ec73f1 | |||
a956066b4e | |||
ff6870d134 | |||
92f9ed7ac3 | |||
8e217a9f6d | |||
429052f151 | |||
a3f01f6418 | |||
62843c14bb | |||
082d3dd9d5 | |||
468c1f9e9d | |||
9614c2eb14 | |||
4c6a6c2db9 | |||
3ad3bfe11d | |||
1c6dfbe557 | |||
934f878883 | |||
cef05b1202 | |||
b500c166ef | |||
d65ffdef3d | |||
ac72f81c12 | |||
9cac1b9259 | |||
9bc648235d | |||
799471d92b | |||
43d9b5ecaa | |||
463fbc8ca0 | |||
2f53395943 | |||
fccddf02b6 | |||
8be8b94793 | |||
fe8cc619b8 | |||
2f5a24c2a2 | |||
24492cbab2 | |||
3f6d88f04c | |||
94db2ad51d | |||
9f783e172d | |||
a8432bcaad | |||
a3a40cb741 | |||
c924c675d0 | |||
c3f30eca9e | |||
1e710552c1 | |||
7c39b2ecbe | |||
afdd4247a2 | |||
22df9332da | |||
6b9b7ce6fe | |||
1274297e06 | |||
f68f76d8c7 | |||
fa1d409e83 | |||
52d4660ae9 | |||
7345454e2e | |||
23170dfebc | |||
12e993f533 | |||
07d2531672 | |||
6944d4b639 | |||
f654cff566 | |||
f17c5e0789 | |||
435c18fb4a | |||
612cdc8f48 | |||
da5069f289 | |||
4fd2a2b273 | |||
bb1d53bc47 | |||
36338fc7f2 | |||
e0c910149c | |||
f4aeceaa9d | |||
d8e6b2fddc | |||
31c25c7d01 | |||
5dbee5691c | |||
864ffe12d7 | |||
4e35594674 | |||
35d7b32159 | |||
0663bdb123 | |||
40ea6e418a | |||
348303ebd2 | |||
94755e81c4 | |||
6d65737aee | |||
053251b98d | |||
7e2e83cdbe | |||
d033d11d26 | |||
80d4da893c | |||
bf7f481144 | |||
ab0694f1c6 | |||
5f630d28d7 | |||
a67e798cb7 | |||
30191fcf03 | |||
623e623c82 | |||
f08487aa86 | |||
1051c7dbc2 | |||
2dc2613180 | |||
582d278983 | |||
b5e6e58050 | |||
fefc406a3d | |||
3d32bb114b | |||
de05dbc39c | |||
fc1b09a52a | |||
c2388201fc | |||
a6f9e0e62a | |||
337fe1079d | |||
b494547f0b | |||
d9832d8425 | |||
f0ae3a57f6 | |||
26b3ae5890 | |||
be8095b07f | |||
b2d8f6a6af | |||
98e22c8a69 | |||
e1f0a69943 | |||
833997a6fd | |||
b9a7d0e13b | |||
1c16c18a53 | |||
96ef26f71a | |||
5ac112b569 | |||
dda071587f | |||
11acfed3ce | |||
5f40a8a9a3 | |||
e64965300a | |||
00985970e3 | |||
484c4093a8 | |||
760c478a14 | |||
dc4f97e9c1 | |||
c66e58b7d0 | |||
878f59ef75 | |||
e60ad4f628 | |||
2281d009e5 | |||
33589374b6 | |||
5539916fe1 | |||
e4174b1fd7 | |||
0e7ccc09db | |||
87cc126457 | |||
a3e26d1727 | |||
d2393c2d7d | |||
b498299953 | |||
4d66a3b894 | |||
e2545487de | |||
8922bbcaab | |||
14744e1ab2 | |||
b477fb106f | |||
d22d916719 | |||
86d34a43f5 | |||
8508651477 | |||
723c27ed78 | |||
bdbe931d58 | |||
af60398c3a | |||
82f1eb9b03 | |||
4b2d297eec | |||
0ec723acd0 | |||
e1be887870 | |||
d91eecc9a5 | |||
24a4dae85b | |||
d3c4cf838e | |||
b1e99c8c7a | |||
5eb35d2ab8 | |||
f03d635dc6 | |||
1f0b01d4b6 | |||
c0142f5c06 | |||
3ea6868049 | |||
be3b8d2ec9 | |||
5ccf3ca3ec | |||
e38e953432 | |||
4dd73e659a | |||
0d9c95cd7e | |||
dcc42e95f4 | |||
002e59440a |
@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
|
||||
|
||||
# Set CUDA architecture lists to match x86 build_cuda.sh
|
||||
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
|
||||
export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
|
||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
|
||||
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
|
||||
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
|
||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
|
||||
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
|
||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
|
||||
fi
|
||||
@ -42,9 +42,6 @@ else
|
||||
echo "Bundling CUDA libraries with wheel for aarch64."
|
||||
else
|
||||
echo "Using nvidia libs from pypi for aarch64."
|
||||
# Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
|
||||
# Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
|
||||
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
|
||||
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
|
||||
export USE_NVIDIA_PYPI_LIBS=1
|
||||
fi
|
||||
|
@ -138,6 +138,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
folder = os.path.dirname(wheel_path)
|
||||
os.mkdir(f"{folder}/tmp")
|
||||
os.system(f"unzip {wheel_path} -d {folder}/tmp")
|
||||
# Delete original wheel since it will be repackaged
|
||||
os.system(f"rm {wheel_path}")
|
||||
|
||||
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
|
||||
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
|
||||
@ -211,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
]
|
||||
|
||||
# CUDA version-specific libraries
|
||||
if "130" in desired_cuda:
|
||||
if "13" in desired_cuda:
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
|
||||
"/usr/local/cuda/lib64/libcublas.so.13",
|
||||
@ -221,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
"/usr/local/cuda/lib64/libcusolver.so.12",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
|
||||
]
|
||||
elif "12" in desired_cuda:
|
||||
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
|
||||
@ -237,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
|
||||
]
|
||||
else:
|
||||
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
|
||||
|
||||
# Combine all libraries
|
||||
libs_to_copy = common_libs + version_specific_libs
|
||||
@ -275,14 +280,7 @@ def complete_wheel(folder: str) -> str:
|
||||
f"/{folder}/dist/{repaired_wheel_name}",
|
||||
)
|
||||
else:
|
||||
repaired_wheel_name = wheel_name.replace(
|
||||
"linux_aarch64", "manylinux_2_28_aarch64"
|
||||
)
|
||||
print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
|
||||
os.rename(
|
||||
f"/{folder}/dist/{wheel_name}",
|
||||
f"/{folder}/dist/{repaired_wheel_name}",
|
||||
)
|
||||
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
|
||||
|
||||
print(f"Copying {repaired_wheel_name} to artifacts")
|
||||
shutil.copy2(
|
||||
|
@ -214,8 +214,7 @@ case "$tag" in
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
|
||||
# TODO (huydhn): Upgrade this to Python >= 3.10
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
|
@ -56,9 +56,13 @@ ENV INSTALLED_VISION ${VISION}
|
||||
|
||||
# Install rocm
|
||||
ARG ROCM_VERSION
|
||||
RUN mkdir ci_commit_pins
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
|
||||
COPY ./common/install_rocm.sh install_rocm.sh
|
||||
RUN bash ./install_rocm.sh
|
||||
RUN rm install_rocm.sh
|
||||
RUN rm install_rocm.sh common_utils.sh
|
||||
RUN rm -r ci_commit_pins
|
||||
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
|
||||
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
|
||||
RUN rm install_rocm_magma.sh
|
||||
|
1
.ci/docker/ci_commit_pins/rocm-composable-kernel.txt
Normal file
1
.ci/docker/ci_commit_pins/rocm-composable-kernel.txt
Normal file
@ -0,0 +1 @@
|
||||
7fe50dc3da2069d6645d9deb8c017a876472a977
|
@ -1 +1 @@
|
||||
fccfc522864cf8bc172abe0cd58ae5581e2d44b9
|
||||
5ae38bdb0dc066c5823e34dc9797afb9de42c866
|
||||
|
@ -2,6 +2,11 @@
|
||||
|
||||
set -ex
|
||||
|
||||
# for pip_install function
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
|
||||
|
||||
ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"
|
||||
|
||||
ver() {
|
||||
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
|
||||
}
|
||||
@ -113,6 +118,8 @@ EOF
|
||||
rm -rf HIP clr
|
||||
fi
|
||||
|
||||
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
|
||||
|
||||
# Cleanup
|
||||
apt-get autoclean && apt-get clean
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
@ -176,6 +183,8 @@ install_centos() {
|
||||
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
|
||||
done
|
||||
|
||||
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
|
||||
|
||||
# Cleanup
|
||||
yum clean all
|
||||
rm -rf /var/cache/yum
|
||||
|
@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION}
|
||||
|
||||
# Install rocm
|
||||
ARG ROCM_VERSION
|
||||
RUN mkdir ci_commit_pins
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
|
||||
COPY ./common/install_rocm.sh install_rocm.sh
|
||||
RUN bash ./install_rocm.sh
|
||||
RUN rm install_rocm.sh
|
||||
RUN rm install_rocm.sh common_utils.sh
|
||||
RUN rm -r ci_commit_pins
|
||||
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
|
||||
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
|
||||
RUN rm install_rocm_magma.sh
|
||||
|
@ -66,6 +66,11 @@ class VllmBuildParameters:
|
||||
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
|
||||
)
|
||||
|
||||
# the cleaning script to remove torch dependencies from pip
|
||||
cleaning_script: Path = env_path_field(
|
||||
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
|
||||
)
|
||||
|
||||
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
|
||||
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
|
||||
|
||||
@ -160,6 +165,7 @@ class VllmBuildRunner(BaseRunner):
|
||||
logger.info("Running vllm build with inputs: %s", inputs)
|
||||
vllm_commit = clone_vllm()
|
||||
|
||||
self.cp_torch_cleaning_script(inputs)
|
||||
self.cp_dockerfile_if_exist(inputs)
|
||||
# cp torch wheels from root direct to vllm workspace if exist
|
||||
self.cp_torch_whls_if_exist(inputs)
|
||||
@ -205,6 +211,11 @@ class VllmBuildRunner(BaseRunner):
|
||||
copy(inputs.torch_whls_path, tmp_dir)
|
||||
return tmp_dir
|
||||
|
||||
def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
|
||||
script = get_path(inputs.cleaning_script, resolve=True)
|
||||
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
|
||||
copy(script, vllm_script)
|
||||
|
||||
def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
|
||||
if not inputs.use_local_dockerfile:
|
||||
logger.info("using vllm default dockerfile.torch_nightly for build")
|
||||
|
@ -11,7 +11,7 @@ from typing import Any
|
||||
|
||||
from cli.lib.common.cli_helper import BaseRunner
|
||||
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
|
||||
from cli.lib.common.path_helper import copy, remove_dir
|
||||
from cli.lib.common.path_helper import copy, get_path, remove_dir
|
||||
from cli.lib.common.pip_helper import (
|
||||
pip_install_first_match,
|
||||
pip_install_packages,
|
||||
@ -43,6 +43,10 @@ class VllmTestParameters:
|
||||
|
||||
torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
|
||||
|
||||
cleaning_script: Path = env_path_field(
|
||||
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
if not self.torch_whls_path.exists():
|
||||
raise ValueError("missing torch_whls_path")
|
||||
@ -92,11 +96,13 @@ class VllmTestRunner(BaseRunner):
|
||||
self._set_envs(params)
|
||||
|
||||
clone_vllm(dst=self.work_directory)
|
||||
self.cp_torch_cleaning_script(params)
|
||||
with working_directory(self.work_directory):
|
||||
remove_dir(Path("vllm"))
|
||||
self._install_wheels(params)
|
||||
self._install_dependencies()
|
||||
# verify the torches are not overridden by test dependencies
|
||||
|
||||
check_versions()
|
||||
|
||||
def run(self):
|
||||
@ -125,6 +131,11 @@ class VllmTestRunner(BaseRunner):
|
||||
# double check the torches are not overridden by other packages
|
||||
check_versions()
|
||||
|
||||
def cp_torch_cleaning_script(self, params: VllmTestParameters):
|
||||
script = get_path(params.cleaning_script, resolve=True)
|
||||
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
|
||||
copy(script, vllm_script)
|
||||
|
||||
def _install_wheels(self, params: VllmTestParameters):
|
||||
logger.info("Running vllm test with inputs: %s", params)
|
||||
if not pkg_exists("torch"):
|
||||
|
@ -258,11 +258,19 @@ function install_torchrec_and_fbgemm() {
|
||||
git clone --recursive https://github.com/pytorch/fbgemm
|
||||
pushd fbgemm/fbgemm_gpu
|
||||
git checkout "${fbgemm_commit}" --recurse-submodules
|
||||
python setup.py bdist_wheel \
|
||||
--build-variant=rocm \
|
||||
-DHIP_ROOT_DIR="${ROCM_PATH}" \
|
||||
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
|
||||
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
|
||||
# until the fbgemm_commit includes the tbb patch
|
||||
patch <<'EOF'
|
||||
--- a/FbgemmGpu.cmake
|
||||
+++ b/FbgemmGpu.cmake
|
||||
@@ -184,5 +184,6 @@ gpu_cpp_library(
|
||||
fbgemm_gpu_tbe_cache
|
||||
fbgemm_gpu_tbe_optimizers
|
||||
fbgemm_gpu_tbe_utils
|
||||
+ tbb
|
||||
DESTINATION
|
||||
fbgemm_gpu)
|
||||
EOF
|
||||
python setup.py bdist_wheel --build-variant=rocm
|
||||
popd
|
||||
|
||||
# Save the wheel before cleaning up
|
||||
|
@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:
|
||||
|
||||
|
||||
def smoke_test_nvshmem() -> None:
|
||||
if not torch.cuda.is_available():
|
||||
print("CUDA is not available, skipping NVSHMEM test")
|
||||
if not torch.cuda.is_available() or target_os == "windows":
|
||||
print("Windows platform or CUDA is not available, skipping NVSHMEM test")
|
||||
return
|
||||
|
||||
# Check if NVSHMEM is compiled in current build
|
||||
@ -396,7 +396,9 @@ def smoke_test_nvshmem() -> None:
|
||||
except ImportError:
|
||||
# Not built with NVSHMEM support.
|
||||
# torch is not compiled with NVSHMEM prior to 2.9
|
||||
if torch.__version__ < "2.9":
|
||||
from torch.torch_version import TorchVersion
|
||||
|
||||
if TorchVersion(torch.__version__) < (2, 9):
|
||||
return
|
||||
else:
|
||||
# After 2.9: NVSHMEM is expected to be compiled in current build
|
||||
|
@ -1721,11 +1721,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
install_torchvision
|
||||
test_inductor_shard "${SHARD_NUMBER}"
|
||||
if [[ "${SHARD_NUMBER}" == 1 ]]; then
|
||||
if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
|
||||
test_inductor_distributed
|
||||
fi
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
|
||||
test_einops
|
||||
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
|
||||
|
@ -1,9 +1,9 @@
|
||||
set WIN_DRIVER_VN=528.89
|
||||
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore
|
||||
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe
|
||||
set WIN_DRIVER_VN=580.88
|
||||
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore
|
||||
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot
|
||||
start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL
|
||||
del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL
|
||||
|
@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
|
||||
# Create an isolated directory to store this builds pytorch checkout and conda
|
||||
# installation
|
||||
if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
|
||||
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
|
||||
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
|
||||
fi
|
||||
mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
|
||||
if [[ -n ${GITHUB_ACTIONS} ]]; then
|
||||
@ -96,11 +96,11 @@ fi
|
||||
whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
|
||||
mkdir -p "$whl_tmp_dir"
|
||||
|
||||
mac_version='macosx_11_0_arm64'
|
||||
mac_version='macosx-11_0-arm64'
|
||||
libtorch_arch='arm64'
|
||||
|
||||
# Create a consistent wheel package name to rename the wheel to
|
||||
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"
|
||||
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"
|
||||
|
||||
###########################################################
|
||||
|
||||
@ -125,7 +125,6 @@ popd
|
||||
export TH_BINARY_BUILD=1
|
||||
export INSTALL_TEST=0 # dont install test binaries into site-packages
|
||||
export MACOSX_DEPLOYMENT_TARGET=11.0
|
||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
@ -133,25 +132,19 @@ RENAME_WHEEL=true
|
||||
case $desired_python in
|
||||
3.14t)
|
||||
echo "Using 3.14 deps"
|
||||
mac_version='macosx-11.0-arm64'
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.14)
|
||||
echo "Using 3.14t deps"
|
||||
mac_version='macosx-11.0-arm64'
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.13t)
|
||||
echo "Using 3.13 deps"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.13)
|
||||
@ -176,17 +169,12 @@ case $desired_python in
|
||||
;;
|
||||
esac
|
||||
|
||||
# Install into a fresh env
|
||||
tmp_env_name="wheel_py$python_nodot"
|
||||
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
|
||||
source activate "$tmp_env_name"
|
||||
|
||||
PINNED_PACKAGES=(
|
||||
"numpy${NUMPY_PINNED_VERSION}"
|
||||
)
|
||||
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
|
||||
pip install requests ninja typing-extensions
|
||||
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
|
||||
python -mvenv ~/${desired_python}-build
|
||||
source ~/${desired_python}-build/bin/activate
|
||||
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
|
||||
retry brew install libomp
|
||||
|
||||
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
|
||||
@ -200,7 +188,7 @@ export BUILD_TEST=OFF
|
||||
pushd "$pytorch_rootdir"
|
||||
echo "Calling setup.py bdist_wheel at $(date)"
|
||||
|
||||
python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
|
||||
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"
|
||||
|
||||
echo "Finished setup.py bdist_wheel at $(date)"
|
||||
|
||||
|
2
.flake8
2
.flake8
@ -73,7 +73,7 @@ exclude =
|
||||
./docs/src,
|
||||
./functorch/docs,
|
||||
./functorch/examples,
|
||||
./functorch/notebooks,
|
||||
./functorch/docs/source/tutorials,
|
||||
./scripts,
|
||||
./test/generated_type_hints_smoketest.py,
|
||||
./third_party,
|
||||
|
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
27fc2493d383354a008106f22f3be232badee9a1
|
||||
caba63f0fa29ef9e3d566699f32f11c07c8bda4e
|
||||
|
2
.github/ci_commit_pins/fbgemm_rocm.txt
vendored
2
.github/ci_commit_pins/fbgemm_rocm.txt
vendored
@ -1 +1 @@
|
||||
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
|
||||
08ae0af1395c8d8471f4025deb6af9aef90b342f
|
||||
|
2
.github/ci_commit_pins/vllm.txt
vendored
2
.github/ci_commit_pins/vllm.txt
vendored
@ -1 +1 @@
|
||||
e10fef08838612b4560e9c72e5cb1414a5edfa13
|
||||
f510715882304796a96e33028b4f6de1b026c2c7
|
||||
|
17
.github/ci_configs/vllm/use_existing_torch.py
vendored
Normal file
17
.github/ci_configs/vllm/use_existing_torch.py
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
import glob
|
||||
|
||||
|
||||
requires_files = glob.glob("requirements/*.txt")
|
||||
requires_files += ["pyproject.toml"]
|
||||
for file in requires_files:
|
||||
print(f">>> cleaning {file}")
|
||||
with open(file) as f:
|
||||
lines = f.readlines()
|
||||
if "torch" in "".join(lines).lower():
|
||||
print("removed:")
|
||||
with open(file, "w") as f:
|
||||
for line in lines:
|
||||
if "torch" not in line.lower():
|
||||
f.write(line)
|
||||
print(f"<<< done cleaning {file}")
|
||||
print()
|
@ -15,7 +15,7 @@ optree==0.13.0
|
||||
packaging==23.1
|
||||
parameterized==0.8.1
|
||||
pillow==10.3.0
|
||||
protobuf==5.29.4
|
||||
protobuf==5.29.5
|
||||
psutil==5.9.8
|
||||
pygments==2.15.0
|
||||
pytest-cpp==2.3.0
|
||||
@ -26,7 +26,7 @@ pytest-xdist==3.3.1
|
||||
pytest==7.3.2
|
||||
pyyaml==6.0.2
|
||||
scipy==1.12.0
|
||||
setuptools==72.1.0
|
||||
setuptools==78.1.1
|
||||
sympy==1.13.3
|
||||
tlparse==0.4.0
|
||||
tensorboard==2.13.0
|
||||
|
4
.github/scripts/docathon-label-sync.py
vendored
4
.github/scripts/docathon-label-sync.py
vendored
@ -39,7 +39,9 @@ def main() -> None:
|
||||
pull_request_label_names = [label.name for label in pull_request_labels]
|
||||
issue_label_names = [label.name for label in issue_labels]
|
||||
labels_to_add = [
|
||||
label for label in issue_label_names if label not in pull_request_label_names
|
||||
label
|
||||
for label in issue_label_names
|
||||
if label not in pull_request_label_names and label != "actionable"
|
||||
]
|
||||
if not labels_to_add:
|
||||
print("The pull request already has the same labels.")
|
||||
|
90
.github/scripts/generate_binary_build_matrix.py
vendored
90
.github/scripts/generate_binary_build_matrix.py
vendored
@ -43,55 +43,55 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
|
||||
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"12.6": (
|
||||
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
|
||||
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
|
||||
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
|
||||
"nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
|
||||
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
|
||||
),
|
||||
"12.8": (
|
||||
"nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
"nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
|
||||
"nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
|
||||
"nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
|
||||
"nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
|
||||
),
|
||||
"13.0": (
|
||||
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
|
||||
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
|
||||
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
|
||||
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
|
||||
),
|
||||
"xpu": (
|
||||
"intel-cmplr-lib-rt==2025.2.1 | "
|
||||
|
91
.github/scripts/prepare_vllm_wheels.sh
vendored
Executable file
91
.github/scripts/prepare_vllm_wheels.sh
vendored
Executable file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -eux
|
||||
|
||||
# Read the version of the torch wheel sitting in the current directory from
# its METADATA file.
torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
# The fourth dot-separated field of that version is reused below as the
# nightly suffix of every repackaged wheel.
nightly=$(echo "${torch_version}" | cut -d'.' -f4)
|
||||
|
||||
# Emit one RECORD line for a file inside an unpacked wheel.
# Adapted from .ci/manywheel/build_common.sh (expansions quoted, variables
# made local).
#
# Arguments:
#   $1 - path of the file, as it should appear in RECORD
# Output (stdout):
#   "<path>,sha256=<digest>,<size>", or "<path>,," when the path contains
#   "RECORD" (that entry is listed without hash or size).
make_wheel_record() {
  local fpath=$1
  local fhash fsize

  if [[ "$fpath" == *RECORD* ]]; then
    echo "$fpath,,"
  else
    # Base64 of the binary SHA-256 digest, made URL-safe ("+" -> "-",
    # "/" -> "_") with the "=" padding removed.
    fhash=$(openssl dgst -sha256 -binary "$fpath" | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
    # Fifth column of `ls -nl` is the file size in bytes.
    fsize=$(ls -nl "$fpath" | awk '{print $5}')
    echo "$fpath,sha256=$fhash,$fsize"
  fi
}
|
||||
|
||||
# Re-stamp a wheel with a new version and repack it.
#
# Arguments:
#   $1 - distribution name as used in the wheel's directory names
#        (the caller passes it with "-" replaced by "_")
#   $2 - path to the wheel to unpack
#   $3 - version currently recorded in the wheel
#   $4 - version to write into the repacked wheel
# Globals read:
#   PYTHON_EXECUTABLE - interpreter used for `wheel unpack` / `wheel pack`
#   torch_version     - pinned as a torch requirement for vllm and xformers
# Side effects:
#   Unpacks the wheel, rewrites METADATA and RECORD, repacks it with
#   `wheel pack`, and deletes the unpacked tree.
change_wheel_version() {
  local package=$1
  local wheel=$2
  local f_version=$3
  local t_version=$4
  # Escape the dots so the old version is matched literally by sed.
  local f_version_re=${f_version//./\\.}

  # Extract the wheel
  "${PYTHON_EXECUTABLE}" -mwheel unpack "$wheel"

  mv "${package}-${f_version}" "${package}-${t_version}"
  # Change the version from f_version to t_version in the dist-info dir
  pushd "${package}-${t_version}"
  mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info"

  pushd "${package}-${t_version}.dist-info"
  sed -i "s/${package}-${f_version_re}\.dist-info/${package}-${t_version}.dist-info/g" RECORD

  # Update the version in METADATA. The pattern is anchored so that a
  # "Metadata-Version:" header can never be rewritten by accident.
  sed -i "s/^Version: ${f_version_re}/Version: ${t_version}/" METADATA
  # then add PyTorch nightly dependency of vLLM
  if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then
    sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA
  fi
  # Drop the stale METADATA entry; a fresh one is appended below.
  sed -i '/METADATA,sha256/d' RECORD
  popd

  make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD"
  popd

  # Repack the wheel
  "${PYTHON_EXECUTABLE}" -mwheel pack "${package}-${t_version}"

  # Clean up
  rm -rf "${package}-${t_version}"
}
|
||||
|
||||
# Re-version the wheel found in ./<package>/ as a nightly build, run
# `auditwheel repair` on it, copy the repaired wheel into the current
# directory and delete ./<package>/.
#
# Arguments:
#   $1 - package directory name (e.g. xformers, flashinfer-python, vllm)
# Globals read:
#   nightly  - suffix appended to the base version
#   PLATFORM - platform tag passed to `auditwheel repair --plat`
repackage_wheel() {
  local package=$1
  # Declared separately from the assignments so that a failing command
  # substitution is not hidden behind the exit status of `local`.
  local orig_wheel orig_version version nightly_version repair_wheel

  pushd "$package"

  # Quote the pattern so find, not the shell, performs the matching.
  orig_wheel=$(find . -name "*${package//-/_}*")
  orig_version=$(unzip -p "$orig_wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)

  version=""
  if [[ "${package}" == vllm ]]; then
    # Copied from vllm/.buildkite/scripts/upload-wheels.sh
    version=1.0.0
  else
    # Keep only the first three dot-separated fields; "+" is mapped to "."
    # first so a local version suffix is cut off as well.
    version=$(echo "$orig_version" | tr '.+' '.' | cut -d'.' -f1-3)
  fi
  nightly_version=$version.$nightly

  # Use nightly version
  change_wheel_version "${package//-/_}" "$orig_wheel" "$orig_version" "$nightly_version"
  # Clean up
  rm "${orig_wheel}"

  # The --exclude patterns are quoted so they reach auditwheel verbatim
  # instead of being expanded against files in the current directory.
  auditwheel repair --plat "$PLATFORM" *.whl \
    --exclude 'libc10*' --exclude 'libtorch*' --exclude 'libcu*' --exclude 'libnv*'
  repair_wheel=$(find wheelhouse -name "*${PLATFORM}*")
  repair_wheel=$(basename "${repair_wheel}")
  popd

  cp "${package}/wheelhouse/${repair_wheel}" .
  rm -rf "$package"
}
|
||||
|
||||
# Repackage every wheel directory under externals/vllm/wheels in turn.
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
  repackage_wheel "$package"
done
popd
|
@ -22,6 +22,16 @@ name: !{{ build_environment }}
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
{%- endmacro %}
|
||||
|
||||
{%- macro setup_python(py_ver) -%}
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Remove this once 3.14 is out
|
||||
# The .4 suffix is the minimum patch release for 3.10; the free-threaded (no-GIL) build of 3.13 also needs at least 3.13.3
|
||||
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
|
||||
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
|
||||
{%- endmacro %}
|
||||
|
||||
on:
|
||||
# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
|
||||
push:
|
||||
@ -61,23 +71,13 @@ jobs:
|
||||
{%- endif %}
|
||||
steps:
|
||||
!{{ set_runner_specific_vars() }}
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
!{{ setup_python(config.get("python_version", "3.10")) }}
|
||||
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -94,8 +94,6 @@ jobs:
|
||||
{%- if config["package_type"] == "wheel" %}
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -106,33 +104,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
|
61
.github/workflows/build-vllm-wheel.yml
vendored
61
.github/workflows/build-vllm-wheel.yml
vendored
@ -59,20 +59,6 @@ jobs:
|
||||
run: |
|
||||
set -eux
|
||||
|
||||
# Keep PyTorch nightly wheel here so that we can install it later during
|
||||
# vLLM build process
|
||||
mkdir -p "${RUNNER_TEMP}/artifacts/"
|
||||
|
||||
container_name=$(docker run \
|
||||
--tty \
|
||||
--detach \
|
||||
-e PLATFORM \
|
||||
-v "${GITHUB_WORKSPACE}:/pytorch" \
|
||||
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
|
||||
-w /artifacts/ \
|
||||
"${MANYLINUX_IMAGE}"
|
||||
)
|
||||
|
||||
# Determine python executable for given version (copied from build-triton-wheel)
|
||||
case $PY_VERS in
|
||||
3.10)
|
||||
@ -102,6 +88,21 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
# Keep PyTorch nightly wheel here so that we can install it later during
|
||||
# vLLM build process
|
||||
mkdir -p "${RUNNER_TEMP}/artifacts/"
|
||||
|
||||
container_name=$(docker run \
|
||||
--tty \
|
||||
--detach \
|
||||
-e PLATFORM \
|
||||
-e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
|
||||
-v "${GITHUB_WORKSPACE}:/pytorch" \
|
||||
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
|
||||
-w /artifacts/ \
|
||||
"${MANYLINUX_IMAGE}"
|
||||
)
|
||||
|
||||
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
|
||||
--pre torch torchvision torchaudio \
|
||||
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
|
||||
@ -113,7 +114,6 @@ jobs:
|
||||
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
|
||||
|
||||
# Save this for later
|
||||
echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV"
|
||||
echo "container_name=${container_name}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Build vLLM wheel
|
||||
@ -131,36 +131,7 @@ jobs:
|
||||
set -eux
|
||||
|
||||
# Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh
|
||||
docker exec -t "${container_name}" bash -c "
|
||||
set -eux
|
||||
|
||||
nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4)
|
||||
|
||||
pushd externals/vllm/wheels
|
||||
for package in xformers flashinfer-python vllm; do
|
||||
pushd \$package
|
||||
auditwheel repair --plat \$PLATFORM *.whl \
|
||||
--exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
|
||||
repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*)
|
||||
repair_wheel=\$(basename \${repair_wheel})
|
||||
popd
|
||||
|
||||
cp \${package}/wheelhouse/\${repair_wheel} .
|
||||
version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
|
||||
if [[ \$package == vllm ]]; then
|
||||
new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly}
|
||||
else
|
||||
major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3)
|
||||
new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly}
|
||||
fi
|
||||
|
||||
mv -- \$repair_wheel \$new_wheel
|
||||
rm -rf \$package
|
||||
done
|
||||
popd
|
||||
"
|
||||
|
||||
docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh
|
||||
docker exec -t "${container_name}" chown -R 1000:1000 /artifacts
|
||||
|
||||
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
|
||||
|
42
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
42
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -178,7 +178,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -224,7 +224,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -335,7 +335,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -381,7 +381,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -427,7 +427,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -538,7 +538,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -584,7 +584,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -630,7 +630,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -741,7 +741,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -787,7 +787,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -833,7 +833,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -944,7 +944,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -990,7 +990,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1036,7 +1036,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1147,7 +1147,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1193,7 +1193,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1239,7 +1239,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1350,7 +1350,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1396,7 +1396,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1442,7 +1442,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
2
.github/workflows/generated-linux-binary-manywheel-main.yml
generated
vendored
2
.github/workflows/generated-linux-binary-manywheel-main.yml
generated
vendored
@ -60,7 +60,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_8-test: # Testing
|
||||
|
42
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
42
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@ -127,7 +127,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_6-test: # Testing
|
||||
@ -193,7 +193,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_8-test: # Testing
|
||||
@ -259,7 +259,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda13_0-test: # Testing
|
||||
@ -719,7 +719,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_6-test: # Testing
|
||||
@ -785,7 +785,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_8-test: # Testing
|
||||
@ -851,7 +851,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda13_0-test: # Testing
|
||||
@ -1311,7 +1311,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_6-test: # Testing
|
||||
@ -1377,7 +1377,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_8-test: # Testing
|
||||
@ -1443,7 +1443,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda13_0-test: # Testing
|
||||
@ -1903,7 +1903,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_6-test: # Testing
|
||||
@ -1969,7 +1969,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_8-test: # Testing
|
||||
@ -2035,7 +2035,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda13_0-test: # Testing
|
||||
@ -2495,7 +2495,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_6-test: # Testing
|
||||
@ -2561,7 +2561,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_8-test: # Testing
|
||||
@ -2627,7 +2627,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda13_0-test: # Testing
|
||||
@ -3087,7 +3087,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_6-test: # Testing
|
||||
@ -3153,7 +3153,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_8-test: # Testing
|
||||
@ -3219,7 +3219,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda13_0-test: # Testing
|
||||
@ -3679,7 +3679,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_6-test: # Testing
|
||||
@ -3745,7 +3745,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_8-test: # Testing
|
||||
@ -3811,7 +3811,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda13_0-test: # Testing
|
||||
|
18
.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
generated
vendored
18
.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
generated
vendored
@ -60,13 +60,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.10.4"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -81,13 +81,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
|
336
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
generated
vendored
336
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
generated
vendored
@ -56,13 +56,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.10.4"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -77,13 +77,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -99,8 +95,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -111,33 +105,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -196,13 +166,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.11.4"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -217,13 +187,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -239,8 +205,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -251,33 +215,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -336,13 +276,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.12.4"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -357,13 +297,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -379,8 +315,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -391,33 +325,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -476,13 +386,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.13.4"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -497,13 +407,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -519,8 +425,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -531,33 +435,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -616,13 +496,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.13.4"
|
||||
freethreaded: true
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -637,13 +517,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -659,8 +535,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -671,33 +545,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -756,13 +606,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.14.0-rc.2"
|
||||
freethreaded: false
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -777,13 +627,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -799,8 +645,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -811,33 +655,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -896,13 +716,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.14.0-rc.2"
|
||||
freethreaded: true
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -917,13 +737,9 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -939,8 +755,6 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -951,33 +765,9 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
|
4
.github/workflows/inductor-nightly.yml
vendored
4
.github/workflows/inductor-nightly.yml
vendored
@ -37,7 +37,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-default-label-prefix
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
@ -56,7 +56,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: nightly-dynamo-benchmarks-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
|
@ -75,7 +75,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -101,7 +101,7 @@ jobs:
|
||||
needs: inductor-build
|
||||
if: github.event.schedule == '0 7 * * *'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
@ -118,7 +118,7 @@ jobs:
|
||||
needs: inductor-build
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
|
@ -80,7 +80,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -107,7 +107,7 @@ jobs:
|
||||
needs: inductor-build
|
||||
if: github.event.schedule == '0 7 * * *'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
@ -124,7 +124,7 @@ jobs:
|
||||
needs: inductor-build
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
|
4
.github/workflows/inductor-periodic.yml
vendored
4
.github/workflows/inductor-periodic.yml
vendored
@ -154,7 +154,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-default-label-prefix
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
@ -200,7 +200,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: periodic-dynamo-benchmarks-cpu-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
4
.github/workflows/inductor-unittest.yml
vendored
4
.github/workflows/inductor-unittest.yml
vendored
@ -110,7 +110,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
@ -127,7 +127,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-cpu-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
4
.github/workflows/inductor.yml
vendored
4
.github/workflows/inductor.yml
vendored
@ -79,7 +79,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
@ -101,7 +101,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-cpu-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
2
.github/workflows/nightly.yml
vendored
2
.github/workflows/nightly.yml
vendored
@ -54,7 +54,7 @@ jobs:
|
||||
- get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11
|
||||
build-environment: linux-jammy-py3.10-gcc11
|
||||
docker-image: ${{ needs.docs-build.outputs.docker-image }}
|
||||
push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
|
||||
run-doxygen: true
|
||||
|
10
.github/workflows/operator_benchmark.yml
vendored
10
.github/workflows/operator_benchmark.yml
vendored
@ -14,6 +14,10 @@ on:
|
||||
schedule:
|
||||
# Run at 07:00 UTC every Sunday
|
||||
- cron: 0 7 * * 0
|
||||
pull_request:
|
||||
paths:
|
||||
- benchmarks/operator_benchmark/**
|
||||
- .github/workflows/operator_benchmark.yml
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
@ -29,7 +33,7 @@ jobs:
|
||||
name: opbenchmark-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -42,7 +46,7 @@ jobs:
|
||||
name: opbenchmark-on-demand-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -55,7 +59,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: opbenchmark-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
2
.github/workflows/pull.yml
vendored
2
.github/workflows/pull.yml
vendored
@ -127,6 +127,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
# More memory is needed to build with asan
|
||||
runner: linux.2xlarge.memory
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.10-clang18-asan
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
|
||||
|
2
.github/workflows/slow.yml
vendored
2
.github/workflows/slow.yml
vendored
@ -140,6 +140,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
# More memory is needed to build with asan
|
||||
runner: linux.2xlarge.memory
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.10-clang18-asan
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
|
||||
|
4
.github/workflows/trunk.yml
vendored
4
.github/workflows/trunk.yml
vendored
@ -240,7 +240,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11
|
||||
build-environment: linux-jammy-py3.10-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -255,7 +255,7 @@ jobs:
|
||||
- verify-cachebench-cpu-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11
|
||||
build-environment: linux-jammy-py3.10-gcc11
|
||||
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
@ -13,7 +13,7 @@ exclude_patterns = [
|
||||
'**/fb/**',
|
||||
'functorch/docs/**',
|
||||
'functorch/examples/**',
|
||||
'functorch/notebooks/**',
|
||||
'functorch/docs/source/tutorials/**',
|
||||
'torch/_inductor/fx_passes/serialized_patterns/**',
|
||||
'torch/_inductor/autoheuristic/artifacts/**',
|
||||
'scripts/**',
|
||||
@ -1568,7 +1568,6 @@ include_patterns = [
|
||||
exclude_patterns = [
|
||||
'caffe2/**',
|
||||
'functorch/docs/**',
|
||||
'functorch/notebooks/**',
|
||||
'torch/_inductor/fx_passes/serialized_patterns/**',
|
||||
'torch/_inductor/autoheuristic/artifacts/**',
|
||||
'test/dynamo/cpython/**',
|
||||
|
@ -810,7 +810,7 @@ cc_library(
|
||||
name = "torch_python",
|
||||
srcs = libtorch_python_core_sources
|
||||
+ if_cuda(libtorch_python_cuda_sources)
|
||||
+ if_cuda(libtorch_python_distributed_sources)
|
||||
+ libtorch_python_distributed_sources
|
||||
+ GENERATED_AUTOGRAD_PYTHON,
|
||||
hdrs = glob([
|
||||
"torch/csrc/generic/*.cpp",
|
||||
|
@ -234,6 +234,7 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on"
|
||||
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
|
||||
option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
|
||||
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
|
||||
option(USE_LSAN "Use Leak Sanitizer" OFF)
|
||||
option(USE_TSAN "Use Thread Sanitizer" OFF)
|
||||
option(USE_CUDA "Use CUDA" ON)
|
||||
option(USE_XPU "Use XPU" ON)
|
||||
@ -873,7 +874,7 @@ cmake_dependent_option(
|
||||
"Whether to build the flash_attention kernel for scaled dot product attention.\
|
||||
Will be disabled if not supported by the platform"
|
||||
ON
|
||||
"USE_CUDA OR USE_ROCM;NOT MSVC"
|
||||
"USE_CUDA OR USE_ROCM"
|
||||
OFF)
|
||||
|
||||
cmake_dependent_option(
|
||||
@ -889,9 +890,9 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
|
||||
set(USE_FBGEMM_GENAI off)
|
||||
endif()
|
||||
|
||||
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100
|
||||
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a")
|
||||
message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100")
|
||||
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
|
||||
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
|
||||
set(USE_FBGEMM_GENAI ON)
|
||||
endif()
|
||||
|
||||
@ -908,7 +909,7 @@ cmake_dependent_option(
|
||||
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
|
||||
#
|
||||
if(USE_ROCM)
|
||||
if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
|
||||
if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
|
||||
include(cmake/External/aotriton.cmake)
|
||||
endif()
|
||||
endif()
|
||||
|
@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
|
||||
|
||||
| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 |
|
||||
| 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
|
||||
| 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
|
||||
| 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |
|
||||
|
@ -16,6 +16,8 @@ However, if you believe you have found a security vulnerability in PyTorch, we e
|
||||
|
||||
Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
|
||||
|
||||
All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml), as it is still likely a valid issue within the framework.
|
||||
|
||||
Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
|
||||
|
||||
https://www.facebook.com/whitehat
|
||||
|
@ -265,6 +265,14 @@ IF(USE_FBGEMM_GENAI)
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
|
||||
list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX})
|
||||
|
||||
# PyTorch is not built for 10.0a in CI, due to lack of portability,
|
||||
# so we need to explicitly build these files for 10.0a.
|
||||
foreach(cu_file ${fbgemm_genai_native_cuda_cu})
|
||||
_BUILD_FOR_ADDITIONAL_ARCHS(
|
||||
"${cu_file}"
|
||||
"100a")
|
||||
endforeach()
|
||||
|
||||
file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp
|
||||
"${FBGEMM_GENAI_SRCS}/common/*.cpp"
|
||||
)
|
||||
|
@ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
"resize_ called on tensor with symbolic shape")
|
||||
TORCH_CHECK(
|
||||
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
|
||||
"number of dimensions must be sparse_dim (",
|
||||
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
|
||||
size.size(),
|
||||
", sparse_dim = ",
|
||||
sparse_dim,
|
||||
") + dense_dim (",
|
||||
dense_dim,
|
||||
"), but got ",
|
||||
size.size());
|
||||
", dense_dim = ",
|
||||
dense_dim);
|
||||
if (nnz() > 0) {
|
||||
[[maybe_unused]] auto constexpr alt_options_msg =
|
||||
"You could try the following options:\n\
|
||||
@ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
"resize_and_clear_ called on tensor with symbolic shape")
|
||||
TORCH_CHECK(
|
||||
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
|
||||
"number of dimensions must be sparse_dim (",
|
||||
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
|
||||
size.size(),
|
||||
", sparse_dim = ",
|
||||
sparse_dim,
|
||||
") + dense_dim (",
|
||||
dense_dim,
|
||||
"), but got ",
|
||||
size.size());
|
||||
", dense_dim = ",
|
||||
dense_dim);
|
||||
|
||||
set_sizes_and_strides(size, std::vector<int64_t>(size.size()));
|
||||
sparse_dim_ = sparse_dim;
|
||||
|
@ -644,6 +644,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
|
||||
void * beta_ptr = &fbeta;
|
||||
#ifdef USE_ROCM
|
||||
int flag = 0;
|
||||
rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
|
||||
rocblas_datatype d_type = c_type;
|
||||
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
|
||||
flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
|
||||
#endif
|
||||
@ -652,8 +654,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
|
||||
hipOperationToRocOperation(opb), (int)m, (int)n, (int)k,
|
||||
(void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea,
|
||||
b, rocblas_datatype_f16_r, (int)ldb, strideb,
|
||||
(void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec,
|
||||
c, rocblas_datatype_f16_r, (int)ldc, stridec,
|
||||
(void*)beta_ptr, c, c_type, (int)ldc, stridec,
|
||||
c, d_type, (int)ldc, stridec,
|
||||
(int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard,
|
||||
0, flag)));
|
||||
#else
|
||||
@ -1096,6 +1098,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
|
||||
GEMM_CHECK_ARGVALUES(at::Half);
|
||||
#ifdef USE_ROCM
|
||||
int flag = 0;
|
||||
rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
|
||||
rocblas_datatype d_type = c_type;
|
||||
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
|
||||
flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
|
||||
#endif
|
||||
@ -1115,10 +1119,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
|
||||
ldb,
|
||||
beta_ptr,
|
||||
c,
|
||||
rocblas_datatype_f16_r,
|
||||
c_type,
|
||||
ldc,
|
||||
c,
|
||||
rocblas_datatype_f16_r,
|
||||
d_type,
|
||||
ldc,
|
||||
rocblas_datatype_f32_r,
|
||||
rocblas_gemm_algo_standard,
|
||||
|
@ -45,6 +45,24 @@ struct OffsetCalculator {
|
||||
|
||||
C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
|
||||
offset_type offsets;
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
if ((dims > 0) && (dims <= 2)) {
|
||||
auto divmod = sizes_[0].divmod(linear_idx);
|
||||
#pragma unroll
|
||||
for (int arg = 0; arg < NARGS; arg++)
|
||||
offsets[arg] = divmod.mod * strides_[0][arg];
|
||||
if (dims >= 2) {
|
||||
divmod = sizes_[1].divmod(divmod.div);
|
||||
#pragma unroll
|
||||
for (int arg = 0; arg < NARGS; arg++)
|
||||
offsets[arg] += divmod.mod * strides_[1][arg];
|
||||
}
|
||||
// [...]
|
||||
return offsets;
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma unroll
|
||||
for (int arg = 0; arg < NARGS; arg++) {
|
||||
offsets[arg] = 0;
|
||||
|
@ -457,24 +457,9 @@ void gemm(
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
// for the fallback path, first compute gemm with beta = 0,
|
||||
// and then add c in full precision.
|
||||
int64_t c_size = n * m;
|
||||
std::vector<float> float_c(c_size, 0.f);
|
||||
gemm_no_downcast_stub(
|
||||
at::kCPU, at::kBFloat16,
|
||||
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
|
||||
for (const auto j : c10::irange(n)) {
|
||||
for (const auto i : c10::irange(m)) {
|
||||
auto offset = j * ldc + i;
|
||||
// beta == 0 won't propagate NaN from C
|
||||
if (beta == 0.f) {
|
||||
c[offset] = float_c[j * m + i];
|
||||
} else {
|
||||
c[offset] = beta * c[offset] + float_c[j * m + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
}
|
||||
|
||||
void gemm(
|
||||
@ -493,24 +478,9 @@ void gemm(
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
// for the fallback path, first compute gemm with beta = 0,
|
||||
// and then add c in full precision.
|
||||
int64_t c_size = n * m;
|
||||
std::vector<float> float_c(c_size, 0.f);
|
||||
gemm_no_downcast_stub(
|
||||
at::kCPU, at::kHalf,
|
||||
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
|
||||
for (const auto j : c10::irange(n)) {
|
||||
for (const auto i : c10::irange(m)) {
|
||||
auto offset = j * ldc + i;
|
||||
// beta == 0 won't propagate NaN from C
|
||||
if (beta == 0.f) {
|
||||
c[offset] = float_c[j * m + i];
|
||||
} else {
|
||||
c[offset] = beta * c[offset] + float_c[j * m + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
}
|
||||
|
||||
void gemm(
|
||||
|
@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
|
||||
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
|
||||
// Used by default on x86 platforms and on AArch64+ACL
|
||||
static inline int64_t get_mkldnn_matmul_min_dim() {
|
||||
static auto value = [&] {
|
||||
const int64_t default_min_dim = [&] {
|
||||
@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
|
||||
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static void addmm_impl_cpu_(
|
||||
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
|
||||
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
|
||||
@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
|
||||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
|
||||
};
|
||||
|
||||
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
|
||||
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
|
||||
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
|
||||
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
|
||||
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
|
||||
try {
|
||||
@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (contraction_size * res_rows * res_cols < 400) {
|
||||
if (is_bmm_out) {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] {
|
||||
|
@ -624,7 +624,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
|
||||
if (backend == BatchNormBackend::Miopen) {
|
||||
return std::tuple_cat(
|
||||
at::miopen_batch_norm(
|
||||
input.contiguous(), weight.contiguous(), bias.contiguous(),
|
||||
input.contiguous(input.suggest_memory_format()),
|
||||
weight.contiguous(),
|
||||
bias.contiguous(),
|
||||
running_mean.defined() ? running_mean.contiguous() : running_mean,
|
||||
running_var.defined() ? running_var.contiguous() : running_var,
|
||||
training, momentum, eps),
|
||||
|
@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) {
|
||||
[zero, one_sixth, three, six] GPU_LAMBDA(
|
||||
scalar_t self_val) -> scalar_t {
|
||||
opmath_t x = static_cast<opmath_t>(self_val);
|
||||
return std::min(std::max(x + three, zero), six) * one_sixth;
|
||||
return std::min<opmath_t>(std::max<opmath_t>(x + three, zero), six) * one_sixth;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
@ -1080,16 +1080,6 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals
|
||||
#endif
|
||||
}
|
||||
|
||||
static bool _grouped_mm_allowed_device() {
|
||||
#ifdef USE_ROCM
|
||||
return false;
|
||||
#else
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
// CUDA capability 8.0 and greater
|
||||
return dprops->major >= 8;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
static bool _scaled_mm_is_fnuz() {
|
||||
return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
|
||||
@ -1786,14 +1776,19 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
std::optional<c10::ScalarType> out_dtype) {
|
||||
#ifndef USE_ROCM
|
||||
_grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
|
||||
bool a_b_and_out_are_bf16 = (
|
||||
mat_a.dtype() == at::kBFloat16 &&
|
||||
mat_b.dtype() == at::kBFloat16 &&
|
||||
out_dtype.value_or(at::kBFloat16) == at::kBFloat16
|
||||
);
|
||||
#ifndef USE_ROCM
|
||||
bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16;
|
||||
#else
|
||||
// _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
|
||||
// the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
|
||||
bool use_fast_path = false;
|
||||
#endif
|
||||
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
|
||||
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
|
||||
if (use_fast_path) {
|
||||
@ -1803,9 +1798,6 @@ std::optional<c10::ScalarType> out_dtype) {
|
||||
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
|
||||
}
|
||||
return out;
|
||||
#else
|
||||
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#include <ATen/ops/miopen_batch_norm_native.h>
|
||||
#include <ATen/ops/miopen_batch_norm_backward_native.h>
|
||||
#endif
|
||||
@ -102,7 +103,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
|
||||
mode = miopenBNSpatial;
|
||||
}
|
||||
|
||||
auto output_t = at::empty(input->sizes(), input->options());
|
||||
auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
|
||||
TensorArg output{ output_t, "output", 0 };
|
||||
|
||||
auto handle = getMiopenHandle();
|
||||
@ -170,20 +171,15 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
|
||||
const std::optional<Tensor>& save_var_t_opt,
|
||||
double epsilon) {
|
||||
// See [Note: hacky wrapper removal for optional tensor]
|
||||
const Tensor& running_mean =
|
||||
running_mean_opt.value_or(Tensor());
|
||||
const Tensor& running_var =
|
||||
running_var_opt.value_or(Tensor());
|
||||
const Tensor& save_mean_t =
|
||||
save_mean_t_opt.value_or(Tensor());
|
||||
const Tensor& save_var_t =
|
||||
save_var_t_opt.value_or(Tensor());
|
||||
const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
|
||||
const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
|
||||
|
||||
TensorArg input{ input_t, "input", 1 },
|
||||
grad_output{ grad_output_t, "grad_output", 2 },
|
||||
weight{ weight_t, "weight", 3 },
|
||||
save_mean{ save_mean_t, "save_mean", 4 },
|
||||
save_var{ save_var_t, "save_var", 5 };
|
||||
auto grad_output_contig =
|
||||
grad_output_t.contiguous(input_t.suggest_memory_format());
|
||||
TensorArg input{input_t, "input", 1},
|
||||
grad_output{grad_output_contig, "grad_output", 2},
|
||||
weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
|
||||
save_var{save_var_t, "save_var", 5};
|
||||
CheckedFrom c = "miopen_batch_norm_backward";
|
||||
|
||||
checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
|
||||
@ -195,7 +191,11 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
|
||||
}
|
||||
checkAllSameType(c, {input, grad_output});
|
||||
checkAllSameType(c, {weight, save_mean, save_var});
|
||||
checkAllContiguous(c, {input, grad_output, save_mean, save_var});
|
||||
// TODO: is weight required to be contiguous?
|
||||
checkAllContiguous(c, {save_mean, save_var});
|
||||
// TODO: TensorArg checks should start handling memory format
|
||||
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
|
||||
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
|
||||
checkDimRange(c, input, 2, 6 /* exclusive */);
|
||||
checkSameSize(c, input, grad_output);
|
||||
auto num_features = input->size(1);
|
||||
@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
|
||||
mode = miopenBNSpatial;
|
||||
}
|
||||
|
||||
auto grad_input_t = at::empty(input->sizes(), input->options());
|
||||
auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
|
||||
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
|
||||
auto grad_bias_t = at::empty(weight->sizes(), weight->options());
|
||||
|
||||
|
@ -617,6 +617,7 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) {
|
||||
// we allocate 1 here due to MacOS13 bug for gather MPSGraph op, look below for the error
|
||||
Tensor output_t = at::empty({1}, input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
|
||||
if (output_t.numel() == 0 || num_in_elements == 0) {
|
||||
output_t.fill_(std::numeric_limits<float>::quiet_NaN());
|
||||
return output_t;
|
||||
}
|
||||
|
||||
|
@ -1414,7 +1414,7 @@
|
||||
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
|
||||
structured_delegate: cat.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: cat_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: cat_sparse
|
||||
QuantizedCPU: cat_quantized_cpu
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
|
||||
tags: core
|
||||
@ -1798,7 +1798,7 @@
|
||||
device_guard: False
|
||||
dispatch:
|
||||
MkldnnCPU: copy_mkldnn_
|
||||
SparseCPU, SparseCUDA: copy_sparse_wrapper_
|
||||
SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
|
||||
CompositeExplicitAutograd: copy_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
|
||||
@ -2160,7 +2160,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: div.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse
|
||||
ZeroTensor: div_zerotensor
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
|
||||
tags: [core, pointwise]
|
||||
@ -2170,7 +2170,7 @@
|
||||
variants: method
|
||||
structured_delegate: div.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
|
||||
tags: pointwise
|
||||
|
||||
- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
|
||||
@ -2179,7 +2179,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: div_out
|
||||
SparseCPU, SparseCUDA: div_out_sparse_zerodim
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
|
||||
tags: pointwise
|
||||
|
||||
- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
|
||||
@ -2187,7 +2187,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: div.out_mode
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse
|
||||
tags: [core, pointwise]
|
||||
|
||||
- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
|
||||
@ -2195,7 +2195,7 @@
|
||||
variants: method
|
||||
structured_delegate: div.out_mode
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
|
||||
tags: pointwise
|
||||
|
||||
- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
|
||||
@ -2204,7 +2204,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: div_out_mode
|
||||
SparseCPU, SparseCUDA: div_out_sparse_zerodim
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
|
||||
tags: pointwise
|
||||
|
||||
# For C++ only, until we have conversion from C++ numbers to Tensor
|
||||
@ -2768,20 +2768,20 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: floor_divide
|
||||
SparseCPU, SparseCUDA: floor_divide_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
|
||||
|
||||
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: floor_divide_
|
||||
SparseCPU, SparseCUDA: floor_divide_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
|
||||
|
||||
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: floor_divide_out
|
||||
SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
|
||||
|
||||
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -4273,7 +4273,7 @@
|
||||
structured_delegate: mul.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: mul_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: mul_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
|
||||
MkldnnCPU: mkldnn_mul
|
||||
ZeroTensor: mul_zerotensor
|
||||
@ -4285,7 +4285,7 @@
|
||||
structured_delegate: mul.out
|
||||
variants: method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: mul_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
|
||||
MkldnnCPU: mkldnn_mul_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
|
||||
@ -4299,6 +4299,7 @@
|
||||
CPU, CUDA, MPS, MTIA: mul_out
|
||||
SparseCPU: mul_out_sparse_cpu
|
||||
SparseCUDA: mul_out_sparse_cuda
|
||||
SparseMPS: mul_out_sparse_mps
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
|
||||
MkldnnCPU: mkldnn_mul_out
|
||||
tags: pointwise
|
||||
@ -5848,7 +5849,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: sum
|
||||
SparseCPU, SparseCUDA, SparseMeta: sum_coo
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
|
||||
autogen: sum.out
|
||||
|
||||
@ -5859,7 +5860,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
NestedTensorCPU: NestedTensor_sum_dim_CPU
|
||||
SparseCPU, SparseCUDA: sum_sparse_coo
|
||||
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
|
||||
tags: core
|
||||
|
||||
@ -6975,7 +6976,7 @@
|
||||
CPU, CUDA: sub_out
|
||||
MPS: sub_out_mps
|
||||
MTIA: sub_out_mtia
|
||||
SparseCPU, SparseCUDA: sub_out_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
|
||||
tags: pointwise
|
||||
|
||||
- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
|
||||
@ -6983,7 +6984,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: sub.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: sub_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: sub_sparse
|
||||
ZeroTensor: sub_zerotensor
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
|
||||
tags: [core, pointwise]
|
||||
@ -6993,7 +6994,7 @@
|
||||
variants: method
|
||||
structured_delegate: sub.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: sub_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
|
||||
tags: pointwise
|
||||
# For C++ only, until we have conversion from C++ numbers to Tensor
|
||||
|
||||
@ -10342,7 +10343,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA: pow_Tensor_Scalar_out
|
||||
SparseCPU, SparseCUDA: pow_out_sparse_scalar
|
||||
SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
|
||||
MPS: pow_tensor_scalar_out_mps
|
||||
tags: pointwise
|
||||
|
||||
@ -10351,7 +10352,7 @@
|
||||
structured_delegate: pow.Tensor_Scalar_out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: pow_sparse_scalar
|
||||
SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar
|
||||
tags: [core, pointwise]
|
||||
|
||||
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Config.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/AccumulateType.h>
|
||||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <ATen/native/sparse/ParamUtils.h>
|
||||
#include <ATen/native/SparseTensorUtils.h>
|
||||
@ -295,6 +296,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
to exp functions as well as reuse of softmax implementation for
|
||||
log_softmax.
|
||||
*/
|
||||
using accscalar_t = at::acc_type<scalar_t, false>;
|
||||
auto sparse_dim = input.sparse_dim();
|
||||
auto indices = input._indices().contiguous();
|
||||
auto values = input._values().contiguous();
|
||||
@ -340,14 +342,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
continue;
|
||||
|
||||
/* Prepare scratch space */
|
||||
std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
|
||||
std::vector<scalar_t> exp_sums_row(nvalues, 0);
|
||||
std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
|
||||
std::vector<accscalar_t> exp_sums_row(nvalues, 0);
|
||||
|
||||
/* Compute mx */
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
mx_row[j] = std::max(mx_row[j], values_row[j]);
|
||||
mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -391,13 +391,13 @@ void _validate_sparse_coo_tensor_args(
|
||||
int64_t sparse_dim = indices.size(0);
|
||||
int64_t dense_dim = values.dim() - 1;
|
||||
TORCH_CHECK(
|
||||
static_cast<int64_t>(size.size()) == sparse_dim + dense_dim,
|
||||
"number of dimensions must be sparse_dim (",
|
||||
sparse_dim,
|
||||
") + dense_dim (",
|
||||
dense_dim,
|
||||
"), but got ",
|
||||
size.size());
|
||||
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
|
||||
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
|
||||
size.size(),
|
||||
", sparse_dim = ",
|
||||
sparse_dim,
|
||||
", dense_dim = ",
|
||||
dense_dim);
|
||||
|
||||
if (check_pinning) {
|
||||
TORCH_CHECK(
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/add_native.h>
|
||||
#include <ATen/ops/mul_native.h>
|
||||
#include <ATen/ops/empty_native.h>
|
||||
#include <ATen/ops/zeros_native.h>
|
||||
#include <ATen/ops/result_type.h>
|
||||
@ -20,10 +21,265 @@
|
||||
namespace at::native {
|
||||
|
||||
using namespace at::sparse;
|
||||
using namespace mps;
|
||||
|
||||
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
|
||||
#ifndef PYTORCH_JIT_COMPILE_SHADERS
|
||||
static auto& lib = MetalShaderLibrary::getBundledLibrary();
|
||||
#else
|
||||
#include <ATen/native/mps/Mul_metallib.h>
|
||||
#endif
|
||||
|
||||
Tensor& add_out_dense_sparse_mps(
|
||||
static SparseTensor& mul_out_dense_sparse_mps(
|
||||
const Tensor& dense,
|
||||
const Tensor& sparse,
|
||||
SparseTensor& out) {
|
||||
|
||||
TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
|
||||
TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
|
||||
TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());
|
||||
|
||||
const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
|
||||
TORCH_CHECK(dense.is_mps() || scalar_like,
|
||||
"mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());
|
||||
|
||||
const int64_t nnz = sparse._nnz();
|
||||
out.resize_as_(sparse);
|
||||
|
||||
auto commonDtype = at::result_type(dense, sparse);
|
||||
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
|
||||
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
|
||||
|
||||
auto indices = sparse._indices().contiguous();
|
||||
auto values = sparse._values().to(commonDtype).contiguous();
|
||||
|
||||
if (nnz == 0) {
|
||||
auto empty_vals = values.narrow(0, 0, 0);
|
||||
alias_into_sparse(out,
|
||||
indices.narrow(1, 0, 0),
|
||||
(out.scalar_type() == commonDtype) ? empty_vals
|
||||
: empty_vals.to(out.scalar_type()));
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
if (scalar_like) {
|
||||
auto scalar = dense;
|
||||
if (dense.numel() == 1 && dense.dim() > 0) {
|
||||
scalar = dense.view({});
|
||||
}
|
||||
scalar = scalar.to(values.options());
|
||||
auto out_vals = values.mul(scalar);
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
out_vals = out_vals.to(out.scalar_type());
|
||||
}
|
||||
|
||||
alias_into_sparse(out, indices, out_vals);
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
|
||||
"mul(dense, sparse): sizes must match exactly (no broadcasting): ",
|
||||
dense.sizes(), " vs ", sparse.sizes());
|
||||
|
||||
const int64_t ndim_i = sparse.sparse_dim();
|
||||
const int64_t ndim = dense.dim();
|
||||
TORCH_CHECK(
|
||||
ndim_i <= ndim,
|
||||
"mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);
|
||||
|
||||
// Prepare shapes
|
||||
int64_t view_rows = 1, view_cols = 1;
|
||||
for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
|
||||
for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);
|
||||
|
||||
auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
|
||||
auto out_vals = at::empty_like(values, values.options());
|
||||
|
||||
const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
|
||||
const uint32_t u_nnz = static_cast<uint32_t>(nnz);
|
||||
const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
|
||||
auto computeEncoder = stream->commandEncoder();
|
||||
[computeEncoder setComputePipelineState:pso];
|
||||
|
||||
const uint32_t gridWidth = u_view_cols;
|
||||
const uint32_t gridDepth = u_nnz;
|
||||
MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);
|
||||
|
||||
const uint32_t maxThreadsPerGroup = pso.maxTotalThreadsPerThreadgroup;
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
uint32_t tgWidth = std::min(gridWidth, tew);
|
||||
MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);
|
||||
|
||||
mtl_setArgs(
|
||||
computeEncoder,
|
||||
dense_mps,
|
||||
values,
|
||||
out_vals,
|
||||
indices,
|
||||
sparse.sizes(),
|
||||
std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
|
||||
);
|
||||
|
||||
[computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
|
||||
}
|
||||
});
|
||||
|
||||
Tensor final_vals = out_vals;
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
final_vals = final_vals.to(out.scalar_type());
|
||||
}
|
||||
|
||||
alias_into_sparse(out, indices, final_vals);
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
// Element-wise multiplication that writes a sparse COO result into `r_` on MPS.
//
// Parameters:
//   t_, src_ : the two operands. If either one is not sparse, the call is
//              forwarded to mul_out_dense_sparse_mps (dense operand first).
//              Otherwise both must be MPS tensors with the same sparse_dim and
//              exactly matching sizes (no broadcasting).
//   r_       : output tensor; must be on MPS. It is resized to the operand
//              shape, filled with the product, and returned.
//
// Sparse x sparse strategy: coalesce both operands, flatten their indices into
// scalar keys, binary-search the shorter key list inside the longer one on the
// GPU, then gather the matching value rows and multiply them.
SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
  TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());

  // Dense x sparse fallback (keep dense first)
  if (!t_.is_sparse() || !src_.is_sparse()) {
    const Tensor& dense = t_.is_sparse() ? src_ : t_;
    const Tensor& sparse = t_.is_sparse() ? t_ : src_;
    return mul_out_dense_sparse_mps(dense, sparse, r_);
  }

  TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
  TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
  TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
              "mul(sparse, sparse): must have same sparse_dim, got ",
              t_.sparse_dim(), " vs ", src_.sparse_dim());
  TORCH_CHECK(t_.sizes().equals(src_.sizes()),
              "mul(sparse, sparse): sizes must match exactly (no broadcasting).");

  // Coalesce and early-exit on structurally empty operands
  auto lhs = t_.coalesce();
  auto rhs = src_.coalesce();
  const int64_t lhs_nnz = lhs._nnz();
  const int64_t rhs_nnz = rhs._nnz();
  if (!lhs_nnz || !rhs_nnz) {
    r_.resize_as_(lhs);
    return r_.zero_();
  }

  // dtype checks and promotion
  auto commonDtype = at::result_type(lhs, rhs);
  TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
              "Can't convert result type ", commonDtype, " to output ", r_.scalar_type());

  const int64_t ndim_i = lhs.sparse_dim();

  // ndim_i == 0, at most one structural entry
  if (ndim_i == 0) {
    r_.resize_as_(lhs);

    // Both operands are known to be non-empty here (the empty case returned
    // above), so the first entry of each is always taken.
    auto out_indices = lhs._indices().narrow(1, 0, 1);
    Tensor lhs_vals = lhs._values().to(commonDtype).narrow(0, 0, 1);
    Tensor rhs_vals = rhs._values().to(commonDtype).narrow(0, 0, 1);

    Tensor out_values = lhs_vals.mul(rhs_vals);
    if (r_.scalar_type() != commonDtype) {
      out_values = out_values.to(r_.scalar_type());
    }

    alias_into_sparse(r_, out_indices, out_values);
    r_._coalesced_(true);
    return r_;
  }

  // General path, intersect keys, then gather + multiply on GPU
  const auto device = r_.device();
  auto stream = getCurrentMPSStream();

  auto lhs_indices = lhs._indices();
  auto rhs_indices = rhs._indices();
  auto lhs_values = lhs._values().to(commonDtype);
  auto rhs_values = rhs._values().to(commonDtype);

  // Flatten sparse indices to keys
  auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
  auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());

  // Intersect sorted keys (search the shorter in the longer)
  const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
  const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
  const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
  auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
  auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;

  // At most lenA matches are possible; `counter` receives the actual count.
  auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto counter = at::zeros({1}, at::device(device).dtype(kInt));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];
      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
                  static_cast<uint32_t>(lenB), A_is_lhs);
      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
    }
  });

  const uint32_t M = counter.item<int32_t>(); // number of structural matches

  // No overlapping entries: the product is structurally empty. Return before
  // the gather kernel so that no grid with a zero-sized dimension is dispatched.
  if (M == 0) {
    r_.resize_as_(lhs);
    return r_.zero_();
  }

  r_.resize_as_(lhs);

  // The intersect kernel hands out output slots through an atomic counter, so
  // matches arrive in arbitrary order. Sort them by lhs entry position (lhs is
  // coalesced, so this is also index order) and permute the rhs positions the
  // same way; otherwise the result below could not be flagged as coalesced.
  auto match_order = outA_idx.narrow(0, 0, M).sort(/*dim=*/0);
  auto lhs_match = std::get<0>(match_order);
  auto rhs_match = outB_idx.narrow(0, 0, M).index_select(0, std::get<1>(match_order));

  auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
  auto out_val_sizes = lhs_values.sizes().vec();
  out_val_sizes[0] = static_cast<int64_t>(M);
  auto out_values = at::empty(out_val_sizes, lhs_values.options());

  // Number of value elements per sparse entry (product of the dense dims).
  const uint32_t cols = static_cast<uint32_t>(
      lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc(
          "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];

      // One thread per (value column, match); threadgroups span columns only.
      const uint32_t tew = pso.threadExecutionWidth;
      uint32_t tgW = std::min(cols, tew);
      MTLSize grid = MTLSizeMake(cols, 1, M);
      MTLSize tgs = MTLSizeMake(tgW, 1, 1);

      mtl_setArgs(enc,
                  lhs_values, rhs_values,
                  lhs_match, rhs_match,
                  lhs_indices, out_indices,
                  out_values,
                  std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
                  std::array<uint32_t, 2>{M, cols});
      [enc dispatchThreads:grid threadsPerThreadgroup:tgs];
    }
  });

  if (r_.scalar_type() != commonDtype) {
    out_values = out_values.to(r_.scalar_type());
  }

  alias_into_sparse(r_, out_indices, out_values);
  r_._coalesced_(true);
  return r_;
}
|
||||
|
||||
static Tensor& add_out_dense_sparse_mps(
|
||||
Tensor& out,
|
||||
const Tensor& dense,
|
||||
const SparseTensor& sparse,
|
||||
|
150
aten/src/ATen/native/sparse/mps/kernels/Mul.metal
Normal file
150
aten/src/ATen/native/sparse/mps/kernels/Mul.metal
Normal file
@ -0,0 +1,150 @@
|
||||
#include <metal_stdlib>
|
||||
#include <c10/metal/indexing.h>
|
||||
using namespace metal;
|
||||
|
||||
|
||||
// Multiplies each stored value of a sparse tensor by the dense element at the
// same logical position.
//
// Grid layout: gid.z selects the sparse entry, gid.x selects the column inside
// that entry's value row. sparse_params = (nnz, sparse_dim, view_cols).
// The product is computed in float and cast back to T on store.
template <typename T>
kernel void dense_sparse_mul_kernel(
    device const T* dense [[buffer(0)]],
    device const T* values [[buffer(1)]],
    device T* out_values [[buffer(2)]],
    device const long* indices [[buffer(3)]],
    device const long* sizes [[buffer(4)]],
    constant uint3& sparse_params [[buffer(5)]],
    uint3 gid [[thread_position_in_grid]]) {
  const uint entry = gid.z;
  const uint column = gid.x;
  const ulong entry_count = (ulong)sparse_params.x;
  const uint sparse_rank = sparse_params.y;
  const ulong row_width = (ulong)sparse_params.z;

  // Collapse the entry's multi-dimensional sparse index into one row number of
  // the dense operand viewed as (rows, view_cols). `indices` is read as
  // [sparse_dim][nnz].
  long flat_row = 0;
  for (uint dim = 0; dim < sparse_rank; ++dim) {
    flat_row = flat_row * sizes[dim] + indices[(ulong)dim * entry_count + (ulong)entry];
  }

  const ulong value_offset = (ulong)entry * row_width + (ulong)column;
  const ulong dense_offset = (ulong)flat_row * row_width + (ulong)column;

  const float product =
      static_cast<float>(values[value_offset]) * static_cast<float>(dense[dense_offset]);
  out_values[value_offset] = static_cast<T>(product);
}
|
||||
|
||||
// For every key in keysA (one thread per key), looks the key up in keysB with
// a lower-bound binary search and, on an exact hit, appends the pair of
// positions to the output arrays.
//
// The binary search requires keysB to be sorted in ascending order.
// Output slots are handed out through the atomic `counter`, so the order in
// which matches appear in outA_idx / outB_idx is not defined by this kernel;
// after the dispatch `counter` holds the number of matches.
//
// A_is_lhs chooses which output array receives which position:
//   true  -> outA_idx gets the position in keysA, outB_idx the position in keysB
//   false -> outA_idx gets the position in keysB, outB_idx the position in keysA
kernel void intersect_binary_search(
    device const long* keysA [[buffer(0)]],
    device const long* keysB [[buffer(1)]],
    device long* outA_idx [[buffer(2)]],
    device long* outB_idx [[buffer(3)]],
    device atomic_uint* counter [[buffer(4)]],
    constant uint& lenB [[buffer(5)]],
    constant bool& A_is_lhs [[buffer(6)]],
    uint3 tid_in_grid [[thread_position_in_grid]]) {
  uint gid = tid_in_grid.x;

  long key = keysA[gid];

  // lower_bound in B
  uint lo = 0;
  uint hi = lenB;
  while (lo < hi) {
    // Written as lo + half-distance so the midpoint cannot wrap around in
    // 32-bit unsigned arithmetic when lenB is large.
    uint mid = lo + ((hi - lo) >> 1);
    long v = keysB[mid];
    if (v < key) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  if (lo < lenB && keysB[lo] == key) {
    uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
    if (A_is_lhs) {
      outA_idx[pos] = (long)gid;
      outB_idx[pos] = (long)lo;
    } else {
      outA_idx[pos] = (long)lo;
      outB_idx[pos] = (long)gid;
    }
  }
}
|
||||
|
||||
|
||||
// Gathers the value rows selected by lhs_sel / rhs_sel, multiplies them
// element-wise, and copies the matching index column from lhs_indices.
//
// Grid layout: gid.z selects the match, gid.x selects the column inside the
// value row. dims_input = (sparse_dim, lhs entry count),
// dims_output = (match count, columns per value row).
// The product is computed in float and cast back to T on store.
template <typename T>
kernel void fused_gather_mul_kernel(
    device const T* lhs_vals [[buffer(0)]],
    device const T* rhs_vals [[buffer(1)]],
    device const long* lhs_sel [[buffer(2)]],
    device const long* rhs_sel [[buffer(3)]],
    device const long* lhs_indices [[buffer(4)]],
    device long* out_indices [[buffer(5)]],
    device T* out_vals [[buffer(6)]],
    constant uint2& dims_input [[buffer(7)]],
    constant uint2& dims_output [[buffer(8)]],
    uint3 gid [[thread_position_in_grid]]) {
  const uint match = gid.z;
  const uint column = gid.x;

  const uint sparse_rank = dims_input.x;
  const ulong lhs_count = (ulong)dims_input.y;
  const ulong match_count = (ulong)dims_output.x;
  const uint row_width = dims_output.y;

  const long lhs_row = lhs_sel[match];
  const long rhs_row = rhs_sel[match];

  // The thread in column 0 of each match also copies that match's index
  // column; the indices are taken from the lhs operand.
  if (column == 0) {
    for (uint dim = 0; dim < sparse_rank; ++dim) {
      out_indices[(ulong)dim * match_count + (ulong)match] =
          lhs_indices[(ulong)dim * lhs_count + (ulong)lhs_row];
    }
  }

  // Threads beyond the value-row width have nothing to multiply.
  if (column >= row_width) {
    return;
  }

  const ulong width = (ulong)row_width;
  const ulong lhs_offset = (ulong)lhs_row * width + (ulong)column;
  const ulong rhs_offset = (ulong)rhs_row * width + (ulong)column;
  const ulong out_offset = (ulong)match * width + (ulong)column;

  const float product = (float)lhs_vals[lhs_offset] * (float)rhs_vals[rhs_offset];
  out_vals[out_offset] = (T)product;
}
|
||||
|
||||
// Explicitly instantiates dense_sparse_mul_kernel<ELEM> and exposes it to the
// host under the name "dense_sparse_mul_kernel_<ELEM>".
#define INSTANTIATE_DENSE_SPARSE_MUL(ELEM)                              \
  template [[host_name("dense_sparse_mul_kernel_" #ELEM)]] kernel void \
  dense_sparse_mul_kernel<ELEM>(                                        \
      device const ELEM* dense [[buffer(0)]],                           \
      device const ELEM* values [[buffer(1)]],                          \
      device ELEM* out_values [[buffer(2)]],                            \
      device const long* indices [[buffer(3)]],                         \
      device const long* sizes [[buffer(4)]],                           \
      constant uint3& sparse_params [[buffer(5)]],                      \
      uint3 gid [[thread_position_in_grid]]);

// Element types for which the dense x sparse kernel is built.
INSTANTIATE_DENSE_SPARSE_MUL(float);
INSTANTIATE_DENSE_SPARSE_MUL(half);
INSTANTIATE_DENSE_SPARSE_MUL(bfloat);
|
||||
|
||||
// Explicitly instantiates fused_gather_mul_kernel<ELEM> and exposes it to the
// host under the name "fused_gather_mul_kernel_<ELEM>".
#define INSTANTIATE_FUSED_GATHER_MUL(ELEM)                              \
  template [[host_name("fused_gather_mul_kernel_" #ELEM)]] kernel void \
  fused_gather_mul_kernel<ELEM>(                                        \
      device const ELEM* lhs_vals [[buffer(0)]],                        \
      device const ELEM* rhs_vals [[buffer(1)]],                        \
      device const long* lhs_sel [[buffer(2)]],                         \
      device const long* rhs_sel [[buffer(3)]],                         \
      device const long* lhs_indices [[buffer(4)]],                     \
      device long* out_indices [[buffer(5)]],                           \
      device ELEM* out_vals [[buffer(6)]],                              \
      constant uint2& dims_input [[buffer(7)]],                         \
      constant uint2& dims_output [[buffer(8)]],                        \
      uint3 gid [[thread_position_in_grid]]);

// Element types for which the sparse x sparse gather/multiply kernel is built.
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);
|
@ -95,6 +95,72 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
|
||||
namespace pytorch_flash
|
||||
{
|
||||
std::tuple<
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor>
|
||||
mha_fwd(
|
||||
const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
|
||||
const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
|
||||
const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
|
||||
std::optional<at::Tensor>&
|
||||
out_, // batch_size x seqlen_q x num_heads x head_size
|
||||
std::optional<at::Tensor>&
|
||||
alibi_slopes_, // num_heads or batch_size x num_heads
|
||||
const float p_dropout,
|
||||
const float softmax_scale,
|
||||
bool is_causal,
|
||||
std::optional<int64_t> window_size_left,
|
||||
std::optional<int64_t> window_size_right,
|
||||
const float softcap,
|
||||
const bool return_softmax,
|
||||
std::optional<at::Generator> gen_) {
|
||||
#if defined(USE_ROCM_CK_SDPA)
|
||||
if (at::globalContext().getROCmFAPreferredBackend() ==
|
||||
at::ROCmFABackend::Ck) {
|
||||
const int non_null_window_left = window_size_left.value_or(-1);
|
||||
const int non_null_window_right = window_size_right.value_or(-1);
|
||||
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
|
||||
return mha_fwd_ck(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
out_,
|
||||
p_dropout,
|
||||
softmax_scale,
|
||||
is_causal,
|
||||
non_null_window_left,
|
||||
non_null_window_right,
|
||||
return_softmax,
|
||||
gen_,
|
||||
dummy_attn_bias); // Not used in flash attention
|
||||
}
|
||||
#endif
|
||||
return mha_fwd_aot(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
out_,
|
||||
alibi_slopes_,
|
||||
p_dropout,
|
||||
softmax_scale,
|
||||
is_causal,
|
||||
window_size_left,
|
||||
window_size_right,
|
||||
return_softmax,
|
||||
gen_);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace at {
|
||||
|
||||
namespace cuda::philox {
|
||||
|
@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
|
||||
#endif
|
||||
|
||||
TORCH_API
|
||||
inline std::tuple<
|
||||
std::tuple<
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
@ -294,42 +294,7 @@ mha_fwd(
|
||||
std::optional<int64_t> window_size_right,
|
||||
const float softcap,
|
||||
const bool return_softmax,
|
||||
std::optional<at::Generator> gen_) {
|
||||
#if defined(USE_ROCM_CK_SDPA)
|
||||
if (at::globalContext().getROCmFAPreferredBackend() ==
|
||||
at::ROCmFABackend::Ck) {
|
||||
const int non_null_window_left = window_size_left.value_or(-1);
|
||||
const int non_null_window_right = window_size_right.value_or(-1);
|
||||
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
|
||||
return mha_fwd_ck(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
out_,
|
||||
p_dropout,
|
||||
softmax_scale,
|
||||
is_causal,
|
||||
non_null_window_left,
|
||||
non_null_window_right,
|
||||
return_softmax,
|
||||
gen_,
|
||||
dummy_attn_bias); // Not used in flash attention
|
||||
}
|
||||
#endif
|
||||
return mha_fwd_aot(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
out_,
|
||||
alibi_slopes_,
|
||||
p_dropout,
|
||||
softmax_scale,
|
||||
is_causal,
|
||||
window_size_left,
|
||||
window_size_right,
|
||||
return_softmax,
|
||||
gen_);
|
||||
}
|
||||
std::optional<at::Generator> gen_);
|
||||
|
||||
inline std::tuple<
|
||||
at::Tensor,
|
||||
|
@ -98,11 +98,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,pass,5
|
||||
doctr_det_predictor,pass,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,pass,4
|
||||
doctr_reco_predictor,pass,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -98,11 +98,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,pass,5
|
||||
doctr_det_predictor,pass,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,pass,4
|
||||
doctr_reco_predictor,pass,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -98,11 +98,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,pass,5
|
||||
doctr_det_predictor,pass,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,pass,4
|
||||
doctr_reco_predictor,pass,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -82,11 +82,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,pass,5
|
||||
doctr_det_predictor,pass,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,pass,4
|
||||
doctr_reco_predictor,pass,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -98,11 +98,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,pass,5
|
||||
doctr_det_predictor,pass,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,pass,4
|
||||
doctr_reco_predictor,pass,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -106,11 +106,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,eager_fail_to_run,5
|
||||
doctr_det_predictor,eager_fail_to_run,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,eager_fail_to_run,4
|
||||
doctr_reco_predictor,eager_fail_to_run,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -106,11 +106,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,eager_fail_to_run,5
|
||||
doctr_det_predictor,eager_fail_to_run,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,eager_fail_to_run,4
|
||||
doctr_reco_predictor,eager_fail_to_run,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -106,11 +106,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,eager_fail_to_run,5
|
||||
doctr_det_predictor,eager_fail_to_run,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,eager_fail_to_run,4
|
||||
doctr_reco_predictor,eager_fail_to_run,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -106,11 +106,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,eager_fail_to_run,5
|
||||
doctr_det_predictor,eager_fail_to_run,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,eager_fail_to_run,4
|
||||
doctr_reco_predictor,eager_fail_to_run,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -106,11 +106,11 @@ dlrm,pass,0
|
||||
|
||||
|
||||
|
||||
doctr_det_predictor,eager_fail_to_run,5
|
||||
doctr_det_predictor,eager_fail_to_run,3
|
||||
|
||||
|
||||
|
||||
doctr_reco_predictor,eager_fail_to_run,4
|
||||
doctr_reco_predictor,eager_fail_to_run,1
|
||||
|
||||
|
||||
|
||||
|
|
@ -219,9 +219,7 @@ skip:
|
||||
- timm_regnet
|
||||
- timm_nfnet
|
||||
|
||||
cuda:
|
||||
# Temporary until https://github.com/pytorch/pytorch/issues/162282 is fixed
|
||||
- sam_fast
|
||||
cuda: []
|
||||
|
||||
test:
|
||||
training:
|
||||
|
@ -4,6 +4,7 @@ import csv
|
||||
import functools
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import timeit
|
||||
from collections import namedtuple
|
||||
from dataclasses import asdict, dataclass
|
||||
@ -191,6 +192,11 @@ class BenchmarkRunner:
|
||||
self.predefined_minimum_secs = 1
|
||||
self.max_iters = 1e6
|
||||
self.use_jit = args.use_jit
|
||||
self.use_compile = args.use_compile
|
||||
if self.use_jit and self.use_compile:
|
||||
raise ValueError(
|
||||
"use_jit and use_compile are mutually exclusive, please specify one."
|
||||
)
|
||||
self.num_runs = args.num_runs
|
||||
self.print_per_iter = False
|
||||
self.output_csv = args.output_csv
|
||||
@ -222,7 +228,7 @@ class BenchmarkRunner:
|
||||
if self.args.operators:
|
||||
print(f"# {self.args.operators}")
|
||||
|
||||
def _print_perf_result(self, reported_run_time_us, test_case):
|
||||
def _print_perf_result(self, results, test_case):
|
||||
if self.args.report_aibench:
|
||||
# Output for AIBench
|
||||
# Print out per iteration execution time instead of avg time
|
||||
@ -236,12 +242,14 @@ class BenchmarkRunner:
|
||||
"type": test_name,
|
||||
"metric": "latency",
|
||||
"unit": "us",
|
||||
"value": str(reported_run_time_us[run]),
|
||||
"value": str(results["reported_run_time_us"[run]]),
|
||||
}
|
||||
)
|
||||
)
|
||||
else:
|
||||
print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}")
|
||||
print(
|
||||
f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}"
|
||||
)
|
||||
print(
|
||||
f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}"
|
||||
)
|
||||
@ -250,25 +258,33 @@ class BenchmarkRunner:
|
||||
if self.num_runs > 1:
|
||||
for run in range(self.num_runs):
|
||||
print(
|
||||
f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}"
|
||||
f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}"
|
||||
)
|
||||
print()
|
||||
else:
|
||||
print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n")
|
||||
print(
|
||||
f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}"
|
||||
)
|
||||
print(f"Peak Memory (KB) : {results['peak_memory']}\n")
|
||||
|
||||
def _perf_result_to_dict(self, reported_run_time_us, test_case):
|
||||
def _perf_result_to_dict(self, results, test_case):
|
||||
"""This function is the parallel of _print_perf_result, which instead of
|
||||
writing information to terminal, returns a dictionary.
|
||||
"""
|
||||
if self.args.report_aibench:
|
||||
return {}
|
||||
|
||||
out = {
|
||||
"test_name": test_case.test_config.test_name,
|
||||
"input_config": test_case.test_config.input_config,
|
||||
"mode": "JIT" if self.use_jit else "Eager",
|
||||
"runtime": (
|
||||
"JIT" if self.use_jit else "Compile" if self.use_compile else "Eager"
|
||||
),
|
||||
"run": "Backward" if test_case.test_config.run_backward else "Forward",
|
||||
"latency": round(reported_run_time_us[0], 3),
|
||||
"latency": round(results["reported_run_time_us"][0], 3),
|
||||
"latency unit": "us",
|
||||
"peak memory": results["peak_memory"],
|
||||
"memory unit": "KB",
|
||||
}
|
||||
|
||||
# parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary
|
||||
@ -330,6 +346,8 @@ class BenchmarkRunner:
|
||||
func = test_case.run_forward
|
||||
if self.use_jit:
|
||||
func = test_case.run_jit_forward
|
||||
if self.use_compile:
|
||||
func = test_case.run_compile_forward
|
||||
forward_time = timeit.timeit(
|
||||
functools.partial(func, iters, print_per_iter, cuda_sync), number=1
|
||||
)
|
||||
@ -346,7 +364,7 @@ class BenchmarkRunner:
|
||||
)
|
||||
return backward_time
|
||||
|
||||
def _measure_time(self, launch_test, test_case, iters, print_per_iter):
|
||||
def _measure_metrics(self, launch_test, test_case, iters, print_per_iter):
|
||||
"""
|
||||
This function execute the operator for <iters> iterations then look at the time.
|
||||
If it's not significant, the number of iterations will be increased before rerun.
|
||||
@ -354,8 +372,25 @@ class BenchmarkRunner:
|
||||
"""
|
||||
curr_test_total_time = 0
|
||||
time_trace = []
|
||||
peak_memory = 0
|
||||
input_values = test_case.op_bench.inputs.values()
|
||||
device, device_module = None, None
|
||||
if input_values and isinstance(next(iter(input_values)), torch.Tensor):
|
||||
# The device and device module information are crucial for memory metric calculation,
|
||||
# In case of ops where inputs are integers (not tensor), memory metrics need not be calculated.
|
||||
sample_input = next(iter(input_values))
|
||||
device = sample_input.device
|
||||
device_module = torch.get_device_module(device.type)
|
||||
# TODO: add support for cpu memory measurement
|
||||
while True:
|
||||
if hasattr(device_module, "reset_peak_memory_stats"):
|
||||
device_module.reset_peak_memory_stats(device)
|
||||
run_time_sec = launch_test(test_case, iters, print_per_iter)
|
||||
if hasattr(device_module, "synchronize"):
|
||||
device_module.synchronize(device)
|
||||
# Memory measurement process
|
||||
if hasattr(device_module, "max_memory_allocated"):
|
||||
peak_memory = device_module.max_memory_allocated(device)
|
||||
curr_test_total_time += run_time_sec
|
||||
# Analyze time after each run to decide if the result is stable
|
||||
results_are_significant = self._iteration_result_is_significant(
|
||||
@ -369,7 +404,13 @@ class BenchmarkRunner:
|
||||
time_trace.append(report_run_time)
|
||||
# Print out the time spent in each epoch in ms
|
||||
if self.args.report_aibench:
|
||||
mode = "JIT" if self.use_jit else "Eager"
|
||||
mode = (
|
||||
"JIT"
|
||||
if self.use_jit
|
||||
else "Compile"
|
||||
if self.use_compile
|
||||
else "Eager"
|
||||
)
|
||||
test_name = "_".join(
|
||||
[test_case.framework, test_case.test_config.test_name, mode]
|
||||
)
|
||||
@ -381,7 +422,7 @@ class BenchmarkRunner:
|
||||
"metric": "latency",
|
||||
"unit": "ms",
|
||||
"value": str(report_run_time / 1e3),
|
||||
}
|
||||
},
|
||||
)
|
||||
)
|
||||
if results_are_significant:
|
||||
@ -391,7 +432,7 @@ class BenchmarkRunner:
|
||||
# iteration count, and run the benchmark again...
|
||||
iters = self._predict_num_iter_needed(iters)
|
||||
reported_run_time_us = np.percentile(np.array(time_trace), 50)
|
||||
return reported_run_time_us
|
||||
return reported_run_time_us, peak_memory / 1024
|
||||
|
||||
def _check_keep(self, test_flag, cmd_flag):
|
||||
return cmd_flag is None or test_flag == cmd_flag
|
||||
@ -478,6 +519,7 @@ class BenchmarkRunner:
|
||||
self,
|
||||
perf_list,
|
||||
output_file,
|
||||
benchmark_name="PyTorch operator benchmark",
|
||||
):
|
||||
"""
|
||||
Write the result into JSON format, so that it can be uploaded to the benchmark database
|
||||
@ -495,8 +537,10 @@ class BenchmarkRunner:
|
||||
input_config = perf_item.get("input_config", "")
|
||||
run_type = perf_item.get("run")
|
||||
latency = perf_item.get("latency", 0)
|
||||
|
||||
dtype = "float32" # default
|
||||
peak_memory = perf_item.get("peak memory", 0)
|
||||
device = perf_item.get("device", "unknown")
|
||||
dtype = perf_item.get("dtype", "torch.float").split(".")[1]
|
||||
runtime = perf_item.get("runtime", None)
|
||||
|
||||
# Extract mode based on run_type
|
||||
mode = None
|
||||
@ -505,6 +549,22 @@ class BenchmarkRunner:
|
||||
elif run_type == "Backward":
|
||||
mode = "training"
|
||||
|
||||
# Extract use_compile from it
|
||||
if runtime == "Compile":
|
||||
use_compile = True
|
||||
elif runtime == "Eager":
|
||||
use_compile = False
|
||||
else:
|
||||
use_compile = None
|
||||
|
||||
device_arch = (
|
||||
torch.cuda.get_device_name(0)
|
||||
if device == "cuda"
|
||||
else platform.processor()
|
||||
if device == "cpu"
|
||||
else "unknown"
|
||||
)
|
||||
|
||||
# Create the record
|
||||
@dataclass
|
||||
class BenchmarkInfo:
|
||||
@ -532,12 +592,18 @@ class BenchmarkRunner:
|
||||
model: ModelInfo
|
||||
metric: MetricInfo
|
||||
|
||||
record = BenchmarkRecord(
|
||||
# Add record for latency
|
||||
record_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name="PyTorch operator benchmark",
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info={"input_config": input_config},
|
||||
extra_info={
|
||||
"input_config": input_config,
|
||||
"device": device,
|
||||
"arch": device_arch,
|
||||
"use_compile": use_compile,
|
||||
},
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name, type="micro-benchmark", origins=["pytorch"]
|
||||
@ -549,8 +615,17 @@ class BenchmarkRunner:
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_latency))
|
||||
|
||||
records.append(asdict(record))
|
||||
# Add record for peak memory
|
||||
record_memory = copy.deepcopy(record_latency)
|
||||
record_memory.metric = MetricInfo(
|
||||
name="peak memory",
|
||||
unit="KB",
|
||||
benchmark_values=[peak_memory],
|
||||
target_value=None,
|
||||
)
|
||||
records.append(asdict(record_memory))
|
||||
|
||||
# Write all records to the output file
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
@ -566,6 +641,7 @@ class BenchmarkRunner:
|
||||
"tag",
|
||||
"run_backward",
|
||||
"Execution Time",
|
||||
"Peak Memory (KB)",
|
||||
]
|
||||
|
||||
if self.args.output_json or self.args.output_json_for_dashboard:
|
||||
@ -603,13 +679,16 @@ class BenchmarkRunner:
|
||||
test_case, self.args.warmup_iterations, print_per_iter=False
|
||||
)
|
||||
# Actual Execution
|
||||
reported_time = [
|
||||
self._measure_time(
|
||||
results = [
|
||||
self._measure_metrics(
|
||||
launch_func, test_case, self.iters, self.print_per_iter
|
||||
)
|
||||
for _ in range(self.num_runs)
|
||||
]
|
||||
self._print_perf_result(reported_time, test_case)
|
||||
result_dict = dict()
|
||||
result_dict["reported_run_time_us"] = [r[0] for r in results]
|
||||
result_dict["peak_memory"] = results[0][1]
|
||||
self._print_perf_result(results=result_dict, test_case=test_case)
|
||||
|
||||
# output results to csv
|
||||
self._output_csv(
|
||||
@ -625,16 +704,17 @@ class BenchmarkRunner:
|
||||
),
|
||||
test_case.test_config.tag,
|
||||
test_case.test_config.run_backward,
|
||||
reported_time[0],
|
||||
result_dict["reported_run_time_us"][0],
|
||||
result_dict["peak_memory"],
|
||||
],
|
||||
)
|
||||
if self.args.output_json or self.args.output_json_for_dashboard:
|
||||
perf_list.append(
|
||||
self._perf_result_to_dict(reported_time, test_case)
|
||||
)
|
||||
perf_list.append(self._perf_result_to_dict(result_dict, test_case))
|
||||
|
||||
if self.args.output_json_for_dashboard:
|
||||
self._output_json(perf_list, self.args.output_json_for_dashboard)
|
||||
self._output_json(
|
||||
perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name
|
||||
)
|
||||
|
||||
if self.args.output_json:
|
||||
with open(self.args.output_json, "w") as f:
|
||||
|
@ -4,6 +4,15 @@ import time
|
||||
import torch
|
||||
|
||||
|
||||
# Import the C++ extension to register the _consume operator
|
||||
try:
|
||||
import benchmark_cpp_extension # noqa: F401
|
||||
except ImportError as err:
|
||||
# If the extension isn't built, the script must raise an error
|
||||
raise ImportError(
|
||||
"Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ."
|
||||
) from err
|
||||
|
||||
"""PyTorch performance microbenchmarks.
|
||||
|
||||
This module contains PyTorch-specific functionalities for performance
|
||||
@ -71,6 +80,16 @@ class TorchBenchmarkBase(torch.nn.Module):
|
||||
for _ in range(iters):
|
||||
torch.ops.operator_benchmark._consume(self.forward_impl())
|
||||
|
||||
def forward_impl_eager(self):
    """Call ``forward`` with the arguments produced by ``get_inputs``.

    Used to feed the inputs to ``forward`` in both the eager and the compile
    mode of local runs. Returns whatever ``forward`` returns.
    """
    inputs = self.get_inputs()
    return self.forward(*inputs)
|
||||
|
||||
def forward_consume_eager(self, iters: int):
    """Run the eager forward ``iters`` times, passing each result to ``_consume``.

    Undecorated counterpart of ``forward_consume``; compilation, when wanted,
    is applied from outside with ``torch.compile``.
    """
    for _step in range(iters):
        result = self.forward_impl_eager()
        torch.ops.operator_benchmark._consume(result)
|
||||
|
||||
def module_name(self):
|
||||
"""this is used to label the operator being benchmarked"""
|
||||
if self.user_given_name:
|
||||
@ -117,18 +136,32 @@ class PyTorchOperatorTestCase:
|
||||
self.framework = "PyTorch"
|
||||
self.time_series = []
|
||||
self._jit_forward_graph = None
|
||||
self._compile_forward_graph = None
|
||||
|
||||
def _generate_jit_forward_graph(self):
|
||||
"""generate a graph for the forward function via scripting"""
|
||||
scripted_op_bench = torch.jit.script(self.op_bench)
|
||||
return scripted_op_bench.forward_consume
|
||||
|
||||
def _generate_compile_forward_graph(self):
|
||||
"""generate a compiled graph for the forward function via torch.compile"""
|
||||
compiled_forward_consume = torch.compile(
|
||||
self.op_bench.forward_consume_eager, backend="inductor"
|
||||
)
|
||||
return compiled_forward_consume
|
||||
|
||||
def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
|
||||
"""Run the forward path of an op with JIT mode"""
|
||||
if self._jit_forward_graph is None:
|
||||
self._jit_forward_graph = self._generate_jit_forward_graph()
|
||||
self._jit_forward_graph(num_runs)
|
||||
|
||||
def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
|
||||
"""Run the forward path of an op with compile mode"""
|
||||
if self._compile_forward_graph is None:
|
||||
self._compile_forward_graph = self._generate_compile_forward_graph()
|
||||
self._compile_forward_graph(num_runs)
|
||||
|
||||
def _print_per_iter(self):
|
||||
# print last 50 values
|
||||
length = min(len(self.time_series), 50)
|
||||
@ -150,14 +183,14 @@ class PyTorchOperatorTestCase:
|
||||
if print_per_iter:
|
||||
for _ in range(num_runs):
|
||||
start_time = time.time()
|
||||
self.output = self.op_bench.forward_impl()
|
||||
self.output = self.op_bench.forward_impl_eager()
|
||||
if cuda_sync:
|
||||
torch.cuda.synchronize(torch.cuda.current_device())
|
||||
end_time = time.time()
|
||||
self.time_series.append((end_time - start_time) * 1e3)
|
||||
else:
|
||||
for _ in range(num_runs):
|
||||
self.output = self.op_bench.forward_impl()
|
||||
self.output = self.op_bench.forward_impl_eager()
|
||||
if cuda_sync:
|
||||
torch.cuda.synchronize(torch.cuda.current_device())
|
||||
|
||||
|
@ -62,6 +62,13 @@ def parse_args():
|
||||
default=None,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--benchmark-name",
|
||||
"--benchmark_name",
|
||||
help="Name of the benchmark to store results to",
|
||||
default="PyTorch operator benchmark",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--list-tests",
|
||||
"--list_tests",
|
||||
@ -135,6 +142,16 @@ def parse_args():
|
||||
help="Run operators with PyTorch JIT mode",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--use-compile",
|
||||
"--use_compile",
|
||||
type=benchmark_utils.str2bool,
|
||||
nargs="?",
|
||||
const=True,
|
||||
default=False,
|
||||
help="Run operators with PyTorch Compile mode",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--forward-only",
|
||||
"--forward_only",
|
||||
@ -162,7 +179,7 @@ def parse_args():
|
||||
"--output-json-for-dashboard",
|
||||
"--output_json_for_dashboard",
|
||||
help="Save results in JSON format for display on the OSS dashboard",
|
||||
default="False",
|
||||
default="benchmark-results.json",
|
||||
)
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
@ -1,5 +1,5 @@
|
||||
Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
|
||||
PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
|
||||
PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459
|
||||
PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
|
||||
PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
|
||||
PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449
|
||||
@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho
|
||||
PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588
|
||||
PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969
|
||||
PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547
|
||||
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739
|
||||
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375
|
||||
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333
|
||||
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664
|
||||
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875
|
||||
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525
|
||||
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458
|
||||
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728
|
||||
@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813
|
||||
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295
|
||||
PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189
|
||||
@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp
|
||||
PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763
|
||||
PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667
|
||||
PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333
|
||||
PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
|
||||
PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
|
||||
|
|
@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
|
||||
# for targets in subfolders
|
||||
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
|
||||
|
||||
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
|
||||
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
|
||||
|
||||
# a dictionary maps third party library name to fbsource and oss target
|
||||
THIRD_PARTY_LIBS = {
|
||||
|
@ -638,10 +638,13 @@ libtorch_nativert_sources = [
|
||||
"torch/nativert/kernels/KernelHandlerRegistry.cpp",
|
||||
"torch/nativert/kernels/TritonKernel.cpp",
|
||||
"torch/nativert/executor/triton/CpuTritonKernelManager.cpp",
|
||||
"torch/nativert/executor/AOTInductorDelegateExecutor.cpp",
|
||||
"torch/nativert/kernels/ETCallDelegateKernel.cpp",
|
||||
]
|
||||
|
||||
libtorch_nativert_cuda_sources = [
|
||||
"torch/nativert/executor/triton/CudaTritonKernelManager.cpp",
|
||||
"torch/nativert/executor/AOTInductorModelContainerCudaShim.cpp",
|
||||
]
|
||||
|
||||
torch_mobile_tracer_sources = [
|
||||
|
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
|
||||
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
|
||||
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
|
||||
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
|
||||
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
|
||||
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
|
||||
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
|
||||
|
@ -13,11 +13,10 @@ struct C10_API PyInterpreterHooksInterface {
|
||||
|
||||
// Get the PyInterpreter instance
|
||||
// Stub implementation throws error when Python is not available
|
||||
// We return nullptr rather than throwing an error since there are bits of c10
|
||||
// that expect an empty PyObjectSlot when python is not available.
|
||||
virtual PyInterpreter* getPyInterpreter() const {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"PyTorch was compiled without Python support. "
|
||||
"Cannot access Python interpreter from C++.");
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
|
||||
|
||||
PyObjectSlot::~PyObjectSlot() {
|
||||
maybe_destroy_pyobj();
|
||||
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
|
||||
|
||||
void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
if (owns_pyobj()) {
|
||||
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
|
||||
(*pyobj_interpreter_.load(std::memory_order_acquire))
|
||||
(*getGlobalPyInterpreter())
|
||||
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
|
||||
// NB: this destructor can only be entered when there are no
|
||||
// references to this C++ object (obviously), NOR any references
|
||||
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
}
|
||||
|
||||
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
return getGlobalPyInterpreter();
|
||||
}
|
||||
|
||||
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
auto interpreter = getGlobalPyInterpreter();
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
|
@ -6,10 +6,17 @@
|
||||
#include <c10/util/python_stub.h>
|
||||
#include <optional>
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
// Function pointer type for getting the global interpreter
|
||||
using GetPyInterpreterFn = PyInterpreter* (*)();
|
||||
|
||||
// Global function pointer (set by csrc initialization)
|
||||
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
|
||||
|
||||
// Helper function to get the global interpreter
|
||||
C10_API PyInterpreter* getGlobalPyInterpreter();
|
||||
|
||||
struct C10_API PyObjectSlot {
|
||||
public:
|
||||
PyObjectSlot();
|
||||
@ -26,8 +33,6 @@ struct C10_API PyObjectSlot {
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
|
||||
@ -55,18 +60,15 @@ struct C10_API PyObjectSlot {
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
std::optional<PyObject*> check_pyobj() const {
|
||||
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
|
||||
if (interpreter == nullptr || pyobj_ == nullptr) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
if (c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
@ -76,30 +78,6 @@ struct C10_API PyObjectSlot {
|
||||
void set_owns_pyobj(bool b);
|
||||
|
||||
private:
|
||||
// This field contains the interpreter tag for this object. See
|
||||
// Note [Python interpreter tag] for general context
|
||||
//
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// What memory_order do we need when accessing this atomic? We don't
|
||||
// need a single total modification order (as provided by
|
||||
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
|
||||
// transition from -1 to some positive integer and never changes afterwards.
|
||||
// Because there is only one modification, it trivially already has a total
|
||||
// modification order (e.g., we don't need fences or locked instructions on
|
||||
// x86)
|
||||
//
|
||||
// In fact, one could make a reasonable argument that relaxed reads are OK,
|
||||
// due to the presence of external locking (GIL) to ensure that interactions
|
||||
// with other data structures are still correctly synchronized, so that
|
||||
// we fall in the "Single-Location Data Structures" case as described in
|
||||
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
|
||||
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
|
||||
// as I get the same assembly in both cases. So I just use the more
|
||||
// conservative acquire (which will impede compiler optimizations but I don't
|
||||
// care)
|
||||
std::atomic<PyInterpreter*> pyobj_interpreter_;
|
||||
|
||||
// This field contains a reference to a PyObject representing this Tensor.
|
||||
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
|
||||
// PyObject for it and set this field. This field does not have to be
|
||||
|
@ -504,7 +504,16 @@ struct ExpandableSegment {
|
||||
SegmentRange share(SegmentRange range, std::ostream& buf) {
|
||||
auto begin = segmentLeft(range.ptr);
|
||||
auto end = segmentRight(range.ptr + range.size);
|
||||
ShareHeader header{getpid(), segment_size_, end - begin};
|
||||
|
||||
// header.pid needs to be padded with 4 bytes and initialized with
|
||||
// 0 values to avoid random padding of different bytes each time,
|
||||
// thereby ensuring that the handle can be correctly matched in
|
||||
// ipcMemHandle_to_devptr.
|
||||
ShareHeader header{};
|
||||
header.pid = getpid();
|
||||
header.segment_size = segment_size_;
|
||||
header.num_handles = end - begin;
|
||||
|
||||
buf.write((const char*)&header, sizeof(ShareHeader));
|
||||
for (auto i : c10::irange(begin, end)) {
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
|
@ -78,7 +78,7 @@ int device_count_impl(bool fail_if_no_driver) {
|
||||
"would like to use GPUs, turn off ASAN.");
|
||||
break;
|
||||
#endif // C10_ASAN_ENABLED
|
||||
#if _WIN32 && CUDA_VERSION >= 13000
|
||||
#if defined(_WIN32) && CUDA_VERSION >= 13000
|
||||
// Workaround for CUDA-13.0 error handling on Windows, see
|
||||
// https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585
|
||||
case cudaErrorNotSupported:
|
||||
|
@ -18,9 +18,9 @@ cuda_supported_platforms = [
|
||||
|
||||
def define_c10_ovrsource(name, is_mobile):
|
||||
if is_mobile:
|
||||
pp_flags = ["-DC10_MOBILE=1"]
|
||||
pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
|
||||
else:
|
||||
pp_flags = []
|
||||
pp_flags = ["-DC10_USE_GLOG"]
|
||||
|
||||
oxx_static_library(
|
||||
name = name,
|
||||
|
@ -196,20 +196,25 @@ TTarget* assign_ptr_(TTarget* rhs) {
|
||||
}
|
||||
}
|
||||
|
||||
// Increment needs to be acquire-release to make use_count() and
|
||||
// unique() reliable.
|
||||
// The only requirement for refcount increment is that it happens-before
|
||||
// decrement, so no additional memory ordering is needed.
|
||||
inline uint32_t atomic_refcount_increment(std::atomic<uint32_t>& refcount) {
|
||||
return refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
|
||||
return refcount.fetch_add(1, std::memory_order_relaxed) + 1;
|
||||
}
|
||||
|
||||
// weak_use_count() is only used for testing, so we don't need it to
|
||||
// be reliable. Relaxed should be fine.
|
||||
inline uint32_t atomic_weakcount_increment(std::atomic<uint32_t>& weakcount) {
|
||||
return weakcount.fetch_add(1, std::memory_order_relaxed) + 1;
|
||||
}
|
||||
|
||||
// Both decrements need to be acquire-release for correctness. See
|
||||
// e.g. std::shared_ptr implementation.
|
||||
// The requirement is that all modifications to the managed object happen-before
|
||||
// invocation of the managed object destructor, and that allocation of the
|
||||
// managed object storage happens-before deallocation of the storage.
|
||||
//
|
||||
// To get this ordering, all non-final decrements must synchronize-with the
|
||||
// final decrement. So all non-final decrements have to store-release while the
|
||||
// final decrement has to load-acquire, either directly or with the help of
|
||||
// fences. But it's easiest just to have all decrements be acq-rel. And it turns
|
||||
// out, on modern architectures and chips, it's also fastest.
|
||||
inline uint32_t atomic_refcount_decrement(std::atomic<uint32_t>& refcount) {
|
||||
return refcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
|
||||
}
|
||||
@ -332,7 +337,7 @@ class intrusive_ptr final {
|
||||
intrusive_ptr() noexcept
|
||||
: intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
|
||||
|
||||
intrusive_ptr(std::nullptr_t) noexcept
|
||||
/* implicit */ intrusive_ptr(std::nullptr_t) noexcept
|
||||
: intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
|
||||
|
||||
// This constructor will not increase the ref counter for you.
|
||||
@ -445,14 +450,14 @@ class intrusive_ptr final {
|
||||
if (target_ == NullType::singleton()) {
|
||||
return 0;
|
||||
}
|
||||
return target_->refcount_.load(std::memory_order_acquire);
|
||||
return target_->refcount_.load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
uint32_t weak_use_count() const noexcept {
|
||||
if (target_ == NullType::singleton()) {
|
||||
return 0;
|
||||
}
|
||||
return target_->weakcount_.load(std::memory_order_acquire);
|
||||
return target_->weakcount_.load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
bool unique() const noexcept {
|
||||
@ -851,14 +856,14 @@ class weak_intrusive_ptr final {
|
||||
return 0;
|
||||
}
|
||||
return target_->refcount_.load(
|
||||
std::memory_order_acquire); // refcount, not weakcount!
|
||||
std::memory_order_relaxed); // refcount, not weakcount!
|
||||
}
|
||||
|
||||
uint32_t weak_use_count() const noexcept {
|
||||
if (target_ == NullType::singleton()) {
|
||||
return 0;
|
||||
}
|
||||
return target_->weakcount_.load(std::memory_order_acquire);
|
||||
return target_->weakcount_.load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
bool expired() const noexcept {
|
||||
@ -866,18 +871,22 @@ class weak_intrusive_ptr final {
|
||||
}
|
||||
|
||||
intrusive_ptr<TTarget, NullType> lock() const noexcept {
|
||||
if (expired()) {
|
||||
if (target_ == NullType::singleton()) {
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
} else {
|
||||
auto refcount = target_->refcount_.load(std::memory_order_seq_cst);
|
||||
auto refcount = target_->refcount_.load(std::memory_order_relaxed);
|
||||
do {
|
||||
if (refcount == 0) {
|
||||
// Object already destructed, no strong references left anymore.
|
||||
// Return nullptr.
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
}
|
||||
} while (
|
||||
!target_->refcount_.compare_exchange_weak(refcount, refcount + 1));
|
||||
} while (!target_->refcount_.compare_exchange_weak(
|
||||
refcount,
|
||||
refcount + 1,
|
||||
std::memory_order_acquire,
|
||||
std::memory_order_relaxed));
|
||||
|
||||
return intrusive_ptr<TTarget, NullType>(
|
||||
target_, raw::DontIncreaseRefcount{});
|
||||
}
|
||||
|
@ -550,6 +550,13 @@ if(USE_CUDA OR USE_ROCM)
|
||||
append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS)
|
||||
endif()
|
||||
|
||||
if(USE_CUDA)
|
||||
append_filelist("libtorch_nativert_cuda_sources" Caffe2_GPU_SRCS)
|
||||
endif()
|
||||
if(USE_ROCM)
|
||||
append_filelist("libtorch_nativert_cuda_sources" Caffe2_HIP_SRCS)
|
||||
endif()
|
||||
|
||||
if(USE_CUDA)
|
||||
list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
|
||||
add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
|
||||
@ -1830,6 +1837,12 @@ if(BUILD_TEST)
|
||||
target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::undefined)
|
||||
endif()
|
||||
endif()
|
||||
if(USE_LSAN AND TARGET Sanitizer::leak)
|
||||
target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::leak)
|
||||
endif()
|
||||
if(USE_TSAN AND TARGET Sanitizer::thread)
|
||||
target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::thread)
|
||||
endif()
|
||||
else()
|
||||
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
|
||||
target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)
|
||||
|
@ -27,6 +27,10 @@
|
||||
#include "caffe2/serialize/versions.h"
|
||||
#include "miniz.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <Windows.h>
|
||||
#endif // _WIN32
|
||||
|
||||
namespace caffe2 {
|
||||
namespace serialize {
|
||||
constexpr std::string_view kDebugPklSuffix(".debug_pkl");
|
||||
@ -711,21 +715,35 @@ void PyTorchStreamWriter::setup(const string& file_name) {
|
||||
if (archive_name_.size() == 0) {
|
||||
CAFFE_THROW("invalid file name: ", file_name);
|
||||
}
|
||||
if (!writer_func_) {
|
||||
file_stream_.open(
|
||||
file_name,
|
||||
std::ofstream::out | std::ofstream::trunc | std::ofstream::binary);
|
||||
valid("opening archive ", file_name.c_str());
|
||||
|
||||
const std::string dir_name = parentdir(file_name);
|
||||
if (!dir_name.empty()) {
|
||||
struct stat st;
|
||||
bool dir_exists =
|
||||
(stat(dir_name.c_str(), &st) == 0 && (st.st_mode & S_IFDIR));
|
||||
TORCH_CHECK(
|
||||
dir_exists, "Parent directory ", dir_name, " does not exist.");
|
||||
const std::string dir_name = parentdir(file_name);
|
||||
if (!dir_name.empty()) {
|
||||
struct stat st;
|
||||
bool dir_exists =
|
||||
(stat(dir_name.c_str(), &st) == 0 && (st.st_mode & S_IFDIR));
|
||||
TORCH_CHECK(
|
||||
dir_exists, "Parent directory ", dir_name, " does not exist.");
|
||||
}
|
||||
TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened.");
|
||||
|
||||
if (!writer_func_) {
|
||||
valid("opening archive ", file_name.c_str());
|
||||
try {
|
||||
file_stream_.exceptions(std::ios_base::failbit | std::ios_base::badbit);
|
||||
file_stream_.open(
|
||||
file_name,
|
||||
std::ofstream::out | std::ofstream::trunc | std::ofstream::binary
|
||||
);
|
||||
} catch (const std::ios_base::failure& e) {
|
||||
#ifdef _WIN32
|
||||
// Windows have verbose error code, we prefer to use it than std errno.
|
||||
uint32_t error_code = GetLastError();
|
||||
CAFFE_THROW("open file failed with error code: ", error_code);
|
||||
#else // !_WIN32
|
||||
CAFFE_THROW("open file failed with strerror: ", strerror(errno));
|
||||
#endif // _WIN32
|
||||
}
|
||||
TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened.");
|
||||
|
||||
writer_func_ = [this](const void* buf, size_t nbytes) -> size_t {
|
||||
if (!buf) {
|
||||
// See [Note: write_record_metadata]
|
||||
|
@ -108,24 +108,32 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_BUILD_MOBILE)
|
||||
enable_ubsan()
|
||||
endif()
|
||||
|
||||
if(USE_ASAN OR USE_TSAN)
|
||||
if(USE_ASAN OR USE_LSAN OR USE_TSAN)
|
||||
find_package(Sanitizer REQUIRED)
|
||||
if(USE_ASAN)
|
||||
if(TARGET Sanitizer::address)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::address)
|
||||
else()
|
||||
message(WARNING "Not ASAN found. Suppress this warning with -DUSE_ASAN=OFF.")
|
||||
message(WARNING "ASAN not found. Suppress this warning with -DUSE_ASAN=OFF.")
|
||||
caffe2_update_option(USE_ASAN OFF)
|
||||
endif()
|
||||
if(TARGET Sanitizer::undefined)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::undefined)
|
||||
endif()
|
||||
endif()
|
||||
if(USE_LSAN)
|
||||
if(TARGET Sanitizer::leak)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::leak)
|
||||
else()
|
||||
message(WARNING "LSAN not found. Suppress this warning with -DUSE_LSAN=OFF.")
|
||||
caffe2_update_option(USE_LSAN OFF)
|
||||
endif()
|
||||
endif()
|
||||
if(USE_TSAN)
|
||||
if(TARGET Sanitizer::thread)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::thread)
|
||||
else()
|
||||
message(WARNING "Not TSAN found. Suppress this warning with -DUSE_TSAN=OFF.")
|
||||
message(WARNING "TSAN not found. Suppress this warning with -DUSE_TSAN=OFF.")
|
||||
caffe2_update_option(USE_TSAN OFF)
|
||||
endif()
|
||||
endif()
|
||||
|
113
cmake/External/aotriton.cmake
vendored
113
cmake/External/aotriton.cmake
vendored
@ -45,13 +45,88 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
)
|
||||
set(__AOTRITON_BASE_URL "https://github.com/ROCm/aotriton/releases/download/") # @lint-ignore
|
||||
set(__AOTRITON_Z "gz")
|
||||
# Set the default __AOTRITON_LIB path
|
||||
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so")
|
||||
if(WIN32)
|
||||
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/aotriton_v2.lib")
|
||||
endif()
|
||||
|
||||
function(aotriton_build_windows_dependencies dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
|
||||
# Windows-specific dependencies - build these first
|
||||
if(NOT noimage)
|
||||
message(FATAL_ERROR "noimage must be ON for Windows builds")
|
||||
endif()
|
||||
# Build dlfcn-win32
|
||||
set(__DLFCN_WIN32_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32")
|
||||
set(__DLFCN_WIN32_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32-install")
|
||||
|
||||
ExternalProject_Add(${dlfcn-win32_external}
|
||||
GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
|
||||
GIT_TAG v1.4.2
|
||||
PREFIX ${__DLFCN_WIN32_PREFIX}
|
||||
INSTALL_DIR ${__DLFCN_WIN32_INSTALL_DIR}
|
||||
CMAKE_ARGS
|
||||
-DCMAKE_INSTALL_PREFIX=${__DLFCN_WIN32_INSTALL_DIR}
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_C_COMPILER=cl
|
||||
-DCMAKE_CXX_COMPILER=cl
|
||||
-DBUILD_SHARED_LIBS=ON
|
||||
-DBUILD_TESTS=OFF
|
||||
BUILD_BYPRODUCTS
|
||||
"${__DLFCN_WIN32_INSTALL_DIR}/lib/dl.lib"
|
||||
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
|
||||
)
|
||||
ExternalProject_Add_Step(${dlfcn-win32_external} copy_to_aotriton
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
|
||||
"${__AOTRITON_INSTALL_DIR}/lib/"
|
||||
DEPENDEES install
|
||||
)
|
||||
set(${dlfcn-win32_DIR} "${__DLFCN_WIN32_INSTALL_DIR}/share/dlfcn-win32" CACHE PATH "Path to dlfcn-win32 CMake config" FORCE)
|
||||
|
||||
# Build xz/liblzma
|
||||
set(__XZ_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xz")
|
||||
set(__XZ_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/xz-install")
|
||||
|
||||
ExternalProject_Add(${xz_external}
|
||||
GIT_REPOSITORY https://github.com/tukaani-project/xz.git
|
||||
GIT_TAG v5.8.1
|
||||
PREFIX ${__XZ_PREFIX}
|
||||
INSTALL_DIR ${__XZ_INSTALL_DIR}
|
||||
CMAKE_ARGS
|
||||
-DCMAKE_INSTALL_PREFIX=${__XZ_INSTALL_DIR}
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DBUILD_SHARED_LIBS=ON
|
||||
-DENABLE_NLS=OFF
|
||||
-DXZ_TOOL_LZMAINFO=OFF
|
||||
-DXZ_TOOL_XZ=OFF
|
||||
-DXZ_TOOL_XZDEC=OFF
|
||||
-DXZ_TOOL_LZMADEC=OFF
|
||||
BUILD_BYPRODUCTS
|
||||
"${__XZ_INSTALL_DIR}/lib/lzma.lib"
|
||||
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
|
||||
)
|
||||
ExternalProject_Add_Step(${xz_external} copy_to_aotriton
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
|
||||
"${__AOTRITON_INSTALL_DIR}/lib/"
|
||||
DEPENDEES install
|
||||
)
|
||||
set(${liblzma_DIR} "${__XZ_INSTALL_DIR}/lib/cmake/liblzma" CACHE PATH "Path to xz/liblzma CMake config" FORCE)
|
||||
endfunction()
|
||||
|
||||
function(aotriton_build_from_source noimage project)
|
||||
if(noimage)
|
||||
SET(RECURSIVE "OFF")
|
||||
else()
|
||||
SET(RECURSIVE "ON")
|
||||
endif()
|
||||
if(WIN32)
|
||||
message(STATUS "Building AOTriton Windows dependencies")
|
||||
aotriton_build_windows_dependencies(dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
|
||||
endif()
|
||||
message(STATUS "PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}")
|
||||
|
||||
ExternalProject_Add(${project}
|
||||
GIT_REPOSITORY https://github.com/ROCm/aotriton.git
|
||||
GIT_SUBMODULES_RECURSE ${RECURSIVE}
|
||||
@ -65,12 +140,19 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
-DAOTRITON_GPU_BUILD_TIMEOUT=0
|
||||
-DAOTRITON_NO_PYTHON=ON
|
||||
-DAOTRITON_NOIMAGE_MODE=${noimage}
|
||||
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
|
||||
-DHIP_PLATFORM=amd
|
||||
$<$<BOOL:${WIN32}>:-Ddlfcn-win32_DIR=${dlfcn-win32_DIR}>
|
||||
$<$<BOOL:${WIN32}>:-Dliblzma_DIR=${liblzma_DIR}>
|
||||
BUILD_BYPRODUCTS
|
||||
"${__AOTRITON_LIB}"
|
||||
USES_TERMINAL_DOWNLOAD TRUE
|
||||
USES_TERMINAL_CONFIGURE TRUE
|
||||
USES_TERMINAL_BUILD TRUE
|
||||
USES_TERMINAL_INSTALL TRUE
|
||||
)
|
||||
if(WIN32)
|
||||
add_dependencies(${project} dlfcn-win32_external xz_external)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
|
||||
@ -95,7 +177,7 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_runtime"
|
||||
"${__AOTRITON_INSTALL_DIR}"
|
||||
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
|
||||
BUILD_BYPRODUCTS "${__AOTRITON_LIB}"
|
||||
)
|
||||
message(STATUS "Using AOTriton Runtime from pre-compiled binary ${__AOTRITON_URL}.\
|
||||
Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.")
|
||||
@ -111,14 +193,35 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
string(CONCAT __AOTRITON_URL
|
||||
"${__AOTRITON_BASE_URL}"
|
||||
"${__AOTRITON_VER}/${__AOTRITON_FILE}")
|
||||
|
||||
# Set up directories
|
||||
set(__AOTRITON_DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_download-${image})
|
||||
set(__AOTRITON_EXTRACT_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image})
|
||||
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR})
|
||||
set(__DOWNLOAD_NO_EXTRACT "")
|
||||
set(__BUILD_COMMANDS "")
|
||||
|
||||
# On Windows, we need custom tar extraction with UTF-8 support
|
||||
if(WIN32)
|
||||
set(__DOWNLOAD_NO_EXTRACT "DOWNLOAD_NO_EXTRACT;TRUE")
|
||||
set(__BUILD_COMMANDS
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory "${__AOTRITON_EXTRACT_DIR}"
|
||||
COMMAND tar --options hdrcharset=UTF-8 -xf "${__AOTRITON_DOWNLOAD_DIR}/${__AOTRITON_FILE}" -C "${__AOTRITON_EXTRACT_DIR}"
|
||||
)
|
||||
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}/aotriton)
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(${project}
|
||||
URL "${__AOTRITON_URL}"
|
||||
URL_HASH SHA256=${__AOTRITON_SHA256}
|
||||
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}
|
||||
DOWNLOAD_DIR ${__AOTRITON_DOWNLOAD_DIR}
|
||||
${__DOWNLOAD_NO_EXTRACT}
|
||||
SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
${__BUILD_COMMANDS}
|
||||
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}"
|
||||
"${__AOTRITON_INSTALL_SOURCE_DIR}"
|
||||
"${__AOTRITON_INSTALL_DIR}"
|
||||
BUILD_BYPRODUCTS
|
||||
"${__AOTRITON_INSTALL_DIR}/lib/aotriton.images/${image}/__signature__"
|
||||
@ -164,7 +267,7 @@ if(NOT __AOTRITON_INCLUDED)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endif()
|
||||
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so)
|
||||
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_LIB})
|
||||
target_include_directories(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
|
||||
set(AOTRITON_FOUND TRUE)
|
||||
endif() # __AOTRITON_INCLUDED
|
||||
|
@ -66,6 +66,7 @@ function(caffe2_print_configuration_summary)
|
||||
message(STATUS " LAPACK : ${LAPACK_INFO}")
|
||||
endif()
|
||||
message(STATUS " USE_ASAN : ${USE_ASAN}")
|
||||
message(STATUS " USE_LSAN : ${USE_LSAN}")
|
||||
message(STATUS " USE_TSAN : ${USE_TSAN}")
|
||||
message(STATUS " USE_CPP_CODE_COVERAGE : ${USE_CPP_CODE_COVERAGE}")
|
||||
message(STATUS " USE_CUDA : ${USE_CUDA}")
|
||||
|
80
docs/source/accelerator/autoload.md
Normal file
80
docs/source/accelerator/autoload.md
Normal file
@ -0,0 +1,80 @@
|
||||
# Autoload Mechanism
|
||||
|
||||
The **Autoload** mechanism in PyTorch simplifies the integration of a custom backend by enabling automatic discovery and initialization at runtime. This eliminates the need for explicit imports or manual initialization, allowing developers to seamlessly integrate a new accelerator or backend into PyTorch.
|
||||
|
||||
## Background
|
||||
|
||||
The **Autoload Device Extension** proposal in PyTorch is centered on improving support for various hardware backend devices, especially those implemented as out-of-the-tree extensions (not part of PyTorch’s main codebase). Currently, users must manually import or load these device-specific extensions to use them, which complicates the experience and increases cognitive overhead.
|
||||
|
||||
In contrast, in-tree devices (devices officially supported within PyTorch) are seamlessly integrated—users don’t need extra imports or steps. The goal of autoloading is to make out-of-the-tree devices just as easy to use, so users can follow the standard PyTorch device programming model without explicit loading or code changes. This would allow existing PyTorch applications to run on new devices without any modification, making hardware support more user-friendly and reducing barriers to adoption.
|
||||
|
||||
For more information about the background of **Autoload**, please refer to its [RFC](https://github.com/pytorch/pytorch/issues/122468).
|
||||
|
||||
## Design
|
||||
|
||||
The core idea of **Autoload** is to use Python’s plugin discovery (entry points) so that PyTorch automatically loads out-of-tree device extensions when torch is imported—no explicit user import needed.
|
||||
|
||||
For more details on the design of **Autoload**, please refer to [**How it works**](https://docs.pytorch.org/tutorials/unstable/python_extension_autoload.html#how-it-works).
|
||||
|
||||
## Implementation
|
||||
|
||||
This tutorial will take **OpenReg** as a new out-of-the-tree device and guide you through the steps to enable and use the **Autoload** mechanism.
|
||||
|
||||
### Entry Point Setup
|
||||
|
||||
To enable **Autoload**, register the `_autoload` function as an entry point in the [setup.py](https://github.com/pytorch/pytorch/blob/main/test/cpp_extensions/open_registration_extension/torch_openreg/setup.py) file.
|
||||
|
||||
::::{tab-set}
|
||||
|
||||
:::{tab-item} Python
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/setup.py
|
||||
:language: python
|
||||
:start-after: LITERALINCLUDE START: SETUP
|
||||
:end-before: LITERALINCLUDE END: SETUP
|
||||
:linenos:
|
||||
:emphasize-lines: 9-13
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
### Backend Setup
|
||||
|
||||
Define the initialization hook `_autoload` for backend initialization in [torch_openreg](https://github.com/pytorch/pytorch/blob/main/test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/__init__.py). This hook will be automatically invoked by PyTorch during startup.
|
||||
|
||||
::::{tab-set-code}
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/__init__.py
|
||||
:language: python
|
||||
:start-after: LITERALINCLUDE START: AUTOLOAD
|
||||
:end-before: LITERALINCLUDE END: AUTOLOAD
|
||||
:linenos:
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
## Result
|
||||
|
||||
After setting up the entry point and backend, build and install your backend. Now, we can use the new accelerator without explicitly importing it.
|
||||
|
||||
```{eval-rst}
|
||||
.. grid:: 2
|
||||
|
||||
.. grid-item-card:: :octicon:`terminal;1em;` Without Autoload
|
||||
|
||||
>>> import torch
|
||||
>>> import torch_openreg
|
||||
>>> torch.tensor(1, device="openreg")
|
||||
tensor(1, device='openreg:0')
|
||||
|
||||
.. grid-item-card:: :octicon:`terminal;1em;` With Autoload
|
||||
|
||||
>>> import torch # Automatically import torch_openreg
|
||||
>>>
|
||||
>>> torch.tensor(1, device="openreg")
|
||||
tensor(1, device='openreg:0')
|
||||
```
|
@ -2,6 +2,10 @@
|
||||
|
||||
Since PyTorch 2.1, the community has made significant progress in streamlining the process of integrating new accelerators into the PyTorch ecosystem. These improvements include, but are not limited to: refinements to the `PrivateUse1` Dispatch Key, the introduction and enhancement of core subsystem extension mechanisms, and the device-agnostic refactoring of key modules (e.g., `torch.accelerator`, `memory management`). Taken together, these advances provide the foundation for a **robust**, **flexible**, and **developer-friendly** pathway for accelerator integration.
|
||||
|
||||
```{note}
|
||||
This guide is a work in progress. For more details, please refer to the [roadmap](https://github.com/pytorch/pytorch/issues/158917).
|
||||
```
|
||||
|
||||
## Why Does This Matter?
|
||||
|
||||
This integration pathway offers several major benefits:
|
||||
@ -10,16 +14,6 @@ This integration pathway offers several major benefits:
|
||||
* **Future-proofing**: This is the default integration path for all future PyTorch features, meaning that as new modules and features are added, they will automatically support scaling to new accelerators if this path is followed.
|
||||
* **Autonomy**: Vendors maintain full control over their accelerator integration timelines, enabling fast iteration cycles and reducing reliance on upstream coordination.
|
||||
|
||||
## About This Document
|
||||
|
||||
This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerator in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation.
|
||||
|
||||
The goal is to help developers:
|
||||
|
||||
* Understand the full scope of accelerator integration;
|
||||
* Follow best practices to quickly launch new accelerators;
|
||||
* Avoid common pitfalls through clear, targeted examples.
|
||||
|
||||
## Target Audience
|
||||
|
||||
This document is intended for:
|
||||
@ -27,25 +21,28 @@ This document is intended for:
|
||||
* **Accelerator Developers** who are integrating accelerators into PyTorch;
|
||||
* **Advanced PyTorch Users** interested in the inner workings of key modules;
|
||||
|
||||
## Quick Overview
|
||||
## About This Document
|
||||
|
||||
This document outlines the key processes and practical scenarios involved in integrating new devices into PyTorch, providing developers with a comprehensive and detailed guide for bringing up new backends. The discussion is structured around four major axes:
|
||||
This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerators in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation. This series is structured around four major axes:
|
||||
|
||||
* **Runtime**: Covers core components such as Event, Stream, Memory, Generator, Guard, Hooks, as well as the supporting C++ scaffolding.
|
||||
* **Operators**: Involves the minimum necessary set of operators, forward and backward operators, fallback operators, fallthroughs, STUBs, etc. in both C++ and Python implementations.
|
||||
* **Python Frontend**: Focuses on Python bindings for modules and device-agnostic APIs.
|
||||
* **High-level Modules**: Explores integration with major subsystems such as `AMP`, `Compiler`, `ONNX`, `Distributed`, and so on.
|
||||
|
||||
Next, we will officially embark on the integration journey for a new PyTorch accelerator.
|
||||
The goal is to help developers:
|
||||
|
||||
```{note}
|
||||
This guide is a work in progress. For more details, please refer to the [roadmap](https://github.com/pytorch/pytorch/issues/158917).
|
||||
```
|
||||
* Understand the full scope of accelerator integration;
|
||||
* Follow best practices to quickly launch new accelerators;
|
||||
* Avoid common pitfalls through clear, targeted examples.
|
||||
|
||||
Next, we will delve into each chapter of this guide. Each chapter focuses on a key aspect of integration, providing detailed explanations and illustrative examples. Since some chapters build upon previous ones, readers are encouraged to follow the sequence to achieve a more coherent understanding.
|
||||
|
||||
```{toctree}
|
||||
:glob:
|
||||
:maxdepth: 1
|
||||
|
||||
autoload
|
||||
operators
|
||||
```
|
||||
|
||||
|
@ -169,7 +169,7 @@ Of course, global fallbacks can also be combined with a blacklist of fallbacks,
|
||||
|
||||
### PyTorch STUB
|
||||
|
||||
PyTorch also provides another approach for built-in operators: `STUB`. This method is essentially based on the `Step 1<step-one>` approach, but adds secondary scheduling capabilities (for example, scheduling based on CPU characteristics).
|
||||
PyTorch also provides another approach for built-in operators: `STUB`. This method is essentially based on the {ref}`Step 1<step-one>` approach, but adds secondary scheduling capabilities (for example, scheduling based on CPU characteristics).
|
||||
|
||||
```{note}
|
||||
The `STUB` method currently supports only a limited set of operators. For new accelerator devices, the advantage of the `STUB` method is that it significantly reduces the cost of development at the cost of a small performance overhead. PyTorch currently does not clearly list the set of operators that can be registered through `STUB`. Due to the large number of related operators, only the query method for the supported operator list is provided here.
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Working with Graph Breaks
|
||||
|
||||
As you might remember from (Dynamo Core Concepts)[programming_model.dynamo_core_concepts] that Dynamo performs a graph break when
|
||||
As you might remember from [Dynamo Core Concepts](programming_model.dynamo_core_concepts), Dynamo performs a graph break when
|
||||
it encounters code that can't be traced. In the default `torch.compile` settings, Dynamo compiles the FX graph
|
||||
that has been determined up to that point, executes the unsupported code in regular Python, and then resumes tracing.
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
# Tensor Parallelism - torch.distributed.tensor.parallel
|
||||
|
||||
Tensor Parallelism (TP) is built on top of the PyTorch DistributedTensor
|
||||
(DTensor)[https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md]
|
||||
([DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md))
|
||||
and provides different parallelism styles: Colwise, Rowwise, and Sequence Parallelism.
|
||||
|
||||
:::{warning}
|
||||
@ -89,4 +89,4 @@ Parallelized cross-entropy loss computation (loss parallelism), is supported via
|
||||
```
|
||||
:::{warning}
|
||||
The loss_parallel API is experimental and subject to change.
|
||||
:::
|
||||
:::
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user