Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-11-03 23:45:05 +08:00
Compare commits
195 Commits
viable/str...codex/enha
| SHA1 | Author | Date |
|---|---|---|
| cf5223c29d | |||
| b6e2679219 | |||
| 4a6abba0d9 | |||
| 96181d6f76 | |||
| 2164b66121 | |||
| bde18c445d | |||
| f3e43ff2d7 | |||
| 39d0c06ed0 | |||
| 4ab847bbc7 | |||
| 4bd1505f84 | |||
| 1f9614cef8 | |||
| 35f66b83f8 | |||
| 4a39820e5e | |||
| 600267ea56 | |||
| f11ac803d7 | |||
| ea42517e45 | |||
| 91c211fb8c | |||
| 660e369a68 | |||
| 2883b5ab77 | |||
| 9fff8155c3 | |||
| 331191ce4b | |||
| 2c5ed6e7c0 | |||
| 5d7360bb03 | |||
| 321e602692 | |||
| 3c5ca685d6 | |||
| 5178d0a480 | |||
| cf0a00d4f3 | |||
| 5ed4270440 | |||
| 8c728e129d | |||
| 9fc2c6446d | |||
| 409aece3f9 | |||
| b116c51330 | |||
| 2e1742dd63 | |||
| f7ad6dbad6 | |||
| f46bb04dcc | |||
| 6f6a919366 | |||
| 83d71dfb2f | |||
| 5103ecc5d8 | |||
| 9580539e2f | |||
| a11a66ef32 | |||
| 6b768e1890 | |||
| 35c4130fd1 | |||
| 34042a9145 | |||
| 9d1ab4f4bb | |||
| 3e0826c9d7 | |||
| 86c789849e | |||
| f3afbcf340 | |||
| 40b25578e4 | |||
| 412c6d28ec | |||
| 7d570129e0 | |||
| 97ca21106d | |||
| 27234792ad | |||
| b6b7a44dec | |||
| 3ddf2018d0 | |||
| fac6f20ae3 | |||
| 1894082000 | |||
| 5a66ff4915 | |||
| abadea70f3 | |||
| f414aa8e0d | |||
| e438db2546 | |||
| 10335ffb2c | |||
| f006aee601 | |||
| 8d53d788fe | |||
| 0b4f2b46d9 | |||
| 960c4b9937 | |||
| 1f8ee5da11 | |||
| da49a57d34 | |||
| 8ec8c14ace | |||
| 2d50678dcc | |||
| 3ca09d65f1 | |||
| 1bb68271b7 | |||
| 9eb89a4ad5 | |||
| 15d726005d | |||
| 16f9bef642 | |||
| 3c59351c6e | |||
| 7eb1eb4313 | |||
| f39789cdab | |||
| 3d9d41c801 | |||
| 5b0b4cda4a | |||
| 2a11ce2c78 | |||
| 3288fbf374 | |||
| fa5306b4f5 | |||
| 5656d45c8f | |||
| e40fe634b1 | |||
| 3db2164341 | |||
| 5bb8f04d3e | |||
| 5743d731c1 | |||
| aed66248a0 | |||
| 6c3c9414eb | |||
| eccf561326 | |||
| ddf8de28c2 | |||
| 7617b113ad | |||
| 2a760dc51e | |||
| 6c209bfc5c | |||
| 1051c1de5c | |||
| d1cbb74fb1 | |||
| 91c4db76cb | |||
| 4691fe6070 | |||
| ef50c6e3e3 | |||
| 86474ce996 | |||
| 18e18488e8 | |||
| f7082e92b3 | |||
| 95a053284c | |||
| c7e30ae4dd | |||
| dca73982c5 | |||
| 43848b71d9 | |||
| 15c8bdcc5e | |||
| 22e219d996 | |||
| bdc0a421d7 | |||
| ece5e0f01b | |||
| a34797e031 | |||
| f465ea6752 | |||
| a8edccfbf4 | |||
| 6389658ec6 | |||
| cc71ab86a6 | |||
| 2a7c486750 | |||
| 5f18f240de | |||
| 6b7970192f | |||
| 115af42e9d | |||
| 5f775bdfb7 | |||
| 8c54101933 | |||
| c45d56dd00 | |||
| 33b17bc619 | |||
| 22b1710252 | |||
| 4661200125 | |||
| 6a31f42da4 | |||
| c6a6c80a73 | |||
| bf717ce346 | |||
| f6f7676756 | |||
| e6d4b26776 | |||
| 6bb021c125 | |||
| b9e73e639e | |||
| 0319556a35 | |||
| f4cf75688f | |||
| 39189592fd | |||
| 235b995ce1 | |||
| ac7b4e7fe4 | |||
| c6329524d8 | |||
| b0985144b5 | |||
| 7cfecd76b2 | |||
| bac0f289a3 | |||
| 39c340ec9e | |||
| cfd46d13e6 | |||
| 0e5773b7fa | |||
| 2c2e1268b7 | |||
| 00f0365b95 | |||
| 6bb586eafd | |||
| 9697a7ce9e | |||
| 27eb36debb | |||
| a43c4c3972 | |||
| bcafea5c92 | |||
| 3924f784ba | |||
| 93e833de0f | |||
| 14791ea947 | |||
| 702f6e703b | |||
| 39b31a6bfd | |||
| 0fbe3f19c7 | |||
| 144378615a | |||
| 5dbae1eae2 | |||
| 3e03deab6f | |||
| 349e9e922d | |||
| 8b29c59844 | |||
| 53860ef4e1 | |||
| 723ba21393 | |||
| a10207e61b | |||
| ffda8e5ddf | |||
| 1a5d023a5b | |||
| 566ea4e86a | |||
| 9065364995 | |||
| 6eb8d9671b | |||
| b5c4f46bb9 | |||
| 773c6762b8 | |||
| 7320f44cdc | |||
| e5c0e6b5e3 | |||
| 7304b9e7d2 | |||
| 315ffdc1e4 | |||
| 8c590cab9d | |||
| 9357c31b53 | |||
| f63d16c6a9 | |||
| 8dfc8efffd | |||
| 3ffaab3bc8 | |||
| ebd0707578 | |||
| 76ddbc2bbb | |||
| 69c5c08a01 | |||
| 3dab36bdb4 | |||
| 1288c6d8bb | |||
| 80ed522910 | |||
| f7ab8a2710 | |||
| e419dc6d08 | |||
| 5f868ca110 | |||
| 20edc5b26a | |||
| 59a86cb137 | |||
| 36a37b81cd | |||
| 2610746375 | |||
| b1033789fe |
```diff
@@ -1 +1 @@
-e0dda9059d082537cee36be6c5e4fe3b18c880c0
+deb42f2a8e48f5032b4a98ee781a15fa87a157cf
```

```diff
@@ -1 +1 @@
-v2.28.3-1
+v2.27.5-1
```

```diff
@@ -1 +1 @@
-v2.28.3-1
+v2.27.7-1
```
```diff
@@ -19,8 +19,8 @@ pip_install \
   transformers==4.36.2

 pip_install coloredlogs packaging
-pip_install onnxruntime==1.22.1
-pip_install onnxscript==0.4.0
+pip_install onnxruntime==1.23.0
+pip_install onnxscript==0.5.3

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
```
9 .ci/docker/common/patch_libstdc.sh Executable file

```diff
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -xe
+# Script used in Linux x86 and aarch64 CD pipeline
+
+# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols.
+# see: https://github.com/pytorch/pytorch/issues/133437
+LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a)
+nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20- > weaken-symbols.txt
+objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED
```
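For readers who want to trace the same flow outside the CD image, here is a minimal Python sketch of what patch_libstdc.sh does, assuming a Linux builder where `libstdc++_nonshared.a` exists (e.g. the manylinux/AlmaLinux images) and `gcc`, `nm`, and `objcopy` are on PATH; the `line[19:]` slice mirrors the script's `cut -c 20-`:

```python
# Sketch only: reproduces the weaken-symbols flow of patch_libstdc.sh.
import subprocess

# Locate the statically linked archive that carries the CXX11 ABI symbols.
lib = subprocess.run(
    ["gcc", "-print-file-name=libstdc++_nonshared.a"],
    capture_output=True, text=True, check=True,
).stdout.strip()

# Keep only global defined-text ("T") symbols naming recursive_directory_iterator.
nm_out = subprocess.run(
    ["nm", "-g", lib], capture_output=True, text=True, check=True
).stdout
symbols = [
    line[19:]  # symbol name starts at column 20 of nm's 64-bit output
    for line in nm_out.splitlines()
    if " T " in line and "recursive_directory_iterator" in line
]

with open("weaken-symbols.txt", "w") as f:
    f.write("\n".join(symbols))

# objcopy demotes each listed global symbol to weak, rewriting the archive in place.
subprocess.run(["objcopy", "--weaken-symbols", "weaken-symbols.txt", lib, lib], check=True)
```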
```diff
@@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
     /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
     done;

+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
```
```diff
@@ -78,4 +78,6 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
 COPY --from=arm_compute /acl /acl
 ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
```
```diff
@@ -106,3 +106,5 @@ COPY --from=arm_compute /acl /acl
 RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
 ENV PATH=/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
```
```diff
@@ -341,7 +341,7 @@ onnx==1.18.0
 #Pinned versions:
 #test that import:

-onnxscript==0.4.0
+onnxscript==0.5.3
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
```
```diff
@@ -67,7 +67,7 @@ fi
 # wheels with cxx11-abi

 echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' ]]; then
+if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
   # We also check that there are cxx11 symbols in libtorch
   #
   echo "Checking that symbols in libtorch.so have the right gcc abi"
```
```diff
@@ -32,6 +32,9 @@ LIBTORCH_NAMESPACE_LIST = (
     "torch::",
 )

+# Patterns for detecting statically linked libstdc++ symbols
+STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]
+

 def _apply_libtorch_symbols(symbols):
     return [
@@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]:
     return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]


-def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
+def grep_symbols(
+    lib: str, patterns: list[Any], symbol_type: str | None = None
+) -> list[str]:
     def _grep_symbols(
         symbols: list[tuple[str, str, str]], patterns: list[Any]
     ) -> list[str]:
         rc = []
         for _s_addr, _s_type, s_name in symbols:
+            # Filter by symbol type if specified
+            if symbol_type and _s_type != symbol_type:
+                continue
             for pattern in patterns:
                 if pattern.match(s_name):
                     rc.append(s_name)
@@ -80,6 +88,18 @@ def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
     return functools.reduce(list.__add__, (x.result() for x in tasks), [])


+def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
+    cxx11_statically_linked_symbols = grep_symbols(
+        lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T"
+    )
+    num_statically_linked_symbols = len(cxx11_statically_linked_symbols)
+    print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}")
+    if num_statically_linked_symbols > 0:
+        raise RuntimeError(
+            f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}"
+        )
+
+
 def check_lib_symbols_for_abi_correctness(lib: str) -> None:
     print(f"lib: {lib}")
     cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
@@ -107,6 +127,7 @@ def main() -> None:

     libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
     check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
+    check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)


 if __name__ == "__main__":
```
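The new `symbol_type` filter is the crux of this check: only defined text ("T") symbols count as statically linked copies. Below is a self-contained sketch of that filtering logic; the nm-style triples are fabricated stand-ins, not real libtorch output:

```python
# Sketch of the symbol_type filter added to grep_symbols, on made-up data.
import re

STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]

symbols = [
    ("0000000000401000", "T", "std::filesystem::recursive_directory_iterator::operator++()"),
    ("0000000000402000", "W", "std::filesystem::recursive_directory_iterator::pop()"),
    ("0000000000403000", "T", "torch::jit::parse()"),
]

def grep_symbols(symbols, patterns, symbol_type=None):
    rc = []
    for _s_addr, s_type, s_name in symbols:
        # Skip e.g. weak ("W") symbols when only defined-text ("T") is requested.
        if symbol_type and s_type != symbol_type:
            continue
        for pattern in patterns:
            if pattern.match(s_name):
                rc.append(s_name)
    return rc

# Only the defined-text iterator symbol should trip the check.
assert grep_symbols(symbols, STATICALLY_LINKED_CXX11_ABI, symbol_type="T") == [
    "std::filesystem::recursive_directory_iterator::operator++()"
]
```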
```diff
@@ -34,12 +34,14 @@ fi

 # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
-NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
-if [ -n "$NUMBA_CUDA_DIR" ]; then
-  NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
-  pushd "$NUMBA_CUDA_DIR"
-  patch -p4 <"$NUMBA_PATCH"
-  popd
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+  NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
+  if [ -n "$NUMBA_CUDA_DIR" ]; then
+    NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
+    pushd "$NUMBA_CUDA_DIR"
+    patch -p4 <"$NUMBA_PATCH"
+    popd
+  fi
 fi

 echo "Environment variables:"
```
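The guard now keys off two things: the job must be a CUDA build, and `numba.cuda` must be importable. A small Python sketch of the same lookup the inline `python -c` performs, with a hypothetical helper name and `None` in place of the empty string when numba is missing:

```python
# Sketch of the NUMBA_CUDA_DIR lookup from the patched test script.
import os

def numba_cuda_dir():
    try:
        import numba.cuda  # only present when numba is installed
    except ImportError:
        return None
    # Directory that numba-cuda-13.patch is applied against.
    return os.path.dirname(numba.cuda.__file__)

if "cuda" in os.environ.get("BUILD_ENVIRONMENT", ""):
    print(numba_cuda_dir())
```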
```diff
@@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
+    set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0
     set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
-    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
+    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
 )

 set "CUDA_PATH=%CUDA_PATH_V128%"
```
```diff
@@ -59,13 +59,14 @@ performance-*,
 -performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
-readability-duplicate-include
+readability-duplicate-include,
+readability-misplaced-array-index,
-readability-redundant*
+readability-redundant*,
+readability-simplify-subscript-expr,
+readability-string-compare,
 -readability-redundant-access-specifiers,
 -readability-redundant-control-flow,
 -readability-redundant-inline-specifier,
 '
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
 WarningsAsErrors: '*'
```
2 .github/ci_commit_pins/vllm.txt vendored

```diff
@@ -1 +1 @@
-78a47f87ce259a48f0391fa9ae15add05ea7432b
+0ad9951c416d33c5da4f7a504fb162cbe62386f5
```

2 .github/ci_commit_pins/xla.txt vendored

```diff
@@ -1 +1 @@
-0fc62aa26a30ed7ca419d285f285cb5ba02c4394
+2a9138a26ee257fef05310ad3fecf7c55fe80d73
```
16 .github/ci_configs/vllm/Dockerfile.tmp_vllm vendored

```diff
@@ -202,7 +202,7 @@ ARG max_jobs=16
 ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=4
 ENV NVCC_THREADS=$nvcc_threads
-ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
+ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

 ARG USE_SCCACHE
@@ -297,16 +297,28 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
     echo "[INFO] Showing torch_build_versions.txt content:" && \
     cat torch_build_versions.txt

+# Install build and runtime dependencies, this is needed for flashinfer install
+COPY requirements/build.txt requirements/build.txt
+COPY use_existing_torch.py use_existing_torch.py
+RUN python3 use_existing_torch.py
+RUN cat requirements/build.txt
+
 # Install uv for faster pip installs if not existed
 RUN --mount=type=cache,target=/root/.cache/uv \
     if ! python3 -m uv --version > /dev/null 2>&1; then \
         python3 -m pip install uv==0.8.4; \
     fi

 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/build.txt
+

 # Default mount file as placeholder, this just avoid the mount error
 ARG TORCH_WHEELS_PATH="./requirements"
 # Install torch, torchaudio and torchvision
@@ -332,13 +344,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install xformers wheel from previous stage
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system /wheels/xformers/*.whl --verbose

 # Build flashinfer from source.
 ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
 # install package for build flashinfer
 # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738

-RUN pip install build==1.3.0
-RUN pip freeze | grep -E 'setuptools|packaging|build'

 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
```
```diff
@@ -1,9 +1,14 @@
+import glob
+import os
+
+
 requires_files = glob.glob("requirements/*.txt")
 requires_files += ["pyproject.toml"]

 for file in requires_files:
+    if not os.path.exists(file):
+        print(f"!!! skipping missing {file}")
+        continue
     print(f">>> cleaning {file}")
     with open(file) as f:
         lines = f.readlines()
```
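A runnable sketch of the hardened loop follows, under the assumption that the only behavior change is the existence check; the cleanup body below the shown context is truncated in this diff, so a read-back print stands in for it:

```python
# Hypothetical standalone version of the guarded file walk from use_existing_torch.py.
import glob
import os

requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]

for file in requires_files:
    # glob may return nothing and pyproject.toml may be absent in some build
    # contexts, so each path is checked before it is opened.
    if not os.path.exists(file):
        print(f"!!! skipping missing {file}")
        continue
    print(f">>> cleaning {file}")
    with open(file) as f:
        lines = f.readlines()
    print(f"    read {len(lines)} lines for further filtering")
```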
6 .github/scripts/filter_test_configs.py vendored

```diff
@@ -502,6 +502,7 @@ def perform_misc_tasks(
     job_name: str,
     pr_body: str,
     branch: Optional[str] = None,
+    tag: Optional[str] = None,
 ) -> None:
     """
     In addition to apply the filter logic, the script also does the following
@@ -509,7 +510,9 @@ def perform_misc_tasks(
     """
     set_output(
         "keep-going",
-        branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"),
+        branch == MAIN_BRANCH
+        or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
+        or check_for_setting(labels, pr_body, "keep-going"),
     )
     set_output(
         "ci-verbose-test-logs",
@@ -634,6 +637,7 @@ def main() -> None:
         job_name=args.job_name,
         pr_body=pr_body if pr_body else "",
         branch=args.branch,
+        tag=tag,
     )

     # Set the filtered test matrix as the output
```
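Pulled out of the diff, the new keep-going rule is easy to test in isolation. A sketch with a hypothetical `keep_going` helper standing in for the `set_output`/`check_for_setting` plumbing:

```python
# Sketch of the keep-going condition: trunk/<40-hex-sha> tags now force
# keep-going, alongside the main branch and the PR label/body setting.
import re

MAIN_BRANCH = "main"  # assumption: mirrors the script's MAIN_BRANCH constant

def keep_going(branch, tag, setting_enabled):
    return (
        branch == MAIN_BRANCH
        or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
        or setting_enabled
    )

assert keep_going("main", None, False)
assert keep_going("gh/user/feature", "trunk/" + "ab12" * 10, False)
assert not keep_going("gh/user/feature", "trunk/not-a-sha", False)
```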
```diff
@@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
         "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
         "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
         "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
         "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
         "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-        "nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
+        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
         "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
```
9 .github/workflows/_get-changed-files.yml vendored

```diff
@@ -40,6 +40,15 @@ jobs:
           # Use gh CLI to get changed files in the PR with explicit repo
           CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

+          # See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790
+          PYI_FILES_TO_ADD=""
+          for file in ${CHANGED_FILES}; do
+            if [[ "${file}" == *".pyi.in" ]]; then
+              PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}"
+            fi
+          done
+          CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}"
+
           if [ -z "$CHANGED_FILES" ]; then
             echo "No changed files found, setting to '*'"
             CHANGED_FILES="*"
```
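The bash expansion `${file//.in/}` strips every `.in` occurrence, turning each changed `.pyi.in` template into its generated `.pyi` counterpart. A Python sketch of the same expansion; the example filename is illustrative, not taken from this PR:

```python
# For each changed .pyi.in template, append the generated .pyi so downstream
# file filters see it too; str.replace mirrors bash's ${file//.in/}.
changed_files = "torch/_C/_VariableFunctions.pyi.in README.md"

files = changed_files.split()
pyi_files_to_add = [f.replace(".in", "") for f in files if f.endswith(".pyi.in")]
print(" ".join(files + pyi_files_to_add))
# -> torch/_C/_VariableFunctions.pyi.in README.md torch/_C/_VariableFunctions.pyi
```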
42
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
42
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -178,7 +178,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -224,7 +224,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -335,7 +335,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -381,7 +381,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -427,7 +427,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -538,7 +538,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -584,7 +584,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -630,7 +630,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -741,7 +741,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -787,7 +787,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -833,7 +833,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -944,7 +944,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -990,7 +990,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1036,7 +1036,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1147,7 +1147,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1193,7 +1193,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@@ -1239,7 +1239,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1350,7 +1350,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1396,7 +1396,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1442,7 +1442,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generated-linux-binary-manywheel-nightly.yml (42 changes; generated, vendored)
@@ -127,7 +127,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda13_0-test: # Testing
@@ -721,7 +721,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_6-test: # Testing
@@ -787,7 +787,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_8-test: # Testing
@@ -853,7 +853,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda13_0-test: # Testing
@@ -1315,7 +1315,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_6-test: # Testing
@@ -1381,7 +1381,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_8-test: # Testing
@@ -1447,7 +1447,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda13_0-test: # Testing
@@ -1909,7 +1909,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_6-test: # Testing
@@ -1975,7 +1975,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_8-test: # Testing
@@ -2041,7 +2041,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda13_0-test: # Testing
@@ -2503,7 +2503,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_6-test: # Testing
@@ -2569,7 +2569,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_8-test: # Testing
@@ -2635,7 +2635,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda13_0-test: # Testing
@@ -3097,7 +3097,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda12_6-test: # Testing
@@ -3163,7 +3163,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda12_8-test: # Testing
@@ -3229,7 +3229,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda13_0-test: # Testing
@@ -3691,7 +3691,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda12_6-test: # Testing
@@ -3757,7 +3757,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda12_8-test: # Testing
@@ -3823,7 +3823,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda13_0-test: # Testing
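Every hunk in this file is the same one-line pin change, so one way to review them quickly is to parse both sides of a hunk and diff the pins. An illustrative sketch (not part of this PR; assumes the `packaging` library, with one pair shortened to the changed entry):

# Illustrative check: report which pins differ between the removed and
# added requirement strings of a hunk.
from packaging.requirements import Requirement

def pins(raw: str) -> dict[str, str]:
    out = {}
    for part in raw.split("|"):
        req = Requirement(part.strip())
        out[req.name] = str(req.specifier)
    return out

removed = "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux'"
added = "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"

old, new = pins(removed), pins(added)
for name in sorted(old.keys() | new.keys()):
    if old.get(name) != new.get(name):
        print(f"{name}: {old.get(name)} -> {new.get(name)}")
# prints: nvidia-nccl-cu12: ==2.28.3 -> ==2.27.5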
.github/workflows/inductor-periodic.yml (10 changes; vendored)
@@ -106,6 +106,16 @@ jobs:
         { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
         { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
         { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+        { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
       ]}
     secrets: inherit
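The `test-matrix` value above is JSON embedded in the workflow YAML; GitHub Actions fans each `include` entry out into one job instance. A rough sketch of that fan-out (the expansion is performed by Actions itself; this just restates it with two entries from the hunk):

# Rough sketch of how an `include`-style test matrix expands into jobs.
import json

matrix = json.loads('''
{ "include": [
    { "config": "dynamic_inductor_timm", "shard": 1, "num_shards": 2,
      "runner": "linux.rocm.gpu.gfx942.1" },
    { "config": "dynamic_inductor_timm", "shard": 2, "num_shards": 2,
      "runner": "linux.rocm.gpu.gfx942.1" }
]}
''')

for job in matrix["include"]:
    # one test job per entry, keyed by config/shard/runner
    print(f'{job["config"]} shard {job["shard"]}/{job["num_shards"]} '
          f'on {job["runner"]}')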
.github/workflows/operator_microbenchmark.yml (54 changes; vendored)
@@ -18,6 +18,7 @@ permissions:
   contents: read

 jobs:
+  # H100 A100 runners
   opmicrobenchmark-build:
     if: github.repository_owner == 'pytorch'
     name: opmicrobenchmark-build
@@ -44,3 +45,56 @@ jobs:
       docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
     secrets: inherit
+
+  # B200 runner
+  opmicrobenchmark-build-b200:
+    if: github.repository_owner == 'pytorch'
+    name: opmicrobenchmark-build-b200
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      runner: linux.12xlarge.memory
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '10.0'
+      test-matrix: |
+        { include: [
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
+        ]}
+    secrets: inherit
+
+  opmicrobenchmark-test-b200:
+    name: opmicrobenchmark-test-b200
+    uses: ./.github/workflows/_linux-test.yml
+    needs: opmicrobenchmark-build-b200
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit
+
+  # ROCM MI300 runner
+  opmicrobenchmark-build-rocm:
+    if: github.repository_owner == 'pytorch'
+    name: opmicrobenchmark-build-rocm
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
+        ]}
+    secrets: inherit
+
+  opmicrobenchmark-test-rocm:
+    name: opmicrobenchmark-test-rocm
+    uses: ./.github/workflows/_rocm-test.yml
+    needs: opmicrobenchmark-build-rocm
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
+    secrets: inherit
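Matrices written by hand, like the ones added above and edited below, are easy to get subtly wrong (a missing shard, or num_shards out of step with the entries). A hypothetical helper (not part of this PR) to sanity-check them:

# Hypothetical helper: verify every config in a test matrix lists
# shards 1..num_shards exactly once.
from collections import defaultdict

def validate(include: list[dict]) -> list[str]:
    shards = defaultdict(set)
    expected = {}
    for entry in include:
        shards[entry["config"]].add(entry["shard"])
        expected[entry["config"]] = entry["num_shards"]
    return [
        f"{cfg}: have shards {sorted(got)}, want 1..{expected[cfg]}"
        for cfg, got in shards.items()
        if got != set(range(1, expected[cfg] + 1))
    ]

print(validate([
    {"config": "operator_microbenchmark_test", "shard": 1, "num_shards": 1},
]))  # [] means the matrix is complete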
.github/workflows/periodic.yml (6 changes; vendored)
@@ -213,9 +213,9 @@ jobs:
       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
       test-matrix: |
         { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
         ]}
     secrets: inherit
.github/workflows/pull.yml (2 changes, vendored)
@@ -127,8 +127,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.github/workflows/slow.yml (2 changes, vendored)
@@ -140,8 +140,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.github/workflows/trunk.yml (42 changes, vendored)
@@ -160,9 +160,10 @@ jobs:
      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
          { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
          { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
        ]}
    secrets: inherit

@@ -189,41 +190,6 @@ jobs:
      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
    secrets: inherit

  linux-jammy-rocm-py3_10-build:
    if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
    name: linux-jammy-rocm-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-rocm-py3.10
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      sync-tag: rocm-build
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
        ]}
    secrets: inherit

  linux-jammy-rocm-py3_10-test:
    if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
    permissions:
      id-token: write
      contents: read
    name: linux-jammy-rocm-py3.10
    uses: ./.github/workflows/_rocm-test.yml
    needs:
      - linux-jammy-rocm-py3_10-build
      - target-determination
    with:
      build-environment: linux-jammy-rocm-py3.10
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
    secrets: inherit

  inductor-build:
    name: inductor-build
    uses: ./.github/workflows/_linux-build.yml
.github/workflows/update-viablestrict.yml (2 changes, vendored)
@@ -23,7 +23,7 @@ jobs:
        with:
          repository: pytorch/pytorch
          stable-branch: viable/strict
          requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]'
          requires: '[\"pull\", \"trunk\", \"lint\", \"linux-aarch64\"]'
          secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
          clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
          clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
.github/workflows/vllm.yml (2 changes, vendored)
@@ -42,7 +42,7 @@ jobs:
      build-external-packages: "vllm"
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
      cuda-arch-list: '8.0;8.9;9.0'
      cuda-arch-list: '8.0 8.9 9.0'
      runner: linux.24xlarge.memory
      test-matrix: |
        { include: [
@@ -18,6 +18,7 @@ exclude_patterns = [
    'torch/_inductor/autoheuristic/artifacts/**',
    'scripts/**',
    'test/generated_type_hints_smoketest.py',
    'test/test_torchfuzz_repros.py',
    # CPython tests
    'test/dynamo/cpython/**',
    # Tests from the NumPy test suite
@@ -27,6 +28,7 @@ exclude_patterns = [
    'torch/lib/**',
    'venv/**',
    '**/*.pyi',
    "tools/experimental/dynamic_shapes/torchfuzz/**",
    'tools/test/test_selective_build.py',
]
command = [
@@ -1571,6 +1573,7 @@ exclude_patterns = [
    'torch/_inductor/fx_passes/serialized_patterns/**',
    'torch/_inductor/autoheuristic/artifacts/**',
    'test/dynamo/cpython/**',
    'test/test_torchfuzz_repros.py',
    'scripts/**',
    'third_party/**',
    'fb/**',
CODEOWNERS (18 changes)
@@ -181,15 +181,15 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
/torch/csrc/jit/python/init.cpp @mikaylagawarecki

# CUDA and CUDA math libraries
aten/src/ATen/cuda/ @eqy @syed-ahmed
aten/src/ATen/cudnn/ @eqy @syed-ahmed
aten/src/ATen/native/cuda/ @eqy @syed-ahmed
aten/src/ATen/native/cudnn/ @eqy @syed-ahmed
c10/cuda @eqy @syed-ahmed
torch/cuda/ @eqy @syed-ahmed
torch/csrc/cuda/ @eqy @syed-ahmed
torch/backends/cuda/ @eqy @syed-ahmed
torch/backends/cudnn/ @eqy @syed-ahmed
aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/native/cudnn/ @eqy @syed-ahmed @Aidyn-A
c10/cuda @eqy @syed-ahmed @Aidyn-A
torch/cuda/ @eqy @syed-ahmed @Aidyn-A
torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A
torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A
torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A

# PyTree utilities
/torch/utils/_pytree.py @XuehaiPan
@@ -50,11 +50,10 @@ RUN git submodule update --init --recursive
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=whl/nightly
# Automatically set by buildx
RUN /opt/conda/bin/conda update -y -n base -c defaults conda
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}
# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0

ARG TARGETPLATFORM
RELEASE.md (17 changes)
@@ -3,6 +3,7 @@
<!-- toc -->

- [Release Compatibility Matrix](#release-compatibility-matrix)
- [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix)
- [Release Cadence](#release-cadence)
- [General Overview](#general-overview)
- [Frequently Asked Questions](#frequently-asked-questions)
@@ -63,6 +64,22 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |

### PyTorch CUDA Support Matrix

For Release 2.9 PyTorch Supports following CUDA Architectures:

| CUDA | architectures supported for Linux x86 and Windows builds | notes |
| --- | --- | --- |
| 12.6.3 | Maxwell(5.0), Pascal(6.0), Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0) | |
| 12.8.1 | Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0) | |
| 13.0.0 | Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0+PTX) | +PTX available on linux builds only |

| CUDA | architectures supported for Linux aarch64 builds |
| --- | --- |
| 12.6.3 | Ampere(8.0), Hopper(9.0) |
| 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0) |
| 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) |

## Release Cadence

Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
@@ -605,6 +605,11 @@ if(UNIX)
  if(HAVE_MALLOC_USABLE_SIZE)
    add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1)
  endif(HAVE_MALLOC_USABLE_SIZE)
  set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h")
  CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE)
  if(HAVE_POSIX_FALLOCATE)
    add_definitions(-DHAVE_POSIX_FALLOCATE=1)
  endif(HAVE_POSIX_FALLOCATE)
endif(UNIX)

ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC)
@@ -40,41 +40,6 @@ namespace {
 ->conv
 ->rnn
*/
const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
    {"generic", {{"ieee", "tf32", "bf16", "none"}}},
    {"mkldnn", {{"ieee", "tf32", "bf16", "none"}}},
    {"cuda", {{"ieee", "tf32", "none"}}}};

// Check whether the backend and op are legal
void check_fp32_prec_backend_and_op(
    const std::string& backend,
    const std::string& op) {
  static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"};
  static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"};
  TORCH_CHECK(
      std::find(backends.begin(), backends.end(), backend) != backends.end(),
      "Invalid backend: ",
      backend);
  TORCH_CHECK(
      std::find(operators.begin(), operators.end(), op) != operators.end(),
      "Invalid operator: ",
      op);
  if (backend == "generic") {
    TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op);
  }
}

// Return whether the precision is supported by backends
bool validate_fp32_prec(
    const std::string& backend,
    const std::string& precision) {
  auto iterp = _fp32_precisions.find(backend);
  TORCH_CHECK(iterp != _fp32_precisions.end());
  auto precisions = iterp->second;
  bool valid = std::find(precisions.begin(), precisions.end(), precision) !=
      precisions.end();
  return valid;
}

C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
  TORCH_WARN_ONCE(
@@ -86,6 +51,54 @@ void check_fp32_prec_backend_and_op(
  }
} // namespace
Float32Backend str2backend(const std::string& name) {
|
||||
if (name == "generic")
|
||||
return Float32Backend::GENERIC;
|
||||
else if (name == "cuda")
|
||||
return Float32Backend::CUDA;
|
||||
else if (name == "mkldnn")
|
||||
return Float32Backend::MKLDNN;
|
||||
TORCH_CHECK(false, "Unknown backend: ", name);
|
||||
}
|
||||
|
||||
Float32Op str2op(const std::string& name) {
|
||||
if (name == "all")
|
||||
return Float32Op::ALL;
|
||||
else if (name == "conv")
|
||||
return Float32Op::CONV;
|
||||
else if (name == "rnn")
|
||||
return Float32Op::RNN;
|
||||
else if (name == "matmul")
|
||||
return Float32Op::MATMUL;
|
||||
TORCH_CHECK(false, "Unknown op: ", name);
|
||||
}
|
||||
|
||||
Float32Precision str2precision(const std::string& name) {
|
||||
if (name == "none")
|
||||
return Float32Precision::NONE;
|
||||
else if (name == "ieee")
|
||||
return Float32Precision::IEEE;
|
||||
else if (name == "tf32")
|
||||
return Float32Precision::TF32;
|
||||
else if (name == "bf16")
|
||||
return Float32Precision::BF16;
|
||||
TORCH_CHECK(false, "Unknown precision: ", name);
|
||||
}
|
||||
|
||||
std::string precision2str(Float32Precision prec) {
|
||||
switch (prec) {
|
||||
case Float32Precision::NONE:
|
||||
return "none";
|
||||
case Float32Precision::IEEE:
|
||||
return "ieee";
|
||||
case Float32Precision::TF32:
|
||||
return "tf32";
|
||||
case Float32Precision::BF16:
|
||||
return "bf16";
|
||||
}
|
||||
TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast<int>(prec), ")");
|
||||
}
|
||||
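The four helpers above give any remaining string-based entry points a single conversion seam onto the new enums. A minimal sketch of how a caller might lean on them (the wrapper function here is hypothetical, not part of the diff):

```cpp
#include <ATen/Context.h>
#include <string>

// Hypothetical shim: accept the legacy string triple and forward to the
// enum-based setter introduced in this change. Each str2* helper fails
// via TORCH_CHECK on an unknown name, so bad strings error out loudly.
void set_fp32_precision_from_strings(
    const std::string& backend,
    const std::string& op,
    const std::string& precision) {
  at::globalContext().setFloat32Precision(
      at::str2backend(backend), at::str2op(op), at::str2precision(precision));
}
```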
Context::Context() = default;

// TODO: This could be bad juju if someone calls globalContext() in the
@@ -179,10 +192,10 @@ void Context::setUserEnabledNNPACK(bool e) {
  enabled_nnpack = e;
}

bool Context::allowTF32CuDNN(const std::string& op) const {
  if (op.empty()){
    bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
    bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
  if (!op.has_value()) {
    bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32;
    bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32;
    TORCH_CHECK(
        allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn,
        "PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name,",
@@ -191,15 +204,15 @@ bool Context::allowTF32CuDNN(const std::string& op) const {
        "We suggest only using the new API to set the TF32 flag(s). See also: ",
        "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
  } else {
    return float32Precision("cuda", op) == "tf32";
    return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32;
  }
  warn_deprecated_fp32_precision_api();
  return allow_tf32_cudnn;
}

void Context::setAllowTF32CuDNN(bool b) {
  setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
  setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
  setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE);
  setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE);
  allow_tf32_cudnn = b;
  warn_deprecated_fp32_precision_api();
}
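With the operator made optional, a caller that cares about one cuDNN op can query it directly, while the bare call keeps the legacy rnn/conv consistency check and its deprecation warning. A small sketch against the new signature:

```cpp
#include <ATen/Context.h>

// Per-op query: no deprecation warning, no rnn/conv agreement check.
bool tf32_enabled_for_conv() {
  return at::globalContext().allowTF32CuDNN(at::Float32Op::CONV);
}

// Legacy query: verifies rnn and conv agree with the combined flag,
// warns about the deprecated API, and returns allow_tf32_cudnn.
bool tf32_enabled_legacy() {
  return at::globalContext().allowTF32CuDNN();
}
```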
@@ -279,42 +292,6 @@ bool Context::userEnabledOverrideableSDP() const {
  return enabled_overrideable;
}

static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};

bool Context::checkCuBLASConfigDeterministic() {
  // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
  // is set to deterministic setting
  if (hasCUDART()) {
    const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
    return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
  }
  return true;
}

void Context::alertCuBLASConfigNotDeterministic() const {
  static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
  if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
    return;
  }

  auto msg = c10::str(
      "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
      "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
      "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
      "case, you must set an environment variable before running your PyTorch application: ",
      cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
      cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
      "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
  );

  if (deterministicAlgorithmsWarnOnly()) {
    TORCH_WARN(msg);
  } else {
    TORCH_CHECK(false, msg);
  }
}

bool Context::benchmarkCuDNN() const {
  return benchmark_cudnn;
}
@@ -341,7 +318,7 @@ void Context::setImmediateMiopen(bool b) {

bool Context::allowTF32CuBLAS() const {
  bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
  bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
  bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32;
  TORCH_CHECK(
      legacy_allow_tf32 == allow_tf32_new,
      "PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,",
@@ -354,17 +331,17 @@ bool Context::allowTF32CuBLAS() const {

void Context::setAllowTF32CuBLAS(bool b) {
  float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
  setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
  setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE);
}

Float32MatmulPrecision Context::float32MatmulPrecision() const {
  bool invalid = float32Precision("cuda", "matmul") == "tf32" &&
  bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 &&
      float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST;
  invalid = invalid ||
      (float32Precision("mkldnn", "matmul") == "bf16" &&
      (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 &&
       float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
  invalid = invalid ||
      (float32Precision("mkldnn", "matmul") == "tf32" &&
      (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 &&
       float32_matmul_precision != at::Float32MatmulPrecision::HIGH);
  TORCH_CHECK(
      !invalid,
@@ -376,15 +353,26 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
  return float32_matmul_precision;
}
std::string Context::float32Precision(const std::string& backend, const std::string& op) const {
|
||||
check_fp32_prec_backend_and_op(backend, op);
|
||||
auto precision = fp32_precision.find(backend)->second.find(op)->second;
|
||||
if (precision == "none")
|
||||
precision = fp32_precision.find(backend)->second.find("all")->second;
|
||||
if (precision == "none")
|
||||
precision = fp32_precision.find("generic")->second.find("all")->second;
|
||||
bool valid_prec = validate_fp32_prec(backend, precision);
|
||||
return valid_prec ? precision : "none";
|
||||
Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const {
|
||||
std::pair<Float32Backend, Float32Op> key{backend, op};
|
||||
auto it = fp32_precision.find(key);
|
||||
TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")");
|
||||
|
||||
Float32Precision precision = it->second;
|
||||
if (precision == Float32Precision::NONE) {
|
||||
key.second = Float32Op::ALL;
|
||||
precision = fp32_precision.find(key)->second;
|
||||
}
|
||||
if (precision == Float32Precision::NONE) {
|
||||
key.first = Float32Backend::GENERIC;
|
||||
precision = fp32_precision.find(key)->second;
|
||||
}
|
||||
|
||||
// "cuda" does not support "bf16"
|
||||
if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) {
|
||||
return Float32Precision::NONE;
|
||||
}
|
||||
return precision;
|
||||
}
|
||||
|
||||
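The enum-keyed getter resolves a query in up to three steps: the (backend, op) entry, then the backend's ALL entry, then GENERIC/ALL, and finally clamps the unsupported CUDA+BF16 combination to NONE. A sketch of what that looks like from the caller's side, assuming the default table shown later in this diff:

```cpp
#include <ATen/Context.h>

void show_precision_fallback() {
  auto& ctx = at::globalContext();
  // MKLDNN CONV defaults to NONE, MKLDNN ALL is also NONE, so this
  // resolves through the GENERIC/ALL entry (NONE out of the box).
  auto mkldnn_conv =
      ctx.float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV);
  // CUDA CONV has an explicit TF32 entry, so no fallback is needed.
  auto cuda_conv =
      ctx.float32Precision(at::Float32Backend::CUDA, at::Float32Op::CONV);
  (void)mkldnn_conv;
  (void)cuda_conv;
}
```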
void Context::setFloat32MatmulPrecision(const std::string &s) {
|
||||
@ -393,18 +381,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
|
||||
// TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
|
||||
if (s_ == "highest") {
|
||||
float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
|
||||
setFloat32Precision("cuda", "matmul", "ieee");
|
||||
setFloat32Precision("mkldnn", "matmul", "ieee");
|
||||
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE);
|
||||
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE);
|
||||
return true;
|
||||
} else if (s_ == "high") {
|
||||
float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
|
||||
setFloat32Precision("cuda", "matmul", "tf32");
|
||||
setFloat32Precision("mkldnn", "matmul", "tf32");
|
||||
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
|
||||
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32);
|
||||
return true;
|
||||
} else if (s_ == "medium") {
|
||||
float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
|
||||
setFloat32Precision("cuda", "matmul", "tf32");
|
||||
setFloat32Precision("mkldnn", "matmul", "bf16");
|
||||
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
|
||||
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -418,25 +406,16 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
|
||||
"setFloat32MatmulPrecision call has no effect.");
|
||||
}
|
||||
|
||||
void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) {
|
||||
check_fp32_prec_backend_and_op(backend, op);
|
||||
if (validate_fp32_prec(backend, p)) {
|
||||
fp32_precision[backend][op] = p;
|
||||
} else {
|
||||
std::string msg;
|
||||
auto iterp = _fp32_precisions.find(backend);
|
||||
TORCH_CHECK(iterp != _fp32_precisions.end());
|
||||
for (const auto& p : iterp->second) {
|
||||
msg += p;
|
||||
msg += " ";
|
||||
}
|
||||
TORCH_WARN(
|
||||
"you have set wrong precision for backend:",
|
||||
backend,
|
||||
" setFloat32Precision call has no effect.",
|
||||
"Please choose precision from: ",
|
||||
msg);
|
||||
}
|
||||
void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) {
|
||||
auto it = fp32_precision.find(std::make_pair(backend, op));
|
||||
TORCH_CHECK(
|
||||
it != fp32_precision.end(),
|
||||
"Invalid (backend, op) pair: (", backend, ", ", op, ")");
|
||||
TORCH_CHECK(
|
||||
!(backend == Float32Backend::CUDA && p == Float32Precision::BF16),
|
||||
"backend 'cuda' does not support precision 'bf16'");
|
||||
|
||||
it->second = p;
|
||||
}
|
||||
|
||||
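A matching sketch of the setter side. The notable behavioral change is that invalid combinations now throw instead of warning and silently doing nothing, as the old string overload did:

```cpp
#include <ATen/Context.h>

void configure_fp32_precision() {
  auto& ctx = at::globalContext();
  // Valid (backend, op) pairs are exactly the keys of the fp32_precision
  // table; this one enables TF32 for MKLDNN matmuls.
  ctx.setFloat32Precision(
      at::Float32Backend::MKLDNN,
      at::Float32Op::MATMUL,
      at::Float32Precision::TF32);
  // This would throw via TORCH_CHECK: CUDA rejects BF16 outright.
  // ctx.setFloat32Precision(
  //     at::Float32Backend::CUDA,
  //     at::Float32Op::MATMUL,
  //     at::Float32Precision::BF16);
}
```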
at::LinalgBackend Context::linalgPreferredBackend() const {
@@ -608,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
  rocm_fa_preferred_backend = b;
}

bool Context::allowFP16ReductionCuBLAS() const {
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
  return allow_fp16_reduction_cublas;
}

void Context::setAllowFP16ReductionCuBLAS(bool b) {
  allow_fp16_reduction_cublas = b;
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
  TORCH_CHECK(
      !(allow_reduced_precision && !allow_splitk),
      "allow_splitk=False is not supported when reduced precision reductions are enabled");
  if (allow_reduced_precision) {
    return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
  } else if (allow_splitk) {
    return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
  } else {
    return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
  }
}

bool Context::allowBF16ReductionCuBLAS() const {
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
  allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}

CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
  return allow_bf16_reduction_cublas;
}

void Context::setAllowBF16ReductionCuBLAS(bool b) {
  allow_bf16_reduction_cublas = b;
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
  allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}

bool Context::allowFP16AccumulationCuBLAS() const {
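The two booleans collapse into a three-state enum, and the fourth combination (reduced precision without split-K) is rejected up front. A sketch of the mapping as seen through the public setters, using the default `allow_splitk = true` from the header:

```cpp
#include <ATen/Context.h>

void pick_cublas_reduction_options() {
  auto& ctx = at::globalContext();
  // (true, true)   -> AllowReducedPrecisionWithSplitK (the default state)
  ctx.setAllowFP16ReductionCuBLAS(true);
  // (false, true)  -> DisallowReducedPrecisionAllowSplitK
  ctx.setAllowFP16ReductionCuBLAS(false, /*allow_splitk=*/true);
  // (false, false) -> DisallowReducedPrecisionDisallowSplitK
  ctx.setAllowBF16ReductionCuBLAS(false, /*allow_splitk=*/false);
  // (true, false) throws: reduced-precision reductions imply split-K.
}
```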
@@ -25,11 +25,13 @@
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/hash.h>
#include <c10/util/irange.h>

#include <cstdint>
#include <map>
#include <mutex>
#include <unordered_map>

namespace at {
@@ -37,6 +39,20 @@ class Tensor;

enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };

enum class CuBLASReductionOption : uint8_t {
  AllowReducedPrecisionWithSplitK = 0,
  DisallowReducedPrecisionAllowSplitK = 1,
  DisallowReducedPrecisionDisallowSplitK = 2,
};
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };

TORCH_API Float32Backend str2backend(const std::string& name);
TORCH_API Float32Op str2op(const std::string& name);
TORCH_API Float32Precision str2precision(const std::string& name);
TORCH_API std::string precision2str(Float32Precision prec);

class TORCH_API Context {
 public:
  Context();
@@ -310,13 +326,7 @@ class TORCH_API Context {
  //
  // * Throw an error when `Context::deterministicAlgorithms()` is true. Most
  //   of the time, this should be accomplished by calling
  //   `at::globalContext().alertNotDeterminstic()`. However, if the
  //   nondeterministic behavior is caused by the CuBLAS workspace
  //   configuration in CUDA >= 10.2,
  //   `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
  //   called instead (in this case, a comment explaining why the operation is
  //   nondeterministic is not necessary). See below for details on these
  //   methods.
  //   `at::globalContext().alertNotDeterminstic().
  //
  // * Have an entry in the list of nondeterministic PyTorch operations in the
  //   docstring of `use_deterministic_algorithms()` in torch/__init__.py
@@ -340,31 +350,27 @@ class TORCH_API Context {
  // Throws an error if `Context::deterministicAlgorithms()` is true
  static void alertNotDeterministic(std::string_view const& caller);

  // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
  // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
  // ":4096:8". For more details:
  // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
  void alertCuBLASConfigNotDeterministic() const;

  void setFloat32MatmulPrecision(const std::string& s);
  void setFloat32Precision(
      const std::string& backend,
      const std::string& op,
      const std::string& s);
  bool allowTF32CuDNN(const std::string& op = std::string()) const;
      Float32Backend backend,
      Float32Op op,
      Float32Precision p);
  bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
  void setAllowTF32CuDNN(bool);
  bool allowTF32OneDNN() const;
  void setAllowTF32OneDNN(bool);
  bool allowTF32CuBLAS() const;
  void setAllowTF32CuBLAS(bool);
  Float32MatmulPrecision float32MatmulPrecision() const;
  std::string float32Precision(
      const std::string& backend,
      const std::string& op) const;
  bool allowFP16ReductionCuBLAS() const;
  void setAllowFP16ReductionCuBLAS(bool);
  bool allowBF16ReductionCuBLAS() const;
  void setAllowBF16ReductionCuBLAS(bool);
  Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
  CuBLASReductionOption allowFP16ReductionCuBLAS() const;
  void setAllowFP16ReductionCuBLAS(
      bool allow_reduced_precision,
      bool allow_splitk = true);
  CuBLASReductionOption allowBF16ReductionCuBLAS() const;
  void setAllowBF16ReductionCuBLAS(
      bool allow_reduced_precision,
      bool allow_splitk = true);
  bool allowFP16AccumulationCuBLAS() const;
  void setAllowFP16AccumulationCuBLAS(bool);
@@ -429,7 +435,6 @@ class TORCH_API Context {
  }

 private:
  static bool checkCuBLASConfigDeterministic();
  std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
  bool enabled_cudnn = true;
  bool deterministic_cudnn = false;
@@ -457,8 +462,10 @@ class TORCH_API Context {
      : at::Float32MatmulPrecision::HIGHEST;
  int benchmark_limit_cudnn = 10;
  bool allow_tf32_cudnn = true;
  bool allow_fp16_reduction_cublas = true;
  bool allow_bf16_reduction_cublas = true;
  CuBLASReductionOption allow_fp16_reduction_cublas =
      CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
  CuBLASReductionOption allow_bf16_reduction_cublas =
      CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
  bool allow_fp16_accumulation_cublas = false;
  std::optional<int32_t> sm_carveout = std::nullopt;
  bool enabled_mkldnn = true;
@@ -488,21 +495,20 @@ class TORCH_API Context {
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;

  std::map<std::string, std::map<std::string, std::string>> fp32_precision = {
      {"generic", {{"all", "none"}}},
      {"mkldnn",
       {{"matmul", "none"},
        {"conv", "none"},
        {"rnn", "none"},
        {"all", "none"}}},
      {"cuda",
       {{"matmul",
         float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
             ? "none"
             : "tf32"},
        {"conv", "tf32"},
        {"rnn", "tf32"},
        {"all", "none"}}},
  using Key = std::pair<Float32Backend, Float32Op>;
  std::unordered_map<Key, Float32Precision, c10::hash<Key>> fp32_precision = {
      {{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE},
      {{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE},
      {{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE},
      {{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE},
      {{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE},
      {{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE},
      {{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32},
      {{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32},
      {{Float32Backend::CUDA, Float32Op::MATMUL},
       float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
           ? Float32Precision::NONE
           : Float32Precision::TF32},
  };

  Allocator* prev_allocator_ptr_{nullptr};
@@ -684,5 +690,4 @@ struct TORCH_API ROCmBackwardPassGuard {
  ~ROCmBackwardPassGuard();
  static bool is_backward_pass();
};

} // namespace at
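The flat map keyed on (backend, op) pairs replaces the nested string maps; ATen hashes the pair with c10::hash. Outside ATen the same shape needs only a small pair hasher. A self-contained sketch of the pattern (the enums and hasher below are stand-ins, not the ATen definitions):

```cpp
#include <cstddef>
#include <functional>
#include <unordered_map>
#include <utility>

enum class Backend { GENERIC, CUDA, MKLDNN };
enum class Op { ALL, CONV, RNN, MATMUL };
enum class Precision { NONE, IEEE, TF32, BF16 };

using Key = std::pair<Backend, Op>;

struct KeyHash {
  std::size_t operator()(const Key& k) const {
    // Simple hash combine; c10::hash plays this role in the real table.
    return std::hash<int>()(static_cast<int>(k.first)) * 31 +
        std::hash<int>()(static_cast<int>(k.second));
  }
};

Precision lookup(
    const std::unordered_map<Key, Precision, KeyHash>& table,
    Backend b,
    Op o) {
  // Same fallback order as Context::float32Precision: the specific op,
  // then the backend's ALL entry, then GENERIC/ALL.
  for (Key k : {Key{b, o}, Key{b, Op::ALL}, Key{Backend::GENERIC, Op::ALL}}) {
    auto it = table.find(k);
    if (it != table.end() && it->second != Precision::NONE) {
      return it->second;
    }
  }
  return Precision::NONE;
}
```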
@@ -292,6 +292,28 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags,
      if (ftruncate(fd, static_cast<off_t>(size)) == -1) {
        TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")");
      }

#ifdef HAVE_POSIX_FALLOCATE
      if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) {
        for (;;) {
          if (posix_fallocate(fd, 0, static_cast<off_t>(size)) == 0) {
            break;
          }

          if (errno == EINTR) {
            continue;
          }

          if (errno == EINVAL || errno == EOPNOTSUPP) {
            // the underlying filesystem does not support the operation
            break;
          }

          TORCH_CHECK(false, "unable to allocate shared memory(shm) for file <", filename_, ">: ", c10::utils::str_error(errno), " (", errno, ")");
        }
      }
#endif

      if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast<int64_t>(size)) {
#ifndef STRIP_ERROR_MESSAGES
        int last_err = errno;
@@ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef
    return;
  }
  const auto src_names = src.names();
  const auto result_dim = static_cast<int64_t>(result.dim());
  const auto result_dim = result.dim();
  const auto src_dim = static_cast<int64_t>(src_names.size());
  const auto excluded_dim = static_cast<int64_t>(excluded_idxs.size());
  TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim);
@@ -214,7 +214,7 @@ inline Tensor applySlice(
      "step must be greater than zero");

  // See NOTE [nested tensor size for indexing]
  if (self_sizes.has_value() && self_sizes.value().size() > 0) {
  if (self_sizes.has_value() && !self_sizes.value().empty()) {
    // Skip this optimization if we are tracing, as the trace may be polymorphic
    // over the shape of the `self` tensor, and we still want to record
    // the slice.
@@ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout)
}

void * maybe_data_ptr(const Tensor& tensor) {
  return tensor.defined() ? (void *)tensor.data_ptr() : nullptr;
  return tensor.defined() ? tensor.data_ptr() : nullptr;
}

void * maybe_data_ptr(const TensorArg& tensor) {
  return tensor->defined() ? (void *)tensor->data_ptr() : nullptr;
  return tensor->defined() ? tensor->data_ptr() : nullptr;
}

void check_dim_size(
@@ -50,19 +50,57 @@ namespace {
constexpr size_t MAX_SIZE_INDEX = 64;
}

// A large reserved pinned memory segment that is created in advance which is used
// to allocate small pinned memory requests to avoid calling into expensive APIs.
// We never free this memory and move up the pointer as we allocate new blocks
// and when blocks are freed, they are cached in the free lists.
struct PinnedReserveSegment {
  PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
      current_ptr_(start_), initialized_(true) {}

  PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}

  bool initialized() {
    return initialized_;
  }

  void* allocate(size_t bytes) {
    std::lock_guard<std::mutex> guard(mutex_);

    // Round up the requested size to 4KB boundary for all including the small ones.
    size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);

    if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
      return nullptr;
    }

    void* ptr = current_ptr_;
    current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
    return ptr;
  }

  bool owns(void* ptr) {
    return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
  }

  std::mutex mutex_;
  void* start_;
  size_t size_;
  void* current_ptr_;
  bool initialized_;
};
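Every request against the reserve segment is rounded up to a 4 KiB boundary with the usual power-of-two mask trick. A minimal standalone sketch of that arithmetic:

```cpp
#include <cstddef>
#include <cstdio>

// (bytes + align - 1) & ~(align - 1) bumps past the boundary and clears
// the low bits; this only works when align is a power of two.
constexpr std::size_t round_up_pow2(std::size_t bytes, std::size_t align) {
  return (bytes + align - 1) & ~(align - 1);
}

int main() {
  static_assert(round_up_pow2(1, 4096) == 4096);
  static_assert(round_up_pow2(4096, 4096) == 4096);
  static_assert(round_up_pow2(4097, 4096) == 8192);
  std::printf("%zu\n", round_up_pow2(10000, 4096)); // prints 12288
}
```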
// Struct containing memory allocator summary statistics for host.
struct TORCH_API HostStats {
  // COUNT: allocations requested by client code. Note that active
  // count can be extracted by looking at current allocations
  Stat allocation;
  // COUNT: number of allocated segments from host memory allocation.
  Stat segment;

  // SUM: bytes allocated by this memory alocator. Note that active bytes
  // can be extracted by looking at current bytes allocated
  // COUNT: total allocations (active)
  Stat active_requests;
  // SUM: bytes allocated/reserved by this memory alocator. (active)
  Stat active_bytes;
  // COUNT: total allocations (active + free)
  Stat allocations;
  // SUM: bytes allocated/reserved by this memory alocator. This accounts
  // for both free and in-use blocks.
  Stat allocated_bytes;
  // SUM: bytes reserved by this memory allocator (both free and used)
  Stat reserved_bytes;

  // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
  DurationStat host_alloc_time;
@@ -77,7 +115,7 @@ struct TORCH_API HostStats {
  // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
  int64_t num_host_free = 0; // This is derived from segment or timing

  // Count of cudaHostFree/cudaHostUnregister per bucket
  // Count of cudaHostAlloc/cudaHostRegister per bucket
  std::vector<int64_t> bucket_allocation = std::vector<int64_t>(MAX_SIZE_INDEX);
};
@@ -86,17 +124,22 @@ struct TORCH_API HostStats {
// avoid locking the allocator while collecting stats.
struct alignas(64) HostStatsStaged {
  std::mutex timing_mutex_;
  // COUNT: allocations requested by client code resulting in a new segment/block allocation
  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
  Stat allocation;
  // SUM: bytes within active memory blocks, including blocks that are
  // currently in the free list.
  // COUNT: total allocations (active + free)
  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
  Stat allocations;
  // SUM: bytes allocated/reserved by this memory alocator. This accounts
  // for both free and in-use blocks.
  Stat allocated_bytes;
  // COUNT: number of allocations per bucket
  // COUNT: number of allocations per bucket (active)
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> active_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
  // SUM: bytes of allocation per bucket (active)
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> active_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
  // COUNT: number of allocations per bucket (active + free)
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
  // SUM: bytes of allocation per bucket
  // SUM: bytes of allocation per bucket (active + free)
  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
  std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
  // SUM: time spent in cudaHostAlloc/cudaHostRegister
@@ -200,17 +243,6 @@ struct CachingHostAllocatorImpl {
    // background.
    if (!pinned_use_background_threads()) {
      process_events();
    } else {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
            std::this_thread::sleep_for(std::chrono::microseconds(100));
          }
        });
        return true;
      }();
    }

    // Round up the allocation to the nearest power of two to improve reuse.
@@ -223,6 +255,21 @@ struct CachingHostAllocatorImpl {
      return {block->ptr_, reinterpret_cast<void*>(block)};
    }

    // Check in the recently freed blocks with pending events to see if we
    // can reuse them. Call get_free_block again after processing events
    if (pinned_use_background_threads()) {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
            std::this_thread::sleep_for(std::chrono::microseconds(100));
          }
        });
        return true;
      }();
    }

    // Slow path: if we can't allocate from the cached free list, we need
    // to create a new block.
    void* ptr = nullptr;
@@ -333,7 +380,7 @@ struct CachingHostAllocatorImpl {
    ptr_to_block_.erase(block->ptr_);
    auto index = size_index(block->size_);
    free_block(block);
    stats_.allocation.decrease(1);
    stats_.allocations.decrease(1);
    stats_.allocated_bytes.decrease(block->size_);
    stats_.allocation_bucket_stats[index].decrease(1);
    stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
@@ -383,16 +430,16 @@ struct CachingHostAllocatorImpl {
      // per bucket (we pick index 0 arbitrarily). These are also all the host
      // allocations, not taking into account caching and free lists.
      if (i == 0) {
        stats.segment = stats_.allocation;
        stats.reserved_bytes = stats_.allocated_bytes;
        stats.num_host_alloc = stats.segment.allocated;
        stats.num_host_free = stats.segment.freed;
        stats.allocations = stats_.allocations;
        stats.allocated_bytes = stats_.allocated_bytes;
        stats.num_host_alloc = stats.allocations.allocated;
        stats.num_host_free = stats.allocations.freed;
      }

      // Bucket stats need to be merged with the slow-path stats. We do this in
      // a best effort manner, since we can't really replay the cached events per bucket.
      add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
      add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
      add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]);
      add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]);
      stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated;
    }
@@ -417,9 +464,11 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);

      if (i == 0) {
        stats_.allocation.reset_accumulated();
        stats_.allocations.reset_accumulated();
        stats_.allocated_bytes.reset_accumulated();
      }
      stats_.active_bucket_stats[i].reset_accumulated();
      stats_.active_bytes_bucket_stats[i].reset_accumulated();
      stats_.allocation_bucket_stats[i].reset_accumulated();
      stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
    }
@@ -442,9 +491,11 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);

      if (i == 0) {
        stats_.allocation.reset_peak();
        stats_.allocations.reset_peak();
        stats_.allocated_bytes.reset_peak();
      }
      stats_.active_bucket_stats[i].reset_peak();
      stats_.active_bytes_bucket_stats[i].reset_peak();
      stats_.allocation_bucket_stats[i].reset_peak();
      stats_.allocated_bytes_bucket_stats[i].reset_peak();
    }
@@ -461,7 +512,7 @@ struct CachingHostAllocatorImpl {
  virtual void add_allocated_block(B* block) {
    std::lock_guard<std::mutex> g(blocks_mutex_);
    blocks_.insert(block);
    stats_.allocation.increase(1);
    stats_.allocations.increase(1);
    stats_.allocated_bytes.increase(block->size_);
    ptr_to_block_.insert({block->ptr_, block});

@@ -474,6 +525,8 @@ struct CachingHostAllocatorImpl {
      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
      stats_.allocation_bucket_stats[index].increase(1);
      stats_.allocated_bytes_bucket_stats[index].increase(size);
      stats_.active_bucket_stats[index].increase(1);
      stats_.active_bytes_bucket_stats[index].increase(size);
    }
  }

@@ -484,6 +537,8 @@ struct CachingHostAllocatorImpl {
      B* block = free_list_[index].list_.back();
      free_list_[index].list_.pop_back();
      block->allocated_ = true;
      stats_.active_bucket_stats[index].increase(1);
      stats_.active_bytes_bucket_stats[index].increase(size);
      return block;
    }
    return nullptr;
@@ -577,6 +632,8 @@ struct CachingHostAllocatorImpl {
    auto index = size_index(block->size_);
    std::lock_guard<std::mutex> g(free_list_[index].mutex_);
    free_list_[index].list_.push_back(block);
    stats_.active_bucket_stats[index].decrease(1);
    stats_.active_bytes_bucket_stats[index].decrease(size);
    if (size != -1) {
      return;
    }
@@ -117,7 +117,7 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) {
template <>
C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) {
  // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
  return median + sigma * at::tan(c10::pi<double> * (val - static_cast<double>(0.5)));
  return median + sigma * at::tan(c10::pi<double> * (val - 0.5));
}

/**
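The specialization is the standard inverse-CDF (quantile) transform for the Cauchy distribution: x = median + sigma * tan(pi * (p - 1/2)) maps a uniform p in (0, 1) to a Cauchy sample. A standalone sketch using std::tan in place of at::tan:

```cpp
#include <cmath>
#include <cstdio>

// Quantile function of Cauchy(median, sigma); same formula as the
// double specialization above.
double cauchy_quantile(double p, double median, double sigma) {
  const double pi = std::acos(-1.0);
  return median + sigma * std::tan(pi * (p - 0.5));
}

int main() {
  std::printf("%f\n", cauchy_quantile(0.5, 0.0, 1.0));  // 0.0, the median
  std::printf("%f\n", cauchy_quantile(0.75, 0.0, 1.0)); // 1.0
}
```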
@@ -2,7 +2,7 @@

namespace c10 {

inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {}
inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {}

inline BoxedKernel::BoxedKernel(
    std::unique_ptr<OperatorKernel> functor,
@ -20,9 +20,7 @@ make_unique_base(Args&&... args) {
|
||||
} // namespace detail
|
||||
|
||||
inline KernelFunction::KernelFunction()
|
||||
: boxed_kernel_func_(),
|
||||
unboxed_kernel_func_(nullptr),
|
||||
sym_unboxed_kernel_func_(nullptr) {}
|
||||
: unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {}
|
||||
|
||||
inline KernelFunction::~KernelFunction() {
|
||||
if (tokens_) {
|
||||
|
||||
@@ -76,13 +76,7 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name,

OpRegistrationListener::~OpRegistrationListener()= default;

Dispatcher::Dispatcher()
    : operators_()
    , operatorLookupTable_()
    , backendFallbackKernels_()
    , listeners_(std::make_unique<detail::RegistrationListenerList>())
    , cond_var_()
    , guard_(std::make_shared<Guard>())
Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique<detail::RegistrationListenerList>()), guard_(std::make_shared<Guard>())
{}

Dispatcher::~Dispatcher() {
@@ -96,7 +96,7 @@ class TORCH_API Dispatcher final {
  friend class TypedOperatorHandle;

  struct Guard final {
    Guard() : alive(true), mutex() {}
    Guard() : alive(true) {}
    std::atomic<bool> alive;
    std::mutex mutex;
  };
@ -62,17 +62,7 @@ static const auto& getDispatchTableIndexToKey() {
|
||||
}
|
||||
|
||||
OperatorEntry::OperatorEntry(OperatorName&& operator_name)
|
||||
: name_(std::move(operator_name))
|
||||
, schema_()
|
||||
#ifndef C10_MOBILE
|
||||
, tags_()
|
||||
#endif
|
||||
, dispatchTable_()
|
||||
, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized())
|
||||
, kernels_()
|
||||
, cpp_signature_()
|
||||
, sym_cpp_signature_()
|
||||
, is_observed_(ObservedOperators::isObserved(name_))
|
||||
: name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_))
|
||||
{
|
||||
// Pick up any backend fallbacks that were registered prior to this
|
||||
// OperatorEntry being created.
|
||||
|
||||
@ -357,7 +357,7 @@ IValue IValue::equals(const IValue& rhs) const {
|
||||
case Tag::Enum:
|
||||
return lhs.toEnumHolder()->is(*rhs.toEnumHolder());
|
||||
case Tag::Uninitialized:
|
||||
// Unitialized ivalues show up in no-ops when the compiler can prove a
|
||||
// Uninitialized ivalues show up in no-ops when the compiler can prove a
|
||||
// value will never be used. Just return false on any equality comparison.
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item) {
      }
      next++;
    } else {
      if (allowlist.substr(cur).compare(item) == 0) {
      if (allowlist.substr(cur) == item) {
        return true;
      }
      break;
@@ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_(

  std::optional<FunctionSchema> inferred_schema = std::nullopt;
  for (const auto& kernel : options.kernels) {
    if (nullptr != kernel.inferred_function_schema.get()) {
    if (nullptr != kernel.inferred_function_schema) {
      if (!inferred_schema.has_value()) {
        inferred_schema = *kernel.inferred_function_schema;
        break;
@@ -411,7 +411,6 @@ public:

  Options()
      : schemaOrName_(std::nullopt)
      , kernels()
      , aliasAnalysisKind_(std::nullopt)
  {}

@@ -420,7 +419,6 @@ public:
  struct KernelRegistrationConfig final {
    KernelRegistrationConfig()
        : dispatch_key(std::nullopt)
        , func()
        , cpp_signature(std::nullopt)
        , inferred_function_schema(nullptr)
    {}
@@ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi {
  // Because loadu(const void* ptr, T count) requires zero initialization for
  // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128
  // bits of the result are undefined.
  // TODO<leslie> We can use _mm256_zextsi128_si256 in the furture,
  // TODO<leslie> We can use _mm256_zextsi128_si256 in the future,
  // since gcc 9.3 doesn't support it now.
  __m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
  return _mm256_castsi128_si256(input_128);
@@ -1844,7 +1844,7 @@ Vectorized<int16_t> inline shift_256_16(
  c0 = _mm256_srav_epi32(a0, b0);
  c0 = _mm256_shuffle_epi8(c0, ctl_1_0);

  // Peform shifting the same way for input array elements with
  // Perform shifting the same way for input array elements with
  // idx%2==1.
  __m256i a1 = _mm256_and_si256(a, keep_1);
  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@@ -2180,7 +2180,7 @@ Vectorized<T> inline shift_256_8(
  c0 = _mm256_srlv_epi32(a0, b0);
  c0 = _mm256_shuffle_epi8(c0, ctl_3_0);

  // Peform shifting the same way for input array elements with
  // Perform shifting the same way for input array elements with
  // idx%4==1.
  __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@@ -2193,7 +2193,7 @@ Vectorized<T> inline shift_256_8(
  c1 = _mm256_srlv_epi32(a1, b1);
  c1 = _mm256_shuffle_epi8(c1, ctl_3_1);

  // Peform shifting the same way for input array elements with
  // Perform shifting the same way for input array elements with
  // idx%4==2.
  __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
  __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
@@ -2206,7 +2206,7 @@ Vectorized<T> inline shift_256_8(
  c2 = _mm256_srlv_epi32(a2, b2);
  c2 = _mm256_shuffle_epi8(c2, ctl_3_2);

  // Peform shifting the same way for input array elements with
  // Perform shifting the same way for input array elements with
  // idx%4==3.
  __m256i a3 = _mm256_and_si256(a, keep_3);
  __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);

@@ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi {
  // Because loadu(const void* ptr, T count) requires zero initialization for
  // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384
  // bits of the result are undefined.
  // TODO<leslie> We can use _mm512_zextsi128_si512 in the furture,
  // TODO<leslie> We can use _mm512_zextsi128_si512 in the future,
  // since gcc 9.3 doesn't support it now.
  __m128i input_128 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
  return _mm512_castsi128_si512(input_128);
@@ -2022,7 +2022,7 @@ Vectorized<T> inline shift_512_8(
  c0 = _mm512_srlv_epi16(a0, b0);
  c0 = _mm512_shuffle_epi8(c0, ctl_1_0);

  // Peform shifting the same way for input array elements with
  // Perform shifting the same way for input array elements with
  // idx%2==1.
  __m512i a1 = _mm512_and_si512(a, keep_1);
  __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);
@@ -191,6 +191,10 @@ uint32_t _getAlignment(uintptr_t address) {

#ifdef USE_ROCM
static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) {
  // 0 is default value, meaning full CUs i.e. no mask
  if (value == 0) {
    return at::cuda::getCurrentCUDAStream();
  }
  static int32_t last_value = 0;
  static hipStream_t stream;
  if (last_value == 0) {
@@ -209,15 +213,15 @@ static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) {
  int32_t CUs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  // how many uint32_t do we need to cover all CUs, fill bitmask with 1
  uint32_t mask_size = static_cast<uint32_t>((CUs + 32 - 1) / 32);
  std::vector<uint32_t> mask(mask_size, uint32_t{0xffffffff});
  std::vector<uint32_t> mask(mask_size, uint32_t{0x00000000});
  // starting from lowest order bits, in 32-bit chunks
  // set bits to 0 based on how many CUs to carve out
  int32_t full_shifts = value / 32;
  int32_t remainder = value % 32;
  for (int32_t i = 0; i < full_shifts; i++) {
    mask[i] = uint32_t{0x00000000};
    mask[i] = uint32_t{0xffffffff};
  }
  mask[full_shifts] = uint32_t{0xffffffff} << remainder;
  mask[full_shifts] = uint32_t{0xffffffff} << (32 - remainder);

  // finally, create masked stream
  AT_CUDA_CHECK(hipExtStreamCreateWithCUMask(&stream, mask_size, &mask[0]));
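The fix flips the mask's sense: it now starts from all-zero words and turns bits on for the carved-out CUs, whole 32-bit words first and then the high-order remainder bits of the next word. A plain-C++ sketch of the resulting bit pattern (no HIP calls; a remainder == 0 guard is added here to keep the shift well defined):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Build a CU bitmask covering `value` compute units out of `cus` total,
// following the corrected logic above.
std::vector<uint32_t> carveout_mask(int32_t value, int32_t cus) {
  uint32_t words = static_cast<uint32_t>((cus + 31) / 32);
  std::vector<uint32_t> mask(words, 0u);
  int32_t full = value / 32;
  int32_t rem = value % 32;
  for (int32_t i = 0; i < full; ++i) {
    mask[i] = 0xffffffffu;
  }
  if (rem != 0) {
    mask[full] = 0xffffffffu << (32 - rem);
  }
  return mask;
}

int main() {
  // 40 CUs out of 104: one full word, then the top 8 bits of the next.
  auto m = carveout_mask(40, 104);
  std::printf("%08x %08x %08x %08x\n", m[0], m[1], m[2], m[3]);
  // prints: ffffffff ff000000 00000000 00000000
}
```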
@ -319,7 +323,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
|
||||
descriptor_.reset(raw_descriptor);
|
||||
}
|
||||
template <typename T>
|
||||
inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
|
||||
void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
|
||||
// NOLINTNEXTLINE(bugprone-sizeof-expression)
|
||||
TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value)));
|
||||
}
|
||||
@ -341,7 +345,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
|
||||
descriptor_.reset(raw_descriptor);
|
||||
}
|
||||
template <typename T>
|
||||
inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
|
||||
void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
|
||||
TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T)));
|
||||
}
|
||||
};
|
||||
@ -356,7 +360,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
|
||||
descriptor_.reset(raw_descriptor);
|
||||
}
|
||||
template <typename T>
|
||||
inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
|
||||
void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
|
||||
TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T)));
|
||||
}
|
||||
};
|
||||
@ -391,7 +395,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
computeType = CUBLAS_COMPUTE_64F;
|
||||
scaleType = CUDA_R_64F;
|
||||
} else if constexpr (std::is_same_v<Dtype, float>) {
|
||||
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
|
||||
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
|
||||
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||
}
|
||||
} else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) {
|
||||
@ -418,25 +422,40 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
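The fp16 and bf16 branches above encode the same three-way policy. Assuming the at::CuBLASReductionOption enumerators shown in the diff, the mapping to a cuBLASLt reduction-scheme mask can be summarized as a small sketch (not the PR's actual API; the default case mirrors the code above simply skipping the preference attribute):

#include <cstdint>
#include <cublasLt.h>

// Hypothetical mirror of the policy; enumerator names come from the diff.
enum class ReductionOption {
  AllowReducedPrecisionWithSplitK,
  DisallowReducedPrecisionAllowSplitK,
  DisallowReducedPrecisionDisallowSplitK,
};

uint32_t reduction_scheme_mask(ReductionOption opt) {
  switch (opt) {
    case ReductionOption::DisallowReducedPrecisionAllowSplitK:
      // split-K stays legal, but partial sums must use the compute type
      return CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
             CUBLASLT_REDUCTION_SCHEME_NONE;
    case ReductionOption::DisallowReducedPrecisionDisallowSplitK:
      // no reduction scheme at all, i.e. no split-K
      return CUBLASLT_REDUCTION_SCHEME_NONE;
    default:
      // every scheme allowed (equivalent to not setting the attribute)
      return CUBLASLT_REDUCTION_SCHEME_MASK;
  }
}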
} else {
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}

- globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -570,8 +589,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D

template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -583,8 +600,6 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {

template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -596,8 +611,6 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {

template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -611,8 +624,6 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co

template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -626,8 +637,6 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com

template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -699,8 +708,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP

template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@@ -1024,8 +1031,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty

template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1037,8 +1042,6 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {

template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1050,8 +1053,6 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {

template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1065,8 +1066,6 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp

template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1080,8 +1079,6 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl

template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1139,8 +1136,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
}
if (prop->major >= 5) {
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
- if (!at::globalContext().allowFP16ReductionCuBLAS()) {
- cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+ auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
+ TORCH_CHECK(fp16_reduction !=
+ at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
+ "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
+ "..., allow_splitk=False) requires the cuBLASLt backend");
+ if (fp16_reduction !=
+ at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+ cublas_flags = static_cast<cublasMath_t>(
+ cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
}
// Disallow fp16 reductions that could lead to unexpected overflow issues.
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
@@ -1190,7 +1194,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(

template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1200,8 +1203,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
GEMM_CHECK_ARGVALUES(at::BFloat16);
#ifndef USE_ROCM
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
- if (!at::globalContext().allowBF16ReductionCuBLAS()) {
- cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+ auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
+ TORCH_CHECK(bf16_reduction !=
+ at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
+ "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
+ "..., allow_splitk=False) requires the cuBLASLt backend");
+ if (bf16_reduction !=
+ at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+ cublas_flags = static_cast<cublasMath_t>(
+ cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
}
#endif
#if defined(USE_ROCM)
@@ -1579,7 +1589,7 @@ bool gemm_and_bias(
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
- if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
+ if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
} else if constexpr (std::is_same_v<Dtype, at::Half>) {
@@ -1597,18 +1607,34 @@ bool gemm_and_bias(
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
#ifndef USE_ROCM
- if (!at::globalContext().allowFP16ReductionCuBLAS()) {
- preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
- CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
+ auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
+ if (fp16_reduction !=
+ at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+ uint32_t mask =
+ fp16_reduction ==
+ at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
+ ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
+ CUBLASLT_REDUCTION_SCHEME_NONE)
+ : CUBLASLT_REDUCTION_SCHEME_NONE;
+ preference.setAttribute(
+ CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
}
#endif
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
#ifndef USE_ROCM
- if (!at::globalContext().allowBF16ReductionCuBLAS()) {
- preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
- CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
+ auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
+ if (bf16_reduction !=
+ at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
+ uint32_t mask =
+ bf16_reduction ==
+ at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
+ ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
+ CUBLASLT_REDUCTION_SCHEME_NONE)
+ : CUBLASLT_REDUCTION_SCHEME_NONE;
+ preference.setAttribute(
+ CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
}
#endif
}
@@ -2404,8 +2430,6 @@ void trsmBatched<c10::complex<double>>(

template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@@ -2421,8 +2445,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@@ -2435,8 +2457,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {

template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@@ -2450,8 +2470,6 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
- // See Note [Writing Nondeterministic Operations]
- globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

@@ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) {
offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4.");
// Ensures the increment does not cause overflow.
TORCH_INTERNAL_ASSERT(
- offset_intragraph_ <= std::numeric_limits<uint32_t>::max() - increment,
+ offset_intragraph_ <= std::numeric_limits<uint64_t>::max() - increment,
"Increment causes overflow in the offset value.");
offset_intragraph_ += increment;
} else {
@@ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) {
*/
PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) {
if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) {
- uint32_t offset = state_->offset_intragraph_;
+ uint64_t offset = state_->offset_intragraph_;
state_->increase(increment);
return PhiloxCudaState(
state_->seed_extragraph_.data_ptr<int64_t>(),

@@ -96,16 +96,16 @@ struct CUDAGraph;
struct CUDAGeneratorState : public c10::intrusive_ptr_target {
uint64_t seed_;
uint64_t philox_offset_per_thread_;
- uint32_t offset_intragraph_;
+ uint64_t offset_intragraph_;
bool capturing_{};
std::unordered_set<cuda::CUDAGraph*> registered_graphs_;
- at::TensorBase seed_extragraph_{};
- at::TensorBase offset_extragraph_{};
+ at::TensorBase seed_extragraph_;
+ at::TensorBase offset_extragraph_;

CUDAGeneratorState(
uint64_t seed = default_rng_seed_val,
uint64_t philox_offset_per_thread = 0,
- uint32_t offset_intragraph = 0)
+ uint64_t offset_intragraph = 0)
: seed_(seed),
philox_offset_per_thread_(philox_offset_per_thread),
offset_intragraph_(offset_intragraph) {}
@@ -167,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
CUDAGeneratorImpl* clone_impl() const override;

c10::intrusive_ptr<CUDAGeneratorState> state_;
- std::atomic_flag no_reset_rnn_state_{};
+ std::atomic_flag no_reset_rnn_state_;
};

namespace cuda::detail {

@@ -56,7 +56,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {

// the ID assigned by cuda during graph capture,
// used to identify when a stream is participating in capture
- CaptureId_t capture_id_ = -1;
+ CaptureId_t capture_id_ = 0;

// uuid used to request a particular private mempool from CUDACachingAllocator.
// By default, this will be set to {id_, 0}.

@@ -6,43 +6,15 @@
#define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch)
#endif

- // cuSparse Generic API added in CUDA 10.1
- // Windows support added in CUDA 11.0
- #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32)))
- #define AT_USE_CUSPARSE_GENERIC_API() 1
- #else
- #define AT_USE_CUSPARSE_GENERIC_API() 0
- #endif

- // cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0
- #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
- (CUSPARSE_VERSION < 12000)
- #define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1
- #else
- #define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0
- #endif

- #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
- (CUSPARSE_VERSION >= 12000)
- #define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1
- #else
- #define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0
- #endif

#if defined(USE_ROCM)
// hipSparse const API added in v2.4.0
#if HIPSPARSE_VERSION >= 200400
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#else
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#endif
#else // USE_ROCM
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_GENERIC_API() 0
#endif // USE_ROCM
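As a quick sanity check of the HIPSPARSE_VERSION encoding retained above (major*100000 + minor*100 + patch), the v2.4.0 cutoff for the hipSparse const API works out like this:

constexpr int hipsparse_version(int major, int minor, int patch) {
  return major * 100000 + minor * 100 + patch;
}

// hipSparse 2.4.0, the first release with the const descriptor API,
// encodes to 200400, matching the HIPSPARSE_VERSION >= 200400 gate.
static_assert(hipsparse_version(2, 4, 0) == 200400, "const API cutoff");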

@@ -12,8 +12,6 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) {
return cusparseDestroyDnMat(const_cast<cusparseDnMatDescr*>(dnMatDescr));
}

- #if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()

namespace {

// If a specific GPU model does not provide native support for a given data
@@ -210,6 +208,4 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
descriptor_.reset(raw_descriptor);
}

- #endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()

} // namespace at::cuda::sparse

@@ -35,7 +35,6 @@ class CuSparseDescriptor {
std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
};

- #if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
template <typename T, cusparseStatus_t (*destructor)(const T*)>
struct ConstCuSparseDescriptorDeleter {
void operator()(T* x) {
@@ -58,7 +57,6 @@ class ConstCuSparseDescriptor {
protected:
std::unique_ptr<T, ConstCuSparseDescriptorDeleter<T, destructor>> descriptor_;
};
- #endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS

#if defined(USE_ROCM)
using cusparseMatDescr = std::remove_pointer_t<hipsparseMatDescr_t>;
@@ -123,39 +121,8 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info

#endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE

- #if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()

cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);

- #if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS()
- class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
- : public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
- public:
- explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
- };

- class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor
- : public CuSparseDescriptor<const cusparseDnMatDescr, &destroyConstDnMat> {
- public:
- explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
- cusparseDnMatDescr* unsafe_mutable_descriptor() const {
- return const_cast<cusparseDnMatDescr*>(descriptor());
- }
- cusparseDnMatDescr* unsafe_mutable_descriptor() {
- return const_cast<cusparseDnMatDescr*>(descriptor());
- }
- };

- class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor
- : public CuSparseDescriptor<cusparseDnVecDescr, &cusparseDestroyDnVec> {
- public:
- explicit CuSparseDnVecDescriptor(const Tensor& input);
- };

- class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
- : public CuSparseDescriptor<cusparseSpMatDescr, &cusparseDestroySpMat> {};

- #elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
: public ConstCuSparseDescriptor<
cusparseDnMatDescr,
@@ -194,7 +161,6 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
: public ConstCuSparseDescriptor<
cusparseSpMatDescr,
&cusparseDestroySpMat> {};
- #endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()

class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
: public CuSparseSpMatDescriptor {
@@ -283,6 +249,4 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
}
};

- #endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()

} // namespace at::cuda::sparse

@@ -9,7 +9,6 @@

#include <cuda_runtime_api.h>
#include <future>
#include <unordered_map>

namespace at::cuda {
namespace {
@@ -72,9 +71,20 @@ using Block = HostBlock<CUDAStream>;
struct CUDACachingHostAllocatorImpl
: public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
private:
- std::unordered_map<void*, bool> use_host_register;
+ ska::flat_hash_map<void*, bool> use_host_register;

+ void allocate_host_memory(size_t size, void** ptr) override {
+ // try allocating from reserve segment first before calling into expensive APIs
+ if (get_reserve_segment().initialized()) {
+ *ptr = get_reserve_segment().allocate(size);
+ if (*ptr != nullptr) {
+ return;
+ }
+ }
+ allocate_host_memory_slowpath(size, ptr);
+ }

void allocate_host_memory_slowpath(size_t size, void** ptr) {
// Pinned memory pointers allocated by any device can be directly used by
// any other device, regardless of the current device at the time of
// allocation, since we assume unified addressing. So we grab any existing
@@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl
}

+ void free_block(Block* block) override {
+ // We never free blocks from the reserve segment
+ if (get_reserve_segment().initialized()) {
+ // Check if the block is from the reserve segment
+ if (get_reserve_segment().owns(block->ptr_)) {
+ return;
+ }
+ }

+ free_block_slowpath(block);
+ }

void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
@@ -172,6 +194,20 @@ struct CUDACachingHostAllocatorImpl
return event_pool->get(idx);
}

+ PinnedReserveSegment& get_reserve_segment() {
+ static auto reserve_segment = [&]() {
+ if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
+ void *ptr;
+ size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
+ allocate_host_memory_slowpath(sz, &ptr);
+ return PinnedReserveSegment(ptr, sz);
+ } else {
+ return PinnedReserveSegment();
+ }
+ } ();
+ return reserve_segment;
+ }

TaskThreadPool* getThreadPool() {
static TaskThreadPool* pool = new TaskThreadPool(
static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
@@ -186,15 +222,15 @@ struct CUDACachingHostAllocatorImpl
size_t numThreads,
size_t pageSize) {
uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
- uintptr_t end = (uintptr_t)start + (size / numThreads);
+ uintptr_t end = start + (size / numThreads);
if (i == (numThreads - 1)) {
end = (uintptr_t)ptr + size;
}

// pre-fault/map the pages by setting the first byte of the page
uintptr_t alignedStart =
- (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
- for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
+ ((start + pageSize - 1) & ~(pageSize - 1));
+ for (uintptr_t p = alignedStart; p < (end); p += pageSize) {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
memset((void*)p, 0, 1);
}
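The cleanup above removes redundant casts from the pre-faulting loop; its arithmetic, isolated as a sketch (assuming pageSize is a power of two):

#include <cstdint>
#include <cstring>

void prefault(uintptr_t start, uintptr_t end, size_t pageSize) {
  // round start up to the next pageSize boundary
  uintptr_t alignedStart = (start + pageSize - 1) & ~(pageSize - 1);
  for (uintptr_t p = alignedStart; p < end; p += pageSize) {
    // writing the first byte forces the OS to map the page
    memset(reinterpret_cast<void*>(p), 0, 1);
  }
}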

@@ -310,7 +310,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
// FP32 data type calculations based on the value of the allow_tf32 flag.
// To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH.
if (!NoTF32Guard::should_disable_tf32() &&
- at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
+ at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

@@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread

// Called by the destructor. Releases this thread's handles back into the pool.
void release() {
- if(my_handles.size() > 0) {
+ if(!my_handles.empty()) {
auto parent = weak_parent.lock();
if (!parent) {
// If this thread exits after atexit handlers have completed, the

@@ -19,7 +19,7 @@ struct PhiloxCudaState {
// Called if graph capture is underway
PhiloxCudaState(int64_t* seed,
int64_t* offset_extragraph,
- uint32_t offset_intragraph) {
+ uint64_t offset_intragraph) {
seed_.ptr = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
@@ -36,7 +36,7 @@ struct PhiloxCudaState {

Payload seed_{};
Payload offset_{};
- uint32_t offset_intragraph_ = 0;
+ uint64_t offset_intragraph_ = 0;
bool captured_ = false;
};
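The widening to uint64_t pairs with the overflow guard in CUDAGeneratorState::increase() shown earlier: testing x <= max - inc avoids the wrap-around that x + inc <= max would silently perform. A minimal sketch of the pattern:

#include <cstdint>
#include <limits>
#include <stdexcept>

void increase(uint64_t& offset, uint64_t increment) {
  // guard first, then add; the subtraction on the right cannot underflow
  if (offset > std::numeric_limits<uint64_t>::max() - increment) {
    throw std::overflow_error("Increment causes overflow in the offset value.");
  }
  offset += increment;
}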

@@ -162,7 +162,7 @@ inline std::string ComputeTypeFor() {
// ROCBLAS and hipBLASLt.
template <>
inline std::string ComputeTypeFor<float>() {
- if (at::globalContext().float32Precision("cuda", "matmul") != "tf32") {
+ if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) {
return "f32_r";
} else {
return "xf32_r";

@@ -506,7 +506,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
}

hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
- if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
+ if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
}
HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);

@@ -141,7 +141,7 @@ class RocblasGemmOp : public Callable<GemmParams<T>> {

TuningStatus Call(const GemmParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
- if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r)
+ if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);
@@ -209,7 +209,7 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>

TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
- if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r)
+ if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);

@@ -404,8 +404,6 @@ TuningContext::TuningContext() :
max_warmup_iterations_{0},
icache_flush_{true},
rotating_buffer_size_{-1},
filename_{},
untuned_file_{},
results_count_from_input_file_{0},
is_shutting_down_{false}
{

@@ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
size[i] = (int) t.size(i);
}
for (const auto i : c10::irange(dim, pad)) {
- size[i] = (int) 1;
+ size[i] = 1;
}
dim = std::max(dim, pad);
cudnnTensorFormat_t filter_format{};

@@ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper;

template <char const *op_name, typename F, F Func, typename A, typename... T>
struct LinalgCheckMatrixUnaryRuleHelper<op_name, F, Func, typelist<A, T...>> {
- static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
+ static Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions.");
return moveBatchDimToFront(tensor, batch_dim);
}
@@ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper;

template <char const *op_name, typename F, F Func, typename A, typename B, typename... T>
struct LinalgCheckMatrixBinaryRuleHelper<op_name, F, Func, typelist<A, B, T...>> {
- static inline std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
+ static std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
const Tensor& first, std::optional<int64_t> first_bdim,
const Tensor& second, std::optional<int64_t> second_bdim) {
TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2,

@@ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y,
template<typename scalar_t>
scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy);

- static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) {
+ static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) {
return n == 1 || lda >= std::max<int64_t>(1L, m);
}

@@ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) cons
template <typename key_t, typename value_t>
struct KernelCache {
using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
- static inline std::shared_ptr<value_t>&& fetch_or_create(
+ static std::shared_ptr<value_t>&& fetch_or_create(
const key_t& key,
const std::function<std::shared_ptr<value_t>()>& callback) {
auto&& search = get_store().find(key);
@@ -1003,7 +1003,7 @@ struct KernelCache {
}
}

- static inline kstore_t& get_store() {
+ static kstore_t& get_store() {
static thread_local kstore_t cache_kernels;
return cache_kernels;
}
@@ -1067,7 +1067,7 @@ struct GemmHelper {
struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
// Fetch/create GemmHelper object and execute brgemm with batch size = 1
template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
- static inline void call(
+ static void call(
int64_t M,
int64_t N,
int64_t K,
@@ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
.execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
}

- static inline std::shared_ptr<GemmHelper>& get_current() {
+ static std::shared_ptr<GemmHelper>& get_current() {
static thread_local std::shared_ptr<GemmHelper> current;
return current;
}

- static inline bool device_check(ScalarType dtype) {
+ static bool device_check(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
@@ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B;
using pack_t = dnnl::ukernel::transform;
#endif
struct Pack : public KernelCache <PackKey, pack_t> {
- static inline void call(
+ static void call(
int64_t K,
int64_t N,
int64_t ld_in,
@@ -1182,7 +1182,7 @@ struct Pack : public KernelCache <PackKey, pack_t> {
}
}

- static inline bool could_pack(ScalarType dtype) {
+ static bool could_pack(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}

@@ -702,7 +702,7 @@ static void check_shape_forward(const at::Tensor& input,
// If kernel size is incorrect
std::ostringstream input_ss;
std::ostringstream kernel_ss;
- std::string separator = "";
+ std::string separator;

for (int i = 0, len = input_shape.size(); i < len; ++i) {
input_ss << separator << input_shape[i];
@@ -1019,7 +1019,7 @@ static Tensor convolution_same(

if (symmetric_padding) {
// All backends handle symmetric padding natively
- SymDimVector output_padding(static_cast<size_t>(dim));
+ SymDimVector output_padding(dim);
return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
false, output_padding, groups);
}
@@ -1039,7 +1039,7 @@ static Tensor convolution_same(
}
}
auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
- SymDimVector output_padding(static_cast<size_t>(dim));
+ SymDimVector output_padding(dim);
return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
dilation, false, output_padding, groups);
}
@@ -1174,7 +1174,7 @@ at::Tensor convolution(
bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
return at::_convolution(input, weight, bias, stride, padding, dilation,
transposed, output_padding, groups,
- ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN("conv"));
+ ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV));
}

at::Tensor convolution_overrideable(
@@ -1319,7 +1319,7 @@ ConvBackend select_conv_backend(
params.benchmark = ctx.benchmarkCuDNN();
params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
params.cudnn_enabled = ctx.userEnabledCuDNN();
- params.allow_tf32 = ctx.allowTF32CuDNN("conv");
+ params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);

auto input = input_r;
auto weight = weight_r;
@@ -1699,7 +1699,7 @@ at::Tensor _convolution(
c10::MaybeOwned<Tensor> bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt);
const Tensor& bias_r = *bias_r_maybe_owned;

- return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN("conv"));
+ return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV));
}

std::tuple<Tensor, Tensor, Tensor> convolution_backward_overrideable(
@@ -1997,7 +1997,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
params.benchmark = ctx.benchmarkCuDNN();
params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
params.cudnn_enabled = ctx.userEnabledCuDNN();
- params.allow_tf32 = ctx.allowTF32CuDNN("conv");
+ params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);

// Validate inputs.
check_shape_backward(input, weight.sizes(), params);

@@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Copy.h>
- #include <ATen/native/Copy.h>

#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>

@@ -70,7 +70,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
new_shape.emplace_back(input_sizes[i]);
}

- for (const auto i : c10::irange((size_t)l_pad)) {
+ for (const auto i : c10::irange(l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",

@@ -47,7 +47,7 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
- size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
+ size_d = std::ceil((end.to<double>() - start.to<double>())
/ step.to<double>());
}
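The simplified else-branch computes the arange length as ceil((end - start) / step); a worked example:

#include <cassert>
#include <cmath>
#include <cstdint>

int64_t arange_size(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

int main() {
  assert(arange_size(0.0, 10.0, 3.0) == 4);  // elements: 0, 3, 6, 9
}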

@@ -107,11 +107,6 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) {
storage->set_nbytes(size_bytes);
}

- // Call the sparse implementation in SparseTensor.cpp directly.
- // A dynamic dispatch here is NOT necessary, so I didn't put
- // this function in native_functions.yaml
- const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src);

// TODO(VitalyFedyunin): Move it to HTML docs.
//
// Strides of the output tensor of `resize_as_` operator is defined by input

@@ -145,12 +145,6 @@
#include <utility>
#include <vector>

- namespace at::native {

- AdvancedIndex make_info(Tensor self, IOptTensorListRef orig);

- } // namespace at::native

namespace at::meta {

TORCH_META_FUNC(gather)

@@ -73,7 +73,6 @@
#include <ATen/ops/where_native.h>
#include <ATen/ops/zeros_like.h>

- #include <iostream>
#include <utility>
#endif

@@ -1880,34 +1880,43 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {

Tensor xtensor = self.expand(padded_size);

+ Tensor urtensor;
+ if (self.is_quantized()) {
+ urtensor = at::empty_quantized(target_size, self);
+ } else {
+ urtensor = at::empty(target_size, self.options());
+ }

// return an empty tensor if one of the repeat dimensions is zero
if (zero_tensor) {
- return self.is_quantized() ? at::empty_quantized(target_size, self)
- : at::empty(target_size, self.options());
+ return urtensor;
}

- // Create view of shape [r0, s0, r1, s1, ...]
- // where ri is repeat[i], si is self.size(i).
- Tensor view = xtensor;
- auto expand_shape = std::vector<int64_t>();
- expand_shape.reserve(xtensor.dim() * 2);
for (const auto i : c10::irange(xtensor.dim())) {
- view = view.unsqueeze(2 * i);
- expand_shape.push_back(repeats[i]);
- expand_shape.push_back(xtensor.size(i));
+ // can't unfold with step 0, so make sure step is at least 1
+ // (it doesn't matter what it is in that case, because the size is 0).
+ auto size_i = xtensor.sizes()[i];
+ urtensor = urtensor.unfold(i, size_i, std::max<int64_t>(size_i, 1));
}
- // expanded_view is non-contiguous because .expand set stride to 0.
- auto expanded_view = view.expand(expand_shape);

- // copy to contiguous tensor.
- auto contiguous_copy = at::empty(
- expanded_view.sizes(),
- expanded_view.options(),
- at::MemoryFormat::Contiguous);
- contiguous_copy.copy_(expanded_view);
+ urtensor.copy_(xtensor.expand_as(urtensor));

- // Reshape to [s0 * r0, s1 * r1, ...].
- // No extra copy of data during reshape for a contiguous tensor.
- return contiguous_copy.view(target_size);
+ // Combine the dimensions to produce the target_size.
+ // xtensor dims: [a0, ..., ad-1]
+ // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
+ // b dims are produced by unfold.
+ // Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
+ const int64_t n_dims = xtensor.dim();
+ auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
+ auto range_b = range_a + n_dims;
+ auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
+ auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
+ // Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
+ urtensor = urtensor.permute(permutation);
+ // Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
+ urtensor = urtensor.reshape(target_size);

+ return urtensor;
}
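A shape walk-through of the unfold-based repeat introduced above, using the public libtorch API (illustrative only; the internal shape names follow the diff's comments):

#include <torch/torch.h>

int main() {
  auto t = torch::arange(6).reshape({2, 3});  // sizes [2, 3]
  auto r = t.repeat({2, 4});                  // target_size = [2*2, 3*4]
  // Internally: urtensor [4, 12] unfolds per dim to [2, 4, 2, 3]
  // ([a0, a1, b0, b1]), is filled from t.expand_as(...), then
  // permuted and reshaped back to [4, 12].
  std::vector<int64_t> expected{4, 12};
  TORCH_CHECK(r.sizes().vec() == expected);
}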

Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {
@@ -2058,7 +2067,7 @@ Tensor _reshape_copy_symint(
TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors");
}

- if (self.is_contiguous()) {
+ if (self.is_contiguous_or_false()) {
return self.view_symint(shape).clone(at::MemoryFormat::Contiguous);
} else {
return at::_unsafe_view_symint(

@@ -124,7 +124,7 @@ struct IsUnique {};

template <typename scalar_t>
struct IsUnique<scalar_t, false> {
- inline bool operator() (scalar_t* data_ptr, int64_t i) {
+ bool operator() (scalar_t* data_ptr, int64_t i) {
if (i == 0) { return true; }
return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]);
}
@@ -132,7 +132,7 @@ struct IsUnique<scalar_t, false> {

template <typename scalar_t>
struct IsUnique<scalar_t, true> {
- inline bool operator() (scalar_t* data_ptr, int64_t i) {
+ bool operator() (scalar_t* data_ptr, int64_t i) {
if (i == 0) { return true; }
return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]))
&& !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1]));

@@ -4,7 +4,6 @@

#include <ATen/OpMathType.h>
#include <ATen/TensorUtils.h>
- #include <ATen/OpMathType.h>
#include <ATen/core/Tensor.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>

@@ -17,7 +17,7 @@

namespace ao::sparse {

int register_linear_params();

#ifdef USE_FBGEMM

@@ -20,7 +20,7 @@

namespace ao::sparse {

int register_linear_params();

#ifdef USE_FBGEMM
namespace {

@@ -16,7 +16,7 @@
#endif

namespace ao::sparse {
int register_linear_params();

#ifdef USE_FBGEMM

@@ -22,7 +22,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue)
old_value.floatV = *dst;
new_value.floatV = old_value.floatV + fvalue;

- unsigned* old_intV = (unsigned*)(&old_value.intV);
+ unsigned* old_intV = &old_value.intV;
while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) {
#ifdef __aarch64__
__asm__ __volatile__("yield;" : : : "memory");
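The cast cleanup above sits inside the usual compare-and-swap retry loop. A portable, self-contained sketch of the same float accumulation using std::atomic:

#include <atomic>
#include <cstring>

void atomic_add_float(std::atomic<unsigned>* dst, float value) {
  unsigned old_bits = dst->load(std::memory_order_relaxed);
  for (;;) {
    float old_f;
    std::memcpy(&old_f, &old_bits, sizeof(float));
    float new_f = old_f + value;
    unsigned new_bits;
    std::memcpy(&new_bits, &new_f, sizeof(float));
    // compare_exchange refreshes old_bits with the current value on failure,
    // so each retry recomputes the sum against the latest contents.
    if (dst->compare_exchange_weak(old_bits, new_bits)) {
      break;
    }
  }
}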
@@ -118,7 +118,7 @@ gemm_notrans_(
scale_(m, n, beta, c, ldc);

// c += alpha * (a @ b)
- const uint64_t unsigned_m = static_cast<int64_t>(m);
+ const uint64_t unsigned_m = m;
const uint64_t i_m = unsigned_m / 4;
for (const uint64_t l : c10::irange(k)) {
for (const uint64_t j : c10::irange(n)) {

@@ -8,7 +8,6 @@
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>
#include <ATen/native/cpu/utils.h>
- #include <ATen/OpMathType.h>

namespace at::native {
inline namespace CPU_CAPABILITY {

@@ -17,7 +17,6 @@
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/irange.h>
- #include <ATen/OpMathType.h>

// [Note AVX-SSE transitions] In general we avoid calls into cmath for code
// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in

@@ -240,7 +240,7 @@ static void unfolded2d_copy(
int64_t output_height,
int64_t output_width) {
at::parallel_for(
- 0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) {
+ 0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) {
for (const auto k : c10::irange(start, end)) {
int64_t nip = k / (kH * kW);
int64_t rest = k % (kH * kW);
@@ -316,7 +316,7 @@ static void unfolded2d_copy(
for (int64_t x = 0; x < output_width; x++)
memcpy(
dst + (size_t)y * output_width + x,
- src + (size_t)iy * input_width + ix + (int64_t)x * dW,
+ src + (size_t)iy * input_width + ix + x * dW,
sizeof(scalar_t) * (1));
}
}

@@ -906,7 +906,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel(
// Round to nearest integer
const int32_t nudged_zero_point0 = lrintf(zero_point0);

- int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride;
+ int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride;

// LHS offset at the beginning of the row
*((float*)(dst_ptr)) = recip_scale0;
@@ -1048,7 +1048,7 @@ static void ref_dyn_quant_matmul_4bit_groupwise_kernel(
zero_point0 = (std::min)(zero_point0, qmax);
const int32_t nudged_zero_point0 = lrintf(zero_point0);

- int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride;
+ int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride;

*((float*)(dst_ptr)) = recip_scale0;
dst_ptr += sizeof(float);

@@ -1375,7 +1375,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
&& ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
- || (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
+ || (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty())))) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat1,
@@ -1919,7 +1919,7 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");

- addmm_out_cuda_impl(const_cast<Tensor&>(out), out, self, mat2, 0, 1);
+ addmm_out_cuda_impl(out, out, self, mat2, 0, 1);

return out;
}

@@ -8,7 +8,6 @@
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/DynamicLibrary.h>
- #include <ATen/NativeFunctions.h>
#include <ATen/native/cuda/MiscUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/LinearAlgebra.h>

@@ -102,13 +102,7 @@ __host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::comple
}

void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim) {
- // Compile time for CUDA-11.4 is 3x slower than with CUDA-11.6+, specifically for complex numbers
- #if defined(FBCODE_CAFFE2) || defined(OVRSOURCE)
- #define _LCME_DISPATCH AT_DISPATCH_FLOATING_TYPES_AND2
- #else
- #define _LCME_DISPATCH AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2
- #endif
- _LCME_DISPATCH(ScalarType::Half, ScalarType::BFloat16,
+ AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16,
self.scalar_type(), "logcumsumexp_cuda",
[&]() {
using opmath_t = at::opmath_type<scalar_t>;

@@ -1041,8 +1041,8 @@ std::string generate_code(
// and `extra_args` for computation call if
// extra arguments to capture runtime state are passed.
// (look at polygamma for example).
- std::string extra_params = "";
- std::string extra_args = "";
+ std::string extra_params;
+ std::string extra_args;
for (size_t i = 0; i < extra_args_typenames.size(); i++) {
auto type = std::string(extra_args_typenames[i]);
auto name = "extra_arg_" + std::to_string(i);
@@ -1352,7 +1352,7 @@ std::string generate_reduction_code(
int vec_size,
int max_threads_codegen) {
TORCH_INTERNAL_ASSERT(desc.nInputs == 1);
- TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0);
+ TORCH_INTERNAL_ASSERT(desc.extra_args_types.empty());

return generate_reduction_code(
desc.nOutputs,
@@ -1451,7 +1451,7 @@ std::optional<std::string> get_cache_dir() {
std::string cache_dir;
char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH");
// Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user
- std::string kernels_cache_dir = "";
+ std::string kernels_cache_dir;
if (ptkcp != nullptr) {
cache_dir = std::string(ptkcp);
} else {

@@ -14,7 +14,6 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/cuda/MiscUtils.h>
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
#include <ATen/native/cuda/linalg/MagmaUtils.h>
#include <ATen/native/cpu/zmath.h>
@@ -1615,16 +1614,7 @@ static void lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& i
const auto preferred_backend = at::globalContext().linalgPreferredBackend();
#ifdef USE_LINALG_SOLVER
const auto lu_factor_cusolver = [batch_size, m, n](const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
- // In CUDA 10.2, lu_factor_looped_cusolver does not finish the computations when the input
- // matrix is exactly singular. The returned pivots contain garbage. This breaks linalg.det
- // Now, batched_cublas does not handle rectangular matrices, so we still dispatch to
- // looped_cusolver even if m != n.
- #ifdef USE_ROCM
- constexpr bool looped_correct = true;
- #else
- constexpr bool looped_correct = CUSOLVER_VERSION >= 11100;
- #endif
- if (m != n || (looped_correct && (batch_size == 1 || m >= 512))) {
+ if (m != n || (batch_size == 1 || m >= 512)) {
lu_factor_looped_cusolver(input, pivots, infos, compute_pivots);
} else {
lu_factor_batched_cublas(input, pivots, infos, compute_pivots);

@@ -127,8 +127,7 @@ void apply_ldl_solve_cusolver(
const Tensor& pivots,
const Tensor& B,
bool upper) {
- #if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && \
- CUSOLVER_VERSION >= 11102)
+ #if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION))
TORCH_CHECK(
false,
"Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ",

@@ -169,7 +169,10 @@ std::string repro_from_args(const ConvolutionParams& params) {
ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n";
ss << "import torch\n";
ss << "torch.backends.cuda.matmul.allow_tf32 = "
- << pybool(at::globalContext().float32Precision("cuda", "matmul") == "tf32")
+ << pybool(
+ at::globalContext().float32Precision(
+ at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
+ at::Float32Precision::TF32)
<< "\n";
ss << "torch.backends.cudnn.benchmark = "
<< pybool(at::globalContext().benchmarkCuDNN()) << "\n";
@@ -726,7 +729,7 @@ Tensor cudnn_convolution_relu(

auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
- bool allow_tf32 = ctx.allowTF32CuDNN("conv");
+ bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
@@ -784,7 +787,7 @@ Tensor cudnn_convolution_add_relu(
}

auto& ctx = at::globalContext();
- bool allow_tf32 = ctx.allowTF32CuDNN("conv");
+ bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user