Compare commits


2 Commits

655 changed files with 6359 additions and 17484 deletions

View File

@ -13,6 +13,49 @@ def list_dir(path: str) -> list[str]:
return check_output(["ls", "-1", path]).decode().split("\n")
def build_ArmComputeLibrary() -> None:
"""
Using ArmComputeLibrary for aarch64 PyTorch
"""
print("Building Arm Compute Library")
acl_build_flags = [
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
acl_install_dir = "/acl"
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
if os.path.isdir(acl_install_dir):
shutil.rmtree(acl_install_dir)
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
check_call(
[
"git",
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"--depth",
"1",
"--shallow-submodules",
]
)
check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
@ -313,13 +356,19 @@ if __name__ == "__main__":
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
build_ArmComputeLibrary()
print("build pytorch with mkldnn+acl backend")
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
build_vars += (
"USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
"ACL_ROOT_DIR=/acl "
"LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
"ACL_INCLUDE_DIR=/acl/build "
"ACL_LIBRARY=/acl/build "
)
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
else:
print("build pytorch without mkldnn backend")

View File

@ -299,6 +299,40 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
)
def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building OpenBLAS")
host.run_cmd(
f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"
)
make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
host.run_cmd(
f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS"
)
def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building Arm Compute Library")
acl_build_flags = " ".join(
[
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
)
host.run_cmd(
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
)
host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
@ -666,6 +700,7 @@ def start_build(
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
build_OpenBLAS(host, git_clone_flags)
if host.using_docker():
print("Move libgfortant.a into a standard location")
@ -688,8 +723,6 @@ def start_build(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
@ -710,18 +743,16 @@ def start_build(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
build_ArmComputeLibrary(host, git_clone_flags)
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && "
f"{build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
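
The repair step above relies on LD_LIBRARY_PATH so auditwheel can resolve the ACL and PyTorch build libraries it grafts into the wheel. A small illustrative sketch of the same step driven from Python; the paths are the ones used in this hunk and may differ between variants:

```python
import os
import subprocess


def repair_wheel(wheel_path: str, extra_lib_dirs: list[str]) -> None:
    # Prepend the build trees so auditwheel can find libarm_compute, OpenBLAS
    # and the freshly built torch libraries while repairing the wheel.
    env = os.environ.copy()
    env["LD_LIBRARY_PATH"] = ":".join(
        extra_lib_dirs + [env.get("LD_LIBRARY_PATH", "")]
    ).rstrip(":")
    subprocess.check_call(["auditwheel", "repair", wheel_path], env=env)


# Mirroring the command above (wheel name is a placeholder):
# repair_wheel("pytorch/dist/torch-<version>.whl",
#              ["/acl/build", "pytorch/build/lib"])
```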
@ -877,7 +908,7 @@ def terminate_instances(instance_type: str) -> None:
def parse_arguments():
from argparse import ArgumentParser
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")

View File

@ -1 +1 @@
v2.27.5-1
v2.28.3-1

View File

@ -1 +1 @@
v2.27.7-1
v2.28.3-1

View File

@ -1 +1 @@
27664085f804afc83df26f740bb46c365854f2c4
bbb06c0334a6772b92d24bde54956e675c8c6604

.ci/docker/common/install_acl.sh Executable file → Normal file
View File

@ -1,27 +1,16 @@
#!/bin/bash
# Script used only in CD pipeline
set -euo pipefail
set -eux
ACL_VERSION=${ACL_VERSION:-"v25.02"}
ACL_INSTALL_DIR="/acl"
readonly version=v25.02
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary
# Clone ACL
git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules
[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
cd ${src_repo}
git checkout $version
ACL_CHECKOUT_DIR="ComputeLibrary"
# Build with scons
pushd $ACL_CHECKOUT_DIR
scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
os=linux arch=armv8a build=native multi_isa=1 \
fixed_format_kernels=1 openmp=1 cppthreads=0
popd
# Install ACL
sudo mkdir -p ${ACL_INSTALL_DIR}
for d in arm_compute include utils support src build
do
sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}
done
rm -rf $ACL_CHECKOUT_DIR
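
Both variants of the script end up with ACL sources and scons outputs laid out for ACL_ROOT_DIR=/acl. A quick sanity check along these lines (an assumption, not part of install_acl.sh) can confirm the layout before PyTorch is configured against it:

```python
import glob
import os
import sys

ACL_INSTALL_DIR = "/acl"
EXPECTED_DIRS = ["arm_compute", "include", "utils", "support", "src", "build"]


def check_acl_install() -> int:
    missing = [
        d for d in EXPECTED_DIRS
        if not os.path.isdir(os.path.join(ACL_INSTALL_DIR, d))
    ]
    # scons places libarm_compute* under the build directory.
    libs = glob.glob(os.path.join(ACL_INSTALL_DIR, "build", "libarm_compute*"))
    if missing or not libs:
        print(f"missing dirs: {missing}, libraries found: {libs}")
        return 1
    print("ACL install looks complete")
    return 0


if __name__ == "__main__":
    sys.exit(check_acl_install())
```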

.ci/docker/common/install_openblas.sh Executable file → Normal file
View File

@ -3,10 +3,8 @@
set -ex
OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}
# Clone OpenBLAS
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
@ -19,7 +17,5 @@ CFLAGS=-O3
BUILD_BFLOAT16=1
"
make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR
sudo make install -C $OPENBLAS_CHECKOUT_DIR
rm -rf $OPENBLAS_CHECKOUT_DIR
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
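
For reference, the same OpenBLAS build and install driven from Python. The flag values are the ones visible in this diff (the full OPENBLAS_BUILD_FLAGS block in install_openblas.sh is longer than the hunk shows), and the helper itself is illustrative only:

```python
import subprocess

OPENBLAS_BUILD_FLAGS = [
    "NUM_THREADS=64",
    "USE_OPENMP=1",
    "NO_SHARED=1",
    "DYNAMIC_ARCH=1",
    "TARGET=ARMV8",
    "CFLAGS=-O3",
    "BUILD_BFLOAT16=1",
]


def build_openblas(checkout_dir: str = "OpenBLAS", jobs: int = 8) -> None:
    # make -j8 ${OPENBLAS_BUILD_FLAGS} -C OpenBLAS
    subprocess.check_call(
        ["make", f"-j{jobs}", *OPENBLAS_BUILD_FLAGS, "-C", checkout_dir]
    )
    # make -j8 ${OPENBLAS_BUILD_FLAGS} install -C OpenBLAS
    subprocess.check_call(
        ["make", f"-j{jobs}", *OPENBLAS_BUILD_FLAGS, "install", "-C", checkout_dir]
    )
```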

View File

@ -1,9 +0,0 @@
#!/bin/bash
set -xe
# Script used in Linux x86 and aarch64 CD pipeline
# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols.
# see: https://github.com/pytorch/pytorch/issues/133437
LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a)
nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20- > weaken-symbols.txt
objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED
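
The script above weakens the recursive_directory_iterator symbols in libstdc++_nonshared.a. An illustrative Python equivalent of the same nm/objcopy pipeline (hypothetical helper, assuming gcc, nm and objcopy are on PATH):

```python
import subprocess


def weaken_recursive_directory_iterator_symbols() -> None:
    lib = subprocess.check_output(
        ["gcc", "-print-file-name=libstdc++_nonshared.a"], text=True
    ).strip()
    # Global text ("T") symbols mentioning recursive_directory_iterator.
    nm_out = subprocess.check_output(["nm", "-g", lib], text=True)
    symbols = [
        line.split()[-1]
        for line in nm_out.splitlines()
        if " T " in line and "recursive_directory_iterator" in line
    ]
    with open("weaken-symbols.txt", "w") as f:
        f.write("\n".join(symbols) + "\n")
    # Mark them weak in place, as the objcopy invocation above does.
    subprocess.check_call(
        ["objcopy", "--weaken-symbols", "weaken-symbols.txt", lib, lib]
    )
```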

View File

@ -130,8 +130,7 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
/opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
done;
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# cmake-3.18.4 from pip; force in case cmake3 already exists
RUN yum install -y python3-pip && \

View File

@ -62,13 +62,6 @@ ARG OPENBLAS_VERSION
ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh
# Install Arm Compute Library
FROM base as arm_compute
# use python3.9 to install scons
RUN python3.9 -m pip install scons==4.7.0
RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
COPY ./common/install_acl.sh install_acl.sh
RUN bash ./install_acl.sh && rm install_acl.sh
FROM base as final
# remove unnecessary python versions
@ -77,7 +70,4 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
COPY --from=arm_compute /acl /acl
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
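
The final stage wires the OpenBLAS tree, and in one variant the ACL tree, into LD_LIBRARY_PATH. A tiny, hypothetical smoke test that could be run inside the image while debugging, to confirm the runtime libraries are actually present and loadable:

```python
import ctypes
import os

# Loading by explicit path is the most direct check that the ACL library the
# image exposes via /acl/build is present and loadable (only meaningful in
# the variant that ships /acl).
ctypes.CDLL("/acl/build/libarm_compute.so")
print("libarm_compute.so loads")

# The OpenBLAS tree copied from the openblas stage should expose its
# libraries here.
print(os.listdir("/opt/OpenBLAS/lib"))
```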

View File

@ -86,15 +86,6 @@ FROM base as nvpl
ADD ./common/install_nvpl.sh install_nvpl.sh
RUN bash ./install_nvpl.sh && rm install_nvpl.sh
# Install Arm Compute Library
FROM base as arm_compute
# use python3.9 to install scons
RUN python3.9 -m pip install scons==4.7.0
RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
COPY ./common/install_acl.sh install_acl.sh
RUN bash ./install_acl.sh && rm install_acl.sh
FROM base as final
FROM final as cuda_final
ARG BASE_CUDA_VERSION
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
@ -102,9 +93,5 @@ COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/
COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/
COPY --from=arm_compute /acl /acl
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

View File

@ -28,7 +28,6 @@ fi
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
ACL_VERSION=${ACL_VERSION:-}
case ${image} in
manylinux2_28-builder:cpu)
@ -42,6 +41,7 @@ case ${image} in
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
OPENBLAS_VERSION="v0.3.30"
;;
manylinuxs390x-builder:cpu-s390x)
TARGET=final
@ -119,8 +119,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \
--build-arg "ACL_VERSION=${ACL_VERSION:-}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
--target "${TARGET}" \
-t "${tmp_tag}" \
$@ \

View File

@ -52,10 +52,10 @@ flatbuffers==24.12.23
#Pinned versions: 24.12.23
#test that import:
hypothesis==6.56.4
hypothesis==5.35.1
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
#Description: advanced library for generating parametrized tests
#Pinned versions: 6.56.4
#Pinned versions: 5.35.1
#test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py
junitparser==2.1.1
@ -98,7 +98,7 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:
mypy==1.16.0 ; platform_system == "Linux"
mypy==1.16.0 ; platform_system != "Windows"
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
@ -169,7 +169,7 @@ optree==0.13.0
pillow==11.0.0
#Description: Python Imaging Library fork
#Pinned versions: 11.0.0
#Pinned versions: 10.3.0
#test that import:
protobuf==5.29.5
@ -217,7 +217,7 @@ pytest-subtests==0.13.1
#Pinned versions:
#test that import:
xdoctest==1.3.0
xdoctest==1.1.0
#Description: runs doctests in pytest
#Pinned versions: 1.1.0
#test that import:
@ -268,7 +268,7 @@ scipy==1.14.1 ; python_version >= "3.12"
#test that import:
# needed by torchgen utils
typing-extensions==4.12.2
typing-extensions>=4.10.0
#Description: type hints for python
#Pinned versions:
#test that import:
@ -361,10 +361,9 @@ pwlf==2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml==6.0.2
pyyaml
pyzstd
setuptools==78.1.1
packaging==23.1
setuptools>=70.1.0
six
scons==4.5.2 ; platform_machine == "aarch64"
@ -385,10 +384,7 @@ cmake==3.31.6
tlparse==0.4.0
#Description: required for log parsing
filelock==3.18.0
#Description: required for inductor testing
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin"
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
#test that import: test_cuda.py

View File

@ -9,7 +9,7 @@ standard-imghdr==3.13.0; python_version >= "3.13"
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.

View File

@ -107,10 +107,6 @@ if [[ $ROCM_INT -ge 60200 ]]; then
ROCM_SO_FILES+=("librocm-core.so")
fi
if [[ $ROCM_INT -ge 70000 ]]; then
ROCM_SO_FILES+=("librocroller.so")
fi
OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"

View File

@ -89,7 +89,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/acl
export ACL_ROOT_DIR=/ComputeLibrary
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

View File

@ -67,7 +67,7 @@ fi
# wheels with cxx11-abi
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
if [[ "$(uname)" != 'Darwin' ]]; then
# We also check that there are cxx11 symbols in libtorch
#
echo "Checking that symbols in libtorch.so have the right gcc abi"

View File

@ -26,7 +26,6 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
time python test/run_test.py --verbose -i distributed/test_pg_wrapper

View File

@ -32,9 +32,6 @@ LIBTORCH_NAMESPACE_LIST = (
"torch::",
)
# Patterns for detecting statically linked libstdc++ symbols
STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]
def _apply_libtorch_symbols(symbols):
return [
@ -56,17 +53,12 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]:
return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]
def grep_symbols(
lib: str, patterns: list[Any], symbol_type: str | None = None
) -> list[str]:
def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
def _grep_symbols(
symbols: list[tuple[str, str, str]], patterns: list[Any]
) -> list[str]:
rc = []
for _s_addr, _s_type, s_name in symbols:
# Filter by symbol type if specified
if symbol_type and _s_type != symbol_type:
continue
for pattern in patterns:
if pattern.match(s_name):
rc.append(s_name)
@ -88,18 +80,6 @@ def grep_symbols(
return functools.reduce(list.__add__, (x.result() for x in tasks), [])
def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
cxx11_statically_linked_symbols = grep_symbols(
lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T"
)
num_statically_linked_symbols = len(cxx11_statically_linked_symbols)
print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}")
if num_statically_linked_symbols > 0:
raise RuntimeError(
f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}"
)
def check_lib_symbols_for_abi_correctness(lib: str) -> None:
print(f"lib: {lib}")
cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
@ -127,7 +107,6 @@ def main() -> None:
libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)
if __name__ == "__main__":
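
The check in this hunk greps defined text symbols for statically linked libstdc++ recursive_directory_iterator symbols. A standalone, simplified sketch of the same idea (the real script shares get_symbols/grep_symbols with the other ABI checks):

```python
import re
import subprocess

STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]


def grep_symbols(lib: str, patterns, symbol_type=None) -> list[str]:
    # "nm -D -g" lists exported dynamic symbols; a statically linked copy of
    # these libstdc++ routines shows up as a defined text symbol, type "T".
    out = subprocess.check_output(["nm", "-D", "-g", lib], text=True)
    matches = []
    for line in out.splitlines():
        parts = line.split()
        if len(parts) != 3:
            continue  # skips undefined symbols and blank lines
        _addr, s_type, s_name = parts
        if symbol_type is not None and s_type != symbol_type:
            continue
        if any(p.match(s_name) for p in patterns):
            matches.append(s_name)
    return matches


def check_statically_linked(lib: str) -> None:
    bad = grep_symbols(lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T")
    if bad:
        raise RuntimeError(f"Found statically linked libstdc++ symbols: {bad[:10]}")


# e.g. check_statically_linked("torch/lib/libtorch_cpu.so")
```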

View File

@ -435,7 +435,7 @@ test_inductor_distributed() {
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
# with if required # gpus aren't available
python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose
python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
assert_git_not_dirty
}

View File

@ -1,32 +0,0 @@
#!/bin/bash
set -ex -o pipefail
# Suppress ANSI color escape sequences
export TERM=vt100
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
echo "Environment variables"
env
echo "Testing FA3 stable wheel still works with currently built torch"
echo "Installing ABI Stable FA3 wheel"
# The wheel was built on https://github.com/Dao-AILab/flash-attention/commit/b3846b059bf6b143d1cd56879933be30a9f78c81
# on torch nightly torch==2.9.0.dev20250830+cu129
$MAYBE_SUDO pip -q install https://s3.amazonaws.com/ossci-linux/wheels/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
pushd flash-attention/hopper
export PYTHONPATH=$PWD
pytest -v -s \
"test_flash_attn.py::test_flash_attn_output[1-1-192-False-False-False-0.0-False-False-mha-dtype0]" \
"test_flash_attn.py::test_flash_attn_varlen_output[511-1-64-True-False-False-0.0-False-False-gqa-dtype2]" \
"test_flash_attn.py::test_flash_attn_kvcache[1-128-128-False-False-True-None-0.0-False-False-True-False-True-False-gqa-dtype0]" \
"test_flash_attn.py::test_flash_attn_race_condition[97-97-192-True-dtype0]" \
"test_flash_attn.py::test_flash_attn_combine[2-3-64-dtype1]" \
"test_flash_attn.py::test_flash3_bw_compatibility"
popd

View File

@ -38,12 +38,10 @@ if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Update CMake
:: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used
call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: TODO: Move to .ci/docker/requirements-ci.txt
call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

View File

@ -15,35 +15,37 @@ if errorlevel 1 exit /b 1
if not errorlevel 0 exit /b 1
cd %TMP_DIR_WIN%\build\torch\test
:: Enable delayed variable expansion to make the list
setlocal enabledelayedexpansion
set EXE_LIST=
for /r "." %%a in (*.exe) do (
if "%%~na" == "c10_intrusive_ptr_benchmark" (
@REM NB: This is not a gtest executable file, thus couldn't be handled by
@REM pytest-cpp and is excluded from test discovery by run_test
call "%%~fa"
call :libtorch_check "%%~na" "%%~fa"
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
) else (
if "%%~na" == "verify_api_visibility" (
@REM Skip verify_api_visibility as it is a compile-level test
) else (
set EXE_LIST=!EXE_LIST! cpp/%%~na
)
)
)
goto :eof
:libtorch_check
cd %CWD%
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
:: Run python test\run_test.py on the list
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Skip verify_api_visibility as it a compile level test
if "%~1" == "verify_api_visibility" goto :eof
goto :eof
echo Running "%~2"
if "%~1" == "c10_intrusive_ptr_benchmark" (
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
call "%~2"
goto :eof
)
python test\run_test.py --cpp --verbose -i "cpp/%~1"
if errorlevel 1 (
echo %1 failed with exit code %errorlevel%
goto fail
)
if not errorlevel 0 (
echo %1 failed with exit code %errorlevel%
goto fail
)
:eof
exit /b 0
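
For readers less fluent in batch, the collection logic above amounts to the following (illustrative Python rendering only; the real runner is the .bat script): run c10_intrusive_ptr_benchmark directly since it is not a gtest binary, skip verify_api_visibility since it is a compile-level test, and hand every other executable to run_test.py --cpp in one invocation.

```python
import pathlib
import subprocess


def run_libtorch_tests(test_dir: str) -> None:
    exe_list = []
    for exe in pathlib.Path(test_dir).rglob("*.exe"):
        name = exe.stem
        if name == "c10_intrusive_ptr_benchmark":
            # Not a gtest executable, so it cannot go through pytest-cpp.
            subprocess.check_call([str(exe)])
        elif name == "verify_api_visibility":
            continue  # compile-level test, nothing to run
        else:
            exe_list.append(f"cpp/{name}")
    if exe_list:
        subprocess.check_call(
            ["python", "test/run_test.py", "--cpp", "--verbose", "-i", *exe_list]
        )
```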

View File

@ -37,8 +37,27 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
fi
# TODO: Move this to .ci/docker/requirements-ci.txt
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
# TODO: Move both of them to Windows AMI
python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
# scipy from 1.6.3 to 1.10
# expecttest from 0.1.3 to 0.3.0
# xdoctest from 1.0.2 to 1.3.0
python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.4.0
# Install parameterized
python -m pip install parameterized==0.8.1
# Install pulp for testing ilps under torch\distributed\_tools
python -m pip install pulp==2.9.0
run_tests() {
# Run nvidia-smi if available

View File

@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" (
)
IF "%BUILD_VISION%" == "" (
set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0
set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
) ELSE (
set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
)
set "CUDA_PATH=%CUDA_PATH_V128%"

View File

@ -59,9 +59,9 @@ performance-*,
-performance-enum-size,
readability-container-size-empty,
readability-delete-null-pointer,
readability-duplicate-include,
readability-duplicate-include
readability-misplaced-array-index,
readability-redundant*,
readability-redundant*
readability-simplify-subscript-expr,
readability-string-compare,
-readability-redundant-access-specifiers,

View File

@ -23,6 +23,9 @@ runs:
run: |
.github\scripts\kill_active_ssh_sessions.ps1
- name: Clean up leftover processes on non-ephemeral Windows runner
uses: pytorch/test-infra/.github/actions/cleanup-runner@main
# Cleaning up Windows workspace sometimes fails flakily with device or resource busy
# error, meaning one or more processes haven't stopped completely yet. So trying to
# retry this step several time similar to how checkout-pytorch GHA does

View File

@ -1 +1 @@
0ad9951c416d33c5da4f7a504fb162cbe62386f5
0307428d65acf5cf1a73a70a7722e076bbb83f22

View File

@ -202,7 +202,7 @@ ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=4
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
@ -297,28 +297,16 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Install build and runtime dependencies, this is needed for flashinfer install
COPY requirements/build.txt requirements/build.txt
COPY use_existing_torch.py use_existing_torch.py
RUN python3 use_existing_torch.py
RUN cat requirements/build.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
# Default mount file as placeholder, this just avoid the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision
@ -344,11 +332,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install build==1.3.0
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

View File

@ -1,14 +1,9 @@
import glob
import os
requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
if not os.path.exists(file):
print(f"!!! skipping missing {file}")
continue
print(f">>> cleaning {file}")
with open(file) as f:
lines = f.readlines()
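
The hunk cuts off right after each requirements file has been read; the rest of use_existing_torch.py is not shown here. Purely as an assumption about its intent, a continuation in this spirit would drop torch-family pins so the prebuilt wheels installed earlier in the Dockerfile are reused:

```python
import glob
import os


def strip_torch_pins(paths: list[str]) -> None:
    # Hypothetical sketch only; the real filtering logic lives in the part of
    # the file this hunk does not show.
    for path in paths:
        if not os.path.exists(path):
            print(f"!!! skipping missing {path}")
            continue
        with open(path) as f:
            lines = f.readlines()
        kept = [
            line
            for line in lines
            if not line.strip().startswith(("torch", "torchvision", "torchaudio"))
        ]
        with open(path, "w") as f:
            f.writelines(kept)


# strip_torch_pins(glob.glob("requirements/*.txt") + ["pyproject.toml"])
```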

View File

@ -0,0 +1,37 @@
boto3==1.35.42
build==1.2.2.post1
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.18.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0
networkx==2.8.7
ninja==1.10.2.4
numba==0.59.0
numpy==1.26.4
opt-einsum>=3.3
optree==0.13.0
packaging==23.1
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.5
psutil==5.9.8
pygments==2.15.0
pytest-cpp==2.3.0
pytest-flakefinder==1.1.0
pytest-rerunfailures==10.3
pytest-subtests==0.13.1
pytest-xdist==3.3.1
pytest==7.3.2
pyyaml==6.0.2
scipy==1.12.0
setuptools==78.1.1
sympy==1.13.3
tlparse==0.4.0
tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0
xdoctest==1.1.0
z3-solver==4.15.1.0

View File

@ -502,7 +502,6 @@ def perform_misc_tasks(
job_name: str,
pr_body: str,
branch: Optional[str] = None,
tag: Optional[str] = None,
) -> None:
"""
In addition to apply the filter logic, the script also does the following
@ -510,9 +509,7 @@ def perform_misc_tasks(
"""
set_output(
"keep-going",
branch == MAIN_BRANCH
or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
or check_for_setting(labels, pr_body, "keep-going"),
branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"),
)
set_output(
"ci-verbose-test-logs",
@ -637,7 +634,6 @@ def main() -> None:
job_name=args.job_name,
pr_body=pr_body if pr_body else "",
branch=args.branch,
tag=tag,
)
# Set the filtered test matrix as the output
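
The keep-going change in this hunk adds a tag-based escape hatch alongside the existing branch and label checks. A self-contained restatement of the rule, with MAIN_BRANCH and the opt-in check stubbed out since they live elsewhere in the real script:

```python
import re
from typing import Optional

MAIN_BRANCH = "main"  # assumed value; defined elsewhere in the real script
TRUNK_TAG_RE = re.compile(r"^trunk/[a-f0-9]{40}$")


def keep_going(branch: Optional[str], tag: Optional[str], opted_in: bool) -> bool:
    # Keep going when on the main branch, when the tag is trunk/<40-char sha>,
    # or when the PR opts in via label or body setting.
    return (
        branch == MAIN_BRANCH
        or bool(tag and TRUNK_TAG_RE.match(tag))
        or opted_in
    )


assert keep_going("main", None, False)
assert keep_going("feature", "trunk/" + "a" * 40, False)
assert not keep_going("feature", "v2.9.0", False)
```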

View File

@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "

View File

@ -127,6 +127,53 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
),
]
ROCM_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="manywheel",
build_variant="rocm",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["6.4"],
python_versions=["3.10"],
),
ciflow_config=CIFlowConfig(
labels={
LABEL_CIFLOW_BINARIES,
LABEL_CIFLOW_BINARIES_WHEEL,
LABEL_CIFLOW_ROCM,
},
isolated_workflow=True,
),
branches="main",
),
]
LINUX_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["13.0"],
python_versions=["3.12"],
),
branches="main",
),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
build_variant=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.RELEASE,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
),
]
WINDOWS_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
@ -212,6 +259,39 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
),
]
WINDOWS_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
build_variant=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
ciflow_config=CIFlowConfig(
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
build_variant=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
ciflow_config=CIFlowConfig(
isolated_workflow=True,
),
),
]
MACOS_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
@ -292,10 +372,23 @@ def main() -> None:
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
S390X_BINARY_BUILD_WORKFLOWS,
),
(
# Give rocm it's own workflow file
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
ROCM_SMOKE_WORKFLOWS,
),
(
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
LINUX_BINARY_SMOKE_WORKFLOWS,
),
(
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
WINDOWS_BINARY_BUILD_WORKFLOWS,
),
(
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
WINDOWS_BINARY_SMOKE_WORKFLOWS,
),
(
jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
MACOS_BINARY_BUILD_WORKFLOWS,

View File

@ -1,255 +0,0 @@
# The point of this workflow is to test that a FA3 wheel that was built based off the
# stable ABI as of torch nightly 20250830 can still run on the newer torch.
#
# This workflow is very similar to the _linux-test.yml workflow, with the following
# differences:
# 1. It is simpler (there is no test matrix)
# 2. It pulls flash-attention as a secondary repository in order to access the tests.
# Note that it does not BUILD anything from flash-attention, as we have a prebuilt
# wheel. We pull flash-attention only to run a few tests.
# 3. It runs only FA3 tests. No PyTorch tests are run.
name: linux-test-stable-fa3
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
docker-image:
required: true
type: string
description: Docker image to run in.
timeout-minutes:
required: false
type: number
default: 30
description: |
Set the maximum (in minutes) how long the workflow should take to finish
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
VLLM_TEST_HUGGING_FACE_TOKEN:
required: false
description: |
HF Auth token to test vllm
SCRIBE_GRAPHQL_ACCESS_TOKEN:
required: false
description: |
FB app token to write to scribe endpoint
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
test:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: linux.aws.h100
timeout-minutes: ${{ inputs.timeout-minutes || 30 }}
permissions:
id-token: write
contents: read
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
- name: Checkout flash-attention as a secondary repository
uses: actions/checkout@v4
with:
repository: Dao-AILab/flash-attention
path: flash-attention
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*:}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in a container runner
shell: bash
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Set Test step time
id: test-timeout
shell: bash
env:
JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
SHM_SIZE: '2g'
DOCKER_IMAGE: ${{ inputs.docker-image }}
VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ steps.get-job-id.outputs.job-id }}
run: |
set -x
TEST_COMMAND=.ci/pytorch/test_fa3_abi_stable.sh
# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
# comes from https://github.com/pytorch/test-infra/pull/6058
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"
DOCKER_SHELL_CMD=
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e GITHUB_REPOSITORY \
-e GITHUB_WORKFLOW \
-e GITHUB_JOB \
-e GITHUB_RUN_ID \
-e GITHUB_RUN_NUMBER \
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e HUGGING_FACE_HUB_TOKEN \
-e VLLM_TEST_HUGGING_FACE_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e ARTIFACTS_FILE_SUFFIX \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
${SHM_OPTS} \
--tty \
--detach \
--name="${container_name}" \
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
${DOCKER_SHELL_CMD}
)
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Collect backtraces from coredumps (if any)
if: always()
run: |
# shellcheck disable=SC2156
find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
- name: Store Core dumps on S3
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: failure()
with:
name: coredumps-fa3-stable-abi-smoke-tests
retention-days: 14
if-no-files-found: ignore
path: ./**/core.[1-9]*
- name: Upload utilization stats
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' }}
continue-on-error: true
uses: ./.github/actions/upload-utilization-stats
with:
job_id: ${{ steps.get-job-id.outputs.job-id }}
job_name: ${{ steps.get-job-id.outputs.job-name }}
workflow_name: ${{ github.workflow }}
workflow_run_id: ${{github.run_id}}
workflow_attempt: ${{github.run_attempt}}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

View File

@ -85,7 +85,7 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-python@main
with:
python-version: ${{ inputs.python-version }}
pip-requirements-file: .ci/docker/requirements-ci.txt
pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0

View File

@ -122,7 +122,7 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-python@main
with:
python-version: ${{ inputs.python-version }}
pip-requirements-file: .ci/docker/requirements-ci.txt
pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
- name: Start monitoring script
id: monitor-script

View File

@ -84,6 +84,9 @@ jobs:
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
- name: Clean up leftover processes on non-ephemeral Windows runner
uses: pytorch/test-infra/.github/actions/cleanup-runner@main
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
with:

View File

@ -77,6 +77,9 @@ jobs:
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
- name: Clean up leftover processes on non-ephemeral Windows runner
uses: pytorch/test-infra/.github/actions/cleanup-runner@main
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
with:
@ -103,6 +106,18 @@ jobs:
with:
cuda-version: ${{ inputs.cuda-version }}
# TODO: Move to a requirements.txt file for windows
- name: Install pip dependencies
uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
with:
shell: bash
timeout_minutes: 5
max_attempts: 5
retry_wait_seconds: 30
command: |
set -eu
python3 -m pip install 'xdoctest>=1.1.0'
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
@ -257,6 +272,15 @@ jobs:
shell: bash
run: python3 .github/scripts/parse_ref.py
- name: Uninstall PyTorch
if: always()
continue-on-error: true
shell: bash
run: |
# This step removes PyTorch installed by the test to give a clean slate
# to the next job
python3 -mpip uninstall -y torch
- name: Teardown Windows
uses: ./.github/actions/teardown-win
if: always()

View File

@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -335,7 +335,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -538,7 +538,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -584,7 +584,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -741,7 +741,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -787,7 +787,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -833,7 +833,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -944,7 +944,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -990,7 +990,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1036,7 +1036,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1147,7 +1147,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1193,7 +1193,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1239,7 +1239,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1350,7 +1350,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1396,7 +1396,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1442,7 +1442,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
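PYTORCH_EXTRA_INSTALL_REQUIREMENTS in the jobs above is a pipe-separated list of PEP 508 requirement strings, each pinned to an exact version and guarded by a platform_system == 'Linux' marker; the hunks in this file bump the NCCL pin to 2.28.3 for both the cu12 and cu13 stacks. A minimal sketch of how such a value can be inspected, assuming the consuming tooling splits the string on '|' (the nccl_pins helper is illustrative and not code from this repository):

# Hypothetical helper: sanity-check one PYTORCH_EXTRA_INSTALL_REQUIREMENTS
# value. Assumes the CI tooling splits the env var on "|" into individual
# PEP 508 requirement strings.
from packaging.requirements import Requirement

def nccl_pins(extra_install_requirements: str) -> dict[str, str]:
    """Return {package name: pinned version} for every NCCL entry."""
    pins = {}
    for raw in extra_install_requirements.split("|"):
        req = Requirement(raw.strip())
        if req.name.startswith("nvidia-nccl"):
            # Each entry is an exact pin such as "==2.28.3"
            pins[req.name] = next(iter(req.specifier)).version
    return pins

example = (
    "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
    "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux'"
)
assert nccl_pins(example) == {"nvidia-nccl-cu12": "2.28.3"}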

View File

@ -0,0 +1,87 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-libtorch-release
on:
push:
branches:
- main
tags:
- 'ciflow/trunk/*'
workflow_dispatch:
permissions:
id-token: write
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-libtorch-release
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
PYTORCH_ROOT: /pytorch
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
libtorch-cpu-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-cpu-shared-with-deps-release
build_environment: linux-binary-libtorch-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cpu-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cpu-shared-with-deps-release
build_environment: linux-binary-libtorch-release
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
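The concurrency group in this generated workflow keys runs on the pull-request number (falling back to the ref name), the commit SHA for branch pushes, and whether the run was dispatched manually, so a newer push cancels the in-flight libtorch build. A rough Python approximation of how the GitHub expression operators pick those parts (illustrative only; Actions is not Python and does not render falsy values exactly this way):

# Illustrative only: approximates how the concurrency group key above is
# assembled. In GitHub expressions "||" returns the first truthy operand and
# "&&" returns the second operand when the first is truthy; these helpers
# mimic that selection logic.
def gh_or(a, b):
    return a if a else b

def gh_and(a, b):
    return b if a else a

def concurrency_group(workflow, pr_number, ref_name, ref_type, sha, event_name):
    return "-".join(
        str(part)
        for part in (
            workflow,
            gh_or(pr_number, ref_name),
            gh_and(ref_type == "branch", sha),
            event_name == "workflow_dispatch",
        )
    )

# A push to main groups (and cancels) runs per commit:
print(concurrency_group(
    "linux-binary-libtorch-release", None, "main", "branch", "abc123", "push"
))  # -> linux-binary-libtorch-release-main-abc123-False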

View File

@ -0,0 +1,88 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel
on:
push:
branches:
- main
tags:
- 'ciflow/trunk/*'
workflow_dispatch:
permissions:
id-token: write
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-manywheel
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
PYTORCH_ROOT: /pytorch
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda13_0-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
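The cu130 build/test pair above spells the same CUDA release several ways: DESIRED_CUDA cu130, GPU_ARCH_VERSION "13.0", DOCKER_IMAGE_TAG_PREFIX cuda13.0, and the cuda13_0 suffix in build_name. A hypothetical helper showing how these identifiers relate for a given Python/CUDA pair (the function is illustrative and not part of generate_ci_workflows.py):

# Hypothetical naming helper: derives the identifiers used by the generated
# job above from a Python version and a CUDA version.
def binary_build_names(python_version: str, cuda_version: str) -> dict[str, str]:
    short = cuda_version.replace(".", "")              # "13.0" -> "130"
    py_tag = "py" + python_version.replace(".", "_")   # "3.12" -> "py3_12"
    return {
        "DESIRED_CUDA": f"cu{short}",                          # cu130
        "DOCKER_IMAGE_TAG_PREFIX": f"cuda{cuda_version}",      # cuda13.0
        "build_name": f"manywheel-{py_tag}-cuda{cuda_version.replace('.', '_')}",
    }

assert binary_build_names("3.12", "13.0") == {
    "DESIRED_CUDA": "cu130",
    "DOCKER_IMAGE_TAG_PREFIX": "cuda13.0",
    "build_name": "manywheel-py3_12-cuda13_0",
}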

View File

@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -721,7 +721,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@ -787,7 +787,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@ -853,7 +853,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1315,7 +1315,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@ -1381,7 +1381,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@ -1447,7 +1447,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -1909,7 +1909,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@ -1975,7 +1975,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@ -2041,7 +2041,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@ -2503,7 +2503,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@ -2569,7 +2569,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@ -2635,7 +2635,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3097,7 +3097,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@ -3163,7 +3163,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@ -3229,7 +3229,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -3691,7 +3691,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@ -3757,7 +3757,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@ -3823,7 +3823,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing

View File

@ -0,0 +1,136 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel-rocm
on:
push:
branches:
- main
tags:
- 'ciflow/binaries/*'
- 'ciflow/binaries_wheel/*'
- 'ciflow/rocm/*'
workflow_dispatch:
permissions:
id-token: write
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BINARY_ENV_FILE: /tmp/env
BUILD_ENVIRONMENT: linux-binary-manywheel-rocm
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
PYTORCH_ROOT: /pytorch
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 0
concurrency:
group: linux-binary-manywheel-rocm-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_10-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_10-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: manylinux2_28-builder
custom-tag-prefix: rocm6.4
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm

View File

@ -0,0 +1,261 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: windows-binary-libtorch-debug
on:
push:
branches:
- main
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BUILD_ENVIRONMENT: windows-binary-libtorch-debug
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows
concurrency:
group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
libtorch-cpu-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cpu-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cpu-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cpu-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1

View File

@ -0,0 +1,261 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: windows-binary-libtorch-release
on:
push:
branches:
- main
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BUILD_ENVIRONMENT: windows-binary-libtorch-release
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows
concurrency:
group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
libtorch-cpu-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cpu-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cpu-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even if the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cpu-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1

View File

@ -18,7 +18,6 @@ permissions:
contents: read
jobs:
# H100 A100 runners
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
@ -45,56 +44,3 @@ jobs:
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit
# B200 runner
opmicrobenchmark-build-b200:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build-b200
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
secrets: inherit
opmicrobenchmark-test-b200:
name: opmicrobenchmark-test-b200
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
# ROCM MI300 runner
opmicrobenchmark-build-rocm:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build-rocm
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
opmicrobenchmark-test-rocm:
name: opmicrobenchmark-test-rocm
uses: ./.github/workflows/_rocm-test.yml
needs: opmicrobenchmark-build-rocm
with:
timeout-minutes: 500
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
secrets: inherit

View File

@ -61,15 +61,3 @@ jobs:
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test:
name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test
uses: ./.github/workflows/_linux-test-stable-fa3.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm90-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }}
timeout-minutes: 30
s3-bucket: gha-artifacts
secrets: inherit

View File

@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-aarch64\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
@ -49,6 +49,5 @@ jobs:
pip install awscli==1.29.40
aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json"
# Push new viable/strict tag
cd pytorch/pytorch
git push origin "${LATEST_SHA}:refs/tags/viable/strict/${TIME}"
fi

View File

@ -42,7 +42,7 @@ jobs:
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.0 8.9 9.0'
cuda-arch-list: '8.0;8.9;9.0'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [

View File

@ -18,7 +18,6 @@ exclude_patterns = [
'torch/_inductor/autoheuristic/artifacts/**',
'scripts/**',
'test/generated_type_hints_smoketest.py',
'test/test_torchfuzz_repros.py',
# CPython tests
'test/dynamo/cpython/**',
# Tests from the NumPy test suite
@ -28,7 +27,6 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/dynamic_shapes/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [
@ -1262,7 +1260,6 @@ exclude_patterns = [
'test/test_masked.py',
'test/test_maskedtensor.py',
'test/test_matmul_cuda.py',
'test/test_scaled_matmul_cuda.py',
'test/test_meta.py',
'test/test_metal.py',
'test/test_mkl_verbose.py',

View File

@ -181,15 +181,15 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
/torch/csrc/jit/python/init.cpp @mikaylagawarecki
# CUDA and CUDA math libraries
aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/native/cudnn/ @eqy @syed-ahmed @Aidyn-A
c10/cuda @eqy @syed-ahmed @Aidyn-A
torch/cuda/ @eqy @syed-ahmed @Aidyn-A
torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A
torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A
torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
aten/src/ATen/cuda/ @eqy @syed-ahmed
aten/src/ATen/cudnn/ @eqy @syed-ahmed
aten/src/ATen/native/cuda/ @eqy @syed-ahmed
aten/src/ATen/native/cudnn/ @eqy @syed-ahmed
c10/cuda @eqy @syed-ahmed
torch/cuda/ @eqy @syed-ahmed
torch/csrc/cuda/ @eqy @syed-ahmed
torch/backends/cuda/ @eqy @syed-ahmed
torch/backends/cudnn/ @eqy @syed-ahmed
# PyTree utilities
/torch/utils/_pytree.py @XuehaiPan

View File

@ -81,7 +81,7 @@ git remote add upstream git@github.com:pytorch/pytorch.git
make setup-env
# Or run `make setup-env-cuda` for pre-built CUDA binaries
# Or run `make setup-env-rocm` for pre-built ROCm binaries
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
```
### Tips and Debugging
@ -182,36 +182,28 @@ You can use this script to check out a new nightly branch with the following:
```bash
./tools/nightly.py checkout -b my-nightly-branch
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
```
To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`:
```bash
./tools/nightly.py checkout -b my-nightly-branch --cuda
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
```
To install the nightly binaries built with ROCm, you can pass in the flag `--rocm`:
```bash
./tools/nightly.py checkout -b my-nightly-branch --rocm
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
```
You can also use this tool to pull the nightly commits into the current branch:
```bash
./tools/nightly.py pull
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
```
To create the virtual environment with a specific Python interpreter, you can
pass in the `--python` argument:
```bash
./tools/nightly.py --python /path/to/python3.12
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
./tools/nightly.py pull -p my-env
source my-env/bin/activate # or `& .\my-env\Scripts\Activate.ps1` on Windows
```
Pulling will recreate a fresh virtual environment and reinstall the development

View File

@ -50,10 +50,11 @@ RUN git submodule update --init --recursive
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=whl/nightly
# Automatically set by buildx
# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
RUN /opt/conda/bin/conda update -y -n base -c defaults conda
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}
ARG TARGETPLATFORM

View File

@ -3,7 +3,6 @@
<!-- toc -->
- [Release Compatibility Matrix](#release-compatibility-matrix)
- [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix)
- [Release Cadence](#release-cadence)
- [General Overview](#general-overview)
- [Frequently Asked Questions](#frequently-asked-questions)
@ -64,22 +63,6 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |
### PyTorch CUDA Support Matrix
For Release 2.9, PyTorch supports the following CUDA architectures:
| CUDA | architectures supported for Linux x86 and Windows builds | notes |
| --- | --- | --- |
| 12.6.3 | Maxwell(5.0), Pascal(6.0), Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0) | |
| 12.8.1 | Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0) | |
| 13.0.0 | Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0+PTX) | +PTX available on linux builds only |
| CUDA | architectures supported for Linux aarch64 builds |
| --- | --- |
| 12.6.3 | Ampere(8.0), Hopper(9.0) |
| 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0) |
| 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) |
## Release Cadence
Following is the release cadence. All future dates below are tentative. For the latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: patch releases are optional.

View File

@ -605,11 +605,6 @@ if(UNIX)
if(HAVE_MALLOC_USABLE_SIZE)
add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1)
endif(HAVE_MALLOC_USABLE_SIZE)
set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h")
CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE)
if(HAVE_POSIX_FALLOCATE)
add_definitions(-DHAVE_POSIX_FALLOCATE=1)
endif(HAVE_POSIX_FALLOCATE)
endif(UNIX)
ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC)

View File

@ -279,6 +279,42 @@ bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
// is set to deterministic setting
if (hasCUDART()) {
const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
}
return true;
}
void Context::alertCuBLASConfigNotDeterministic() const {
static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
return;
}
auto msg = c10::str(
"Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
"`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
"it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
"case, you must set an environment variable before running your PyTorch application: ",
cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
"https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
);
if (deterministicAlgorithmsWarnOnly()) {
TORCH_WARN(msg);
} else {
TORCH_CHECK(false, msg);
}
}
bool Context::benchmarkCuDNN() const {
return benchmark_cudnn;
}
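For reference, a minimal user-side sketch of satisfying the cuBLAS workspace check above, assuming a CUDA build of PyTorch; the environment variable must be set before the first cuBLAS handle is created in the process:
```python
import os

# Either ":4096:8" or ":16:8" passes checkCuBLASConfigDeterministic();
# it must be set before any cuBLAS call happens in this process.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch

torch.use_deterministic_algorithms(True)

a = torch.randn(64, 64, device="cuda")
b = torch.randn(64, 64, device="cuda")
c = a @ b  # would raise (or warn in warn_only mode) if the variable were unset
```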

View File

@ -310,7 +310,13 @@ class TORCH_API Context {
//
// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
// of the time, this should be accomplished by calling
// `at::globalContext().alertNotDeterministic().
// `at::globalContext().alertNotDeterministic()`. However, if the
// nondeterministic behavior is caused by the CuBLAS workspace
// configuration in CUDA >= 10.2,
// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
// called instead (in this case, a comment explaining why the operation is
// nondeterministic is not necessary). See below for details on these
// methods.
//
// * Have an entry in the list of nondeterministic PyTorch operations in the
// docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -334,6 +340,12 @@ class TORCH_API Context {
// Throws an error if `Context::deterministicAlgorithms()` is true
static void alertNotDeterministic(std::string_view const& caller);
// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
// ":4096:8". For more details:
// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
void alertCuBLASConfigNotDeterministic() const;
void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
const std::string& backend,
@ -417,6 +429,7 @@ class TORCH_API Context {
}
private:
static bool checkCuBLASConfigDeterministic();
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;

View File

@ -292,28 +292,6 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags,
if (ftruncate(fd, static_cast<off_t>(size)) == -1) {
TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")");
}
#ifdef HAVE_POSIX_FALLOCATE
if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) {
for (;;) {
if (posix_fallocate(fd, 0, static_cast<off_t>(size)) == 0) {
break;
}
if (errno == EINTR) {
continue;
}
if (errno == EINVAL || errno == EOPNOTSUPP) {
// the underlying filesystem does not support the operation
break;
}
TORCH_CHECK(false, "unable to allocate shared memory(shm) for file <", filename_, ">: ", c10::utils::str_error(errno), " (", errno, ")");
}
}
#endif
if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast<int64_t>(size)) {
#ifndef STRIP_ERROR_MESSAGES
int last_err = errno;

View File

@ -6,7 +6,6 @@
#include <c10/core/thread_pool.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/llvmMathExtras.h>
#include <iostream>
#include <optional>
#include <deque>
@ -52,15 +51,17 @@ namespace {
// Struct containing memory allocator summary statistics for host.
struct TORCH_API HostStats {
// COUNT: total allocations (active)
Stat active_requests;
// SUM: bytes allocated/reserved by this memory allocator. (active)
Stat active_bytes;
// COUNT: total allocations (active + free)
Stat allocations;
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
// COUNT: allocations requested by client code. Note that active
// count can be extracted by looking at current allocations
Stat allocation;
// COUNT: number of allocated segments from host memory allocation.
Stat segment;
// SUM: bytes allocated by this memory allocator. Note that active bytes
// can be extracted by looking at current bytes allocated
Stat allocated_bytes;
// SUM: bytes reserved by this memory allocator (both free and used)
Stat reserved_bytes;
// SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
DurationStat host_alloc_time;
@ -74,9 +75,6 @@ struct TORCH_API HostStats {
// COUNT: number of times cudaHostFree/cudaHostUnregister was called.
int64_t num_host_free = 0; // This is derived from segment or timing
// Count of cudaHostAlloc/cudaHostRegister per bucket
std::vector<int64_t> bucket_allocation = std::vector<int64_t>(MAX_SIZE_INDEX);
};
// Struct containing memory allocator summary statistics for host, as they
@ -84,22 +82,17 @@ struct TORCH_API HostStats {
// avoid locking the allocator while collecting stats.
struct alignas(64) HostStatsStaged {
std::mutex timing_mutex_;
// COUNT: total allocations (active + free)
// COUNT: allocations requested by client code resulting in a new segment/block allocation
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
Stat allocation;
// SUM: bytes within active memory blocks, including blocks that are
// currently in the free list.
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
Stat allocations;
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
// COUNT: number of allocations per bucket (active)
// LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
std::vector<Stat> active_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
// SUM: bytes of allocation per bucket (active)
// LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
std::vector<Stat> active_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
// COUNT: number of allocations per bucket (active + free)
// COUNT: number of allocations per bucket
// LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
// SUM: bytes of allocation per bucket (active + free)
// SUM: bytes of allocation per bucket
// LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
// SUM: time spent in cudaHostAlloc/cudaHostRegister
@ -203,17 +196,6 @@ struct CachingHostAllocatorImpl {
// background.
if (!pinned_use_background_threads()) {
process_events();
} else {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
return true;
}();
}
// Round up the allocation to the nearest power of two to improve reuse.
@ -226,6 +208,27 @@ struct CachingHostAllocatorImpl {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Check in the recently freed blocks with pending events to see if we
// can reuse them. Call get_free_block again after processing events
if (pinned_use_background_threads()) {
process_events_for_specific_size(roundSize);
block = get_free_block(roundSize);
if (block) {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
return true;
}();
}
// Slow path: if we can't allocate from the cached free list, we need
// to create a new block.
void* ptr = nullptr;
@ -275,6 +278,8 @@ struct CachingHostAllocatorImpl {
auto index = size_index(block->size_);
std::lock_guard<std::mutex> g(free_list_[index].mutex_);
free_list_[index].list_.push_back(block);
stats_.allocation_bucket_stats[index].decrease(1);
stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
} else {
// restore these events that record by used streams.
std::lock_guard<std::mutex> g(events_mutex_);
@ -334,12 +339,9 @@ struct CachingHostAllocatorImpl {
for (auto* block : blocks_to_remove) {
blocks_.erase(block);
ptr_to_block_.erase(block->ptr_);
auto index = size_index(block->size_);
free_block(block);
stats_.allocations.decrease(1);
stats_.allocation.decrease(1);
stats_.allocated_bytes.decrease(block->size_);
stats_.allocation_bucket_stats[index].decrease(1);
stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
free_block(block);
delete block;
}
}
@ -386,17 +388,16 @@ struct CachingHostAllocatorImpl {
// per bucket (we pick index 0 arbitrarily). These are also all the host
// allocations, not taking into account caching and free lists.
if (i == 0) {
stats.allocations = stats_.allocations;
stats.allocated_bytes = stats_.allocated_bytes;
stats.num_host_alloc = stats.allocations.allocated;
stats.num_host_free = stats.allocations.freed;
stats.segment = stats_.allocation;
stats.reserved_bytes = stats_.allocated_bytes;
stats.num_host_alloc = stats.segment.allocated;
stats.num_host_free = stats.segment.freed;
}
// Bucket stats need to be merged with the slow-path stats. We do this in
// a best effort manner, since we can't really replay the cached events per bucket.
add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]);
add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]);
stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated;
add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
}
// Get the timing stats
@ -420,11 +421,9 @@ struct CachingHostAllocatorImpl {
std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
if (i == 0) {
stats_.allocations.reset_accumulated();
stats_.allocation.reset_accumulated();
stats_.allocated_bytes.reset_accumulated();
}
stats_.active_bucket_stats[i].reset_accumulated();
stats_.active_bytes_bucket_stats[i].reset_accumulated();
stats_.allocation_bucket_stats[i].reset_accumulated();
stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
}
@ -447,11 +446,9 @@ struct CachingHostAllocatorImpl {
std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
if (i == 0) {
stats_.allocations.reset_peak();
stats_.allocation.reset_peak();
stats_.allocated_bytes.reset_peak();
}
stats_.active_bucket_stats[i].reset_peak();
stats_.active_bytes_bucket_stats[i].reset_peak();
stats_.allocation_bucket_stats[i].reset_peak();
stats_.allocated_bytes_bucket_stats[i].reset_peak();
}
@ -468,7 +465,7 @@ struct CachingHostAllocatorImpl {
virtual void add_allocated_block(B* block) {
std::lock_guard<std::mutex> g(blocks_mutex_);
blocks_.insert(block);
stats_.allocations.increase(1);
stats_.allocation.increase(1);
stats_.allocated_bytes.increase(block->size_);
ptr_to_block_.insert({block->ptr_, block});
@ -481,8 +478,6 @@ struct CachingHostAllocatorImpl {
std::lock_guard<std::mutex> g(free_list_[index].mutex_);
stats_.allocation_bucket_stats[index].increase(1);
stats_.allocated_bytes_bucket_stats[index].increase(size);
stats_.active_bucket_stats[index].increase(1);
stats_.active_bytes_bucket_stats[index].increase(size);
}
}
@ -493,8 +488,8 @@ struct CachingHostAllocatorImpl {
B* block = free_list_[index].list_.back();
free_list_[index].list_.pop_back();
block->allocated_ = true;
stats_.active_bucket_stats[index].increase(1);
stats_.active_bytes_bucket_stats[index].increase(size);
stats_.allocation_bucket_stats[index].increase(1);
stats_.allocated_bytes_bucket_stats[index].increase(size);
return block;
}
return nullptr;
@ -588,8 +583,8 @@ struct CachingHostAllocatorImpl {
auto index = size_index(block->size_);
std::lock_guard<std::mutex> g(free_list_[index].mutex_);
free_list_[index].list_.push_back(block);
stats_.active_bucket_stats[index].decrease(1);
stats_.active_bytes_bucket_stats[index].decrease(size);
stats_.allocation_bucket_stats[index].decrease(1);
stats_.allocated_bytes_bucket_stats[index].decrease(size);
if (size != -1) {
return;
}

View File

@ -2,7 +2,6 @@
#include <c10/core/impl/PythonDispatcherTLS.h>
#include <ATen/core/PythonFallbackKernel.h>
#include <c10/core/SafePyObject.h>
#include <ATen/record_function.h>
namespace {
@ -54,24 +53,20 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_
TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
// c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value());
// StashTLSOnEntryGuard stash_guard;
c10::impl::ExcludeDispatchKeyGuard exclude_guard(after_Python_keyset);
c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset);
const auto& schema = op.schema();
const auto num_arguments = schema.arguments().size();
// If Torch Dispatch Mode is active, use its PyInterpreter for dispatch
const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
if (mode_stack_len > 0) {
RECORD_FUNCTION("PythonDispatchMode", torch::jit::last(*stack, num_arguments));
const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack);
return;
}
RECORD_FUNCTION("PythonSubclass", torch::jit::last(*stack, num_arguments));
// Otherwise, find a PyInterpreter on a Tensor
const auto& schema = op.schema();
const auto num_arguments = schema.arguments().size();
// It is safe to dispatch on the very first Tensor with a pyobj_interpreter
// without checking the interpreters of any of the arguments, because when
// we actually run dispatch(), we will take out PyObjects in the context

View File

@ -1,32 +1,22 @@
#include <ATen/core/PythonOpRegistrationTrampoline.h>
#include <c10/core/impl/PyInterpreterHooks.h>
// TODO: delete this
namespace at::impl {
// The strategy is that all python interpreters attempt to register themselves
// as the main interpreter, but only one wins. Only that interpreter is
// allowed to interact with the C++ dispatcher. Furthermore, when we execute
// logic on that interpreter, we do so hermetically, never setting pyobj field
// on Tensor.
std::atomic<c10::impl::PyInterpreter*>
PythonOpRegistrationTrampoline::interpreter_{nullptr};
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr;
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
return PythonOpRegistrationTrampoline::interpreter_.load();
return c10::impl::getGlobalPyInterpreter();
}
bool PythonOpRegistrationTrampoline::registerInterpreter(
c10::impl::PyInterpreter* interp) {
c10::impl::PyInterpreter* expected = nullptr;
interpreter_.compare_exchange_strong(expected, interp);
if (expected != nullptr) {
// This is the second (or later) Python interpreter, which means we need
// non-trivial hermetic PyObject TLS
c10::impl::HermeticPyObjectTLS::init_state();
if (interpreter_ != nullptr) {
return false;
} else {
return true;
}
interpreter_ = interp;
return true;
}
} // namespace at::impl

View File

@ -2,19 +2,21 @@
#include <ATen/core/dispatch/Dispatcher.h>
// TODO: this can probably live in c10
// TODO: We can get rid of this
namespace at::impl {
// Manages the single Python interpreter instance for PyTorch.
class TORCH_API PythonOpRegistrationTrampoline final {
static std::atomic<c10::impl::PyInterpreter*> interpreter_;
static c10::impl::PyInterpreter* interpreter_;
public:
// Returns true if you successfully registered yourself (that means
// you are in the hot seat for doing the operator registrations!)
// Register the Python interpreter. Returns true on first registration,
// false if an interpreter was already registered.
static bool registerInterpreter(c10::impl::PyInterpreter*);
// Returns the registered interpreter via the global PyInterpreter hooks.
// Returns nullptr if no interpreter has been registered yet.
static c10::impl::PyInterpreter* getInterpreter();
};

View File

@ -173,12 +173,4 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
}
std::optional<ScalarType> TensorBase::grad_dtype() const {
return impl::GetVariableHooks()->grad_dtype(*this);
}
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
}
} // namespace at

View File

@ -930,10 +930,6 @@ public:
const TensorBase& requires_grad_(bool _requires_grad=true) const;
std::optional<ScalarType> grad_dtype() const;
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
// View Variables
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -68,8 +68,6 @@ struct TORCH_API VariableHooksInterface {
const c10::OperatorHandle& op,
c10::DispatchKeySet dispatch_keys,
torch::jit::Stack* stack) const = 0;
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
};
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);

View File

@ -357,7 +357,7 @@ IValue IValue::equals(const IValue& rhs) const {
case Tag::Enum:
return lhs.toEnumHolder()->is(*rhs.toEnumHolder());
case Tag::Uninitialized:
// Uninitialized ivalues show up in no-ops when the compiler can prove a
// Unitialized ivalues show up in no-ops when the compiler can prove a
// value will never be used. Just return false on any equality comparison.
return false;
}

View File

@ -191,10 +191,6 @@ uint32_t _getAlignment(uintptr_t address) {
#ifdef USE_ROCM
static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) {
// 0 is default value, meaning full CUs i.e. no mask
if (value == 0) {
return at::cuda::getCurrentCUDAStream();
}
static int32_t last_value = 0;
static hipStream_t stream;
if (last_value == 0) {
@ -213,15 +209,15 @@ static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) {
int32_t CUs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
// how many uint32_t do we need to cover all CUs, fill bitmask with 1
uint32_t mask_size = static_cast<uint32_t>((CUs + 32 - 1) / 32);
std::vector<uint32_t> mask(mask_size, uint32_t{0x00000000});
std::vector<uint32_t> mask(mask_size, uint32_t{0xffffffff});
// starting from lowest order bits, in 32-bit chunks
// set bits to 0 based on how many CUs to carve out
int32_t full_shifts = value / 32;
int32_t remainder = value % 32;
for (int32_t i = 0; i < full_shifts; i++) {
mask[i] = uint32_t{0xffffffff};
mask[i] = uint32_t{0x00000000};
}
mask[full_shifts] = uint32_t{0xffffffff} << (32 - remainder);
mask[full_shifts] = uint32_t{0xffffffff} << remainder;
// finally, create masked stream
AT_CUDA_CHECK(hipExtStreamCreateWithCUMask(&stream, mask_size, &mask[0]));
@ -440,6 +436,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}
globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -573,6 +570,8 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D
template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -584,6 +583,8 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -595,6 +596,8 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -608,6 +611,8 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co
template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -621,6 +626,8 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -692,6 +699,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1015,6 +1024,8 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty
template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1026,6 +1037,8 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1037,6 +1050,8 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1050,6 +1065,8 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp
template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1063,6 +1080,8 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1171,6 +1190,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -2384,6 +2404,8 @@ void trsmBatched<c10::complex<double>>(
template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2399,6 +2421,8 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2411,6 +2435,8 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2424,6 +2450,8 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

View File

@ -151,6 +151,11 @@ struct CUDACachingHostAllocatorImpl
}
bool query_event(EventPool::Event& event) override {
// Do not call cudaEventQuery if capturing is underway
if (at::cuda::currentStreamCaptureStatusMayInitCtx() !=
at::cuda::CaptureStatus::None) {
return false;
}
cudaError_t err = cudaEventQuery(*event);
if (err == cudaErrorNotReady) {
(void)cudaGetLastError(); // clear CUDA error

View File

@ -90,10 +90,6 @@ public:
allocator_->setMemoryFraction(fraction, device);
}
std::vector<HIPCachingAllocator::StreamSegmentSize> getExpandableSegmentSizes(c10::DeviceIndex device) override {
return allocator_->getExpandableSegmentSizes(device);
}
void enable(bool value) override {
allocator_->enable(value);
}

View File

@ -670,8 +670,6 @@ Tensor rrelu_with_noise_backward(
}
Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional<Generator> generator) {
TORCH_CHECK(std::isfinite(lower.to<double>()), "rrelu: lower bound must be finite, got ", lower.to<double>());
TORCH_CHECK(std::isfinite(upper.to<double>()), "rrelu: upper bound must be finite, got ", upper.to<double>());
TORCH_CHECK(lower.to<double>() <= upper.to<double>(), "Lower bound should be less than or equal to the upper bound")
auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator));

View File

@ -2801,7 +2801,6 @@ Tensor matrix_exp(const Tensor& a) {
// TODO This should be deprecated in favor of linalg_matrix_exp_differential
// in FunctionsManual.cpp
Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) {
squareCheckInputs(self, "matrix_exp_backward");
NoTF32Guard disable_tf32;
return backward_analytic_function_of_a_matrix(
self, grad,

View File

@ -2067,7 +2067,7 @@ Tensor _reshape_copy_symint(
TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors");
}
if (self.is_contiguous_or_false()) {
if (self.is_contiguous()) {
return self.view_symint(shape).clone(at::MemoryFormat::Contiguous);
} else {
return at::_unsafe_view_symint(

View File

@ -1375,7 +1375,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
&& ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|| (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty())))) {
|| (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat1,
@ -1831,37 +1831,6 @@ std::optional<c10::ScalarType> out_dtype) {
return out;
}
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
// ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
const auto batch1_sizes = batch1.sizes();
const auto batch2_sizes = batch2.sizes();
int64_t bs = batch1_sizes[0];
int64_t contraction_size = batch1_sizes[2];
int64_t res_rows = batch1_sizes[1];
int64_t res_cols = batch2_sizes[2];
std::vector<int64_t> output_size {bs, res_rows, res_cols};
TORCH_CHECK(batch2_sizes[0] == bs && batch2_sizes[1] == contraction_size,
"Expected size for first two dimensions of batch2 tensor to be: [",
bs, ", ", contraction_size, "] but got: [", batch2_sizes[0], ", ", batch2_sizes[1], "].");
TORCH_CHECK(batch1.scalar_type() == batch2.scalar_type(), "batch1 and batch2 must have the same dtype");
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
if (!is_bmm && self_baddbmm.has_value()) {
const auto& self = self_baddbmm.value();
TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
}
}
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
IntArrayRef batch1_sizes = batch1.sizes();
IntArrayRef batch2_sizes = batch2.sizes();
@ -1871,7 +1840,12 @@ Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::Sca
}
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
Scalar beta(0.0);
Scalar alpha(1.0);
{
@ -1890,7 +1864,12 @@ Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tenso
}
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
{
NoNamesGuard guard;
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1905,12 +1884,6 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy
}
Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) {
TORCH_CHECK(self.dim() == 2, "self must be a matrix, got ", self.dim(), "-D tensor");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
TORCH_CHECK(
self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same");
TORCH_CHECK(out_dtype == self.scalar_type() ||
@ -1930,14 +1903,6 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m
}
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == self.scalar_type() ||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),

View File

@ -8,6 +8,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/DynamicLibrary.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/cuda/MiscUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/LinearAlgebra.h>

View File

@ -1041,8 +1041,8 @@ std::string generate_code(
// and `extra_args` for computation call if
// extra arguments to capture runtime state are passed.
// (look at polygamma for example).
std::string extra_params;
std::string extra_args;
std::string extra_params = "";
std::string extra_args = "";
for (size_t i = 0; i < extra_args_typenames.size(); i++) {
auto type = std::string(extra_args_typenames[i]);
auto name = "extra_arg_" + std::to_string(i);
@ -1352,7 +1352,7 @@ std::string generate_reduction_code(
int vec_size,
int max_threads_codegen) {
TORCH_INTERNAL_ASSERT(desc.nInputs == 1);
TORCH_INTERNAL_ASSERT(desc.extra_args_types.empty());
TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0);
return generate_reduction_code(
desc.nOutputs,
@ -1451,7 +1451,7 @@ std::optional<std::string> get_cache_dir() {
std::string cache_dir;
char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH");
// Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user
std::string kernels_cache_dir;
std::string kernels_cache_dir = "";
if (ptkcp != nullptr) {
cache_dir = std::string(ptkcp);
} else {

View File

@ -14,6 +14,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/cuda/MiscUtils.h>
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
#include <ATen/native/cuda/linalg/MagmaUtils.h>
#include <ATen/native/cpu/zmath.h>
@ -1614,7 +1615,16 @@ static void lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& i
const auto preferred_backend = at::globalContext().linalgPreferredBackend();
#ifdef USE_LINALG_SOLVER
const auto lu_factor_cusolver = [batch_size, m, n](const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
if (m != n || (batch_size == 1 || m >= 512)) {
// In CUDA 10.2, lu_factor_looped_cusolver does not finish the computations when the input
// matrix is exactly singular. The returned pivots contain garbage. This breaks linalg.det.
// Now, batched_cublas does not handle rectangular matrices, so we still dispatch to
// looped_cusolver even if m != n.
#ifdef USE_ROCM
constexpr bool looped_correct = true;
#else
constexpr bool looped_correct = CUSOLVER_VERSION >= 11100;
#endif
if (m != n || (looped_correct && (batch_size == 1 || m >= 512))) {
lu_factor_looped_cusolver(input, pivots, infos, compute_pivots);
} else {
lu_factor_batched_cublas(input, pivots, infos, compute_pivots);
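A hedged Python restatement of the dispatch heuristic in this hunk (an illustration, not the ATen implementation): rectangular inputs always take the looped cuSOLVER path, while square inputs take it only when that path is known to be correct and the batch is small or the matrix is large; everything else goes to batched cuBLAS.
```python
def pick_lu_backend(m: int, n: int, batch_size: int, looped_correct: bool) -> str:
    # Mirrors the condition above.
    if m != n or (looped_correct and (batch_size == 1 or m >= 512)):
        return "lu_factor_looped_cusolver"
    return "lu_factor_batched_cublas"


# Example: a single 1024x1024 matrix is factored via the looped cuSOLVER path.
assert pick_lu_backend(1024, 1024, 1, True) == "lu_factor_looped_cusolver"
```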

View File

@ -14,7 +14,6 @@ struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
bool use_per_sample_weights;
idx_type_t per_sample_weights_stride;
idx_type_t num_indices;
@ -24,24 +23,3 @@ struct EmbeddingBagParams {
EmbeddingBagMode mode;
int64_t padding_idx;
};
template <typename idx_type_t = uint32_t>
struct EmbeddingBagBackwardParams {
::c10::metal::array<idx_type_t, 2> weight_grad_strides;
::c10::metal::array<idx_type_t, 2> output_grad_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
bool use_per_sample_weights;
idx_type_t per_sample_weights_stride;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};
template <typename idx_type_t = uint32_t>
struct EmbeddingBagPerSampleWeightsBackwardParams {
::c10::metal::array<idx_type_t, 2> output_grad_strides;
::c10::metal::array<idx_type_t, 2> weight_strides;
idx_type_t per_sample_weights_grad_stride;
idx_type_t feature_size;
int64_t padding_idx;
};

View File

@ -1,5 +1,4 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/atomic.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
@ -45,7 +44,6 @@ template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
bool /*use_per_sample_weights*/,
uint32_t /*per_sample_weights_index*/,
constant T* /*per_sample_weights*/,
uint32_t /*per_sample_weights_stride*/) {
@ -57,11 +55,10 @@ template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
bool use_per_sample_weights,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (use_per_sample_weights) {
if (per_sample_weights_stride) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
@ -157,7 +154,6 @@ void embedding_bag_impl(
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto use_per_sample_weights = params.use_per_sample_weights;
auto per_sample_weights_stride = params.per_sample_weights_stride;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
@ -187,11 +183,7 @@ void embedding_bag_impl(
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val,
use_per_sample_weights,
indices_idx,
per_sample_weights,
per_sample_weights_stride);
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
@ -247,208 +239,19 @@ kernel void embedding_bag(
}
}
template <EmbeddingBagMode M, typename T>
struct MaybeDivBagSize {
inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
return val;
}
};
template <typename T>
struct MaybeDivBagSize<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
return val / bag_size;
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_backward_sum_mean_impl(
constant T* output_grad,
constant I* indices,
constant I* offset2bag,
constant I* bag_size,
constant T* per_sample_weights,
device AtomicType_t<T>* weight_grad,
constant EmbeddingBagBackwardParams<uint32_t>& params,
uint tid) {
auto feature_size = params.feature_size;
auto indices_idx = tid / feature_size;
auto bag_idx = static_cast<uint32_t>(offset2bag[indices_idx]);
auto bag_size_val = bag_size[bag_idx];
auto weight_idx = indices[indices_idx];
auto padding_idx = params.padding_idx;
if (bag_size_val && weight_idx != padding_idx) {
auto feature_idx = tid % feature_size;
constant auto& weight_grad_strides = params.weight_grad_strides;
constant auto& output_grad_strides = params.output_grad_strides;
auto use_per_sample_weights = params.use_per_sample_weights;
auto per_sample_weights_stride = params.per_sample_weights_stride;
auto output_grad_val =
static_cast<opmath_t<T>>(output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]]);
opmath_t<T> weight_grad_val = MaybeDivBagSize<M, T>()(
MaybeApplyPerSampleWeight<M, T>()(
output_grad_val,
use_per_sample_weights,
indices_idx,
per_sample_weights,
per_sample_weights_stride),
static_cast<opmath_t<T>>(bag_size_val));
AtomicType<T>::atomic_add(
weight_grad,
static_cast<int32_t>(weight_idx) * weight_grad_strides[0] +
feature_idx * weight_grad_strides[1],
static_cast<T>(weight_grad_val));
}
}
template <typename T, typename I>
void embedding_bag_backward_max_impl(
constant T* output_grad,
constant I* bag_size,
constant I* max_indices,
device AtomicType_t<T>* weight_grad,
constant EmbeddingBagBackwardParams<uint32_t>& params,
uint tid) {
auto feature_size = params.feature_size;
auto bag_idx = tid / feature_size;
auto bag_size_val = bag_size[bag_idx];
if (bag_size_val) {
auto feature_idx = tid % feature_size;
constant auto& weight_grad_strides = params.weight_grad_strides;
constant auto& output_grad_strides = params.output_grad_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto output_grad_val = output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]];
auto max_index =
static_cast<uint32_t>(max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]]);
AtomicType<T>::atomic_add(
weight_grad,
max_index * weight_grad_strides[0] +
feature_idx * weight_grad_strides[1],
output_grad_val);
}
}
#define DISPATCH_BACKWARD_SUM_MEAN_IMPL(MODE) \
return embedding_bag_backward_sum_mean_impl<MODE>( \
output_grad, \
indices, \
offset2bag, \
bag_size, \
per_sample_weights, \
weight_grad, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag_backward(
constant T* output_grad [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offset2bag [[buffer(2)]],
constant I* bag_size [[buffer(3)]],
constant I* max_indices [[buffer(4)]],
constant T* per_sample_weights [[buffer(5)]],
device AtomicType_t<T>* weight_grad [[buffer(6)]],
constant EmbeddingBagBackwardParams<uint32_t>& params [[buffer(7)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
return embedding_bag_backward_max_impl(
output_grad, bag_size, max_indices, weight_grad, params, tid);
}
}
template <typename T, typename I>
kernel void embedding_bag_per_sample_weights_backward(
constant T* output_grad [[buffer(0)]],
constant T* weight [[buffer(1)]],
constant I* indices [[buffer(2)]],
constant I* offset2bag [[buffer(3)]],
device AtomicType_t<T>* per_sample_weights_grad [[buffer(4)]],
constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t>& params
[[buffer(5)]],
uint tid [[thread_position_in_grid]]) {
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto indices_idx = tid / feature_size;
auto weight_idx = indices[indices_idx];
if (weight_idx != padding_idx) {
auto feature_idx = tid % feature_size;
auto bag_idx = static_cast<uint32_t>(offset2bag[indices_idx]);
constant auto& output_grad_strides = params.output_grad_strides;
constant auto& weight_strides = params.weight_strides;
auto per_sample_weights_grad_stride = params.per_sample_weights_grad_stride;
auto weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
auto output_grad_val = output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]];
auto per_sample_weights_grad_val = static_cast<opmath_t<T>>(weight_val) *
static_cast<opmath_t<T>>(output_grad_val);
AtomicType<T>::atomic_add(
per_sample_weights_grad,
indices_idx * per_sample_weights_grad_stride,
static_cast<T>(per_sample_weights_grad_val));
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("embedding_bag_backward_" #T "_" #I)]] \
kernel void embedding_bag_backward<T, I>( \
constant T * output_grad [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offset2bag [[buffer(2)]], \
constant I * bag_size [[buffer(3)]], \
constant I * max_indices [[buffer(4)]], \
constant T * per_sample_weights [[buffer(5)]], \
device AtomicType_t<T> * weight_grad [[buffer(6)]], \
constant EmbeddingBagBackwardParams<uint32_t> & params [[buffer(7)]], \
uint tid [[thread_position_in_grid]]); \
\
template \
[[host_name("embedding_bag_per_sample_weights_backward_" #T "_" #I)]] \
kernel void embedding_bag_per_sample_weights_backward<T, I>( \
constant T * output_grad [[buffer(0)]], \
constant T * weight [[buffer(1)]], \
constant I * indices [[buffer(2)]], \
constant I * offset2bag [[buffer(3)]], \
device AtomicType_t<T> * per_sample_weights_grad [[buffer(4)]], \
constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t> & \
params [[buffer(5)]], \
uint tid [[thread_position_in_grid]]);
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);

View File

@ -13,10 +13,8 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_dense_backward_native.h>
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/_embedding_bag_per_sample_weights_backward_native.h>
#include <ATen/ops/empty.h>
#endif
@ -97,7 +95,6 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.use_per_sample_weights = use_per_sample_weights;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
@ -180,117 +177,4 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
padding_idx);
}
Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
const Tensor& indices,
const Tensor& offset2bag,
const Tensor& bag_size,
const Tensor& max_indices,
int64_t num_weights,
bool scale_grad_by_freq,
int64_t mode,
const std::optional<Tensor>& per_sample_weights_opt,
int64_t padding_idx) {
// indices and offset2bag are assumed having correct dtypes and
// contiguous here due to the checks in _embedding_bag_backward in
// EmbeddingBag.cpp.
// Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml
// for more details.
int64_t feature_size = output_grad.size(1);
auto weight_grad = at::zeros({num_weights, feature_size}, output_grad.options());
EmbeddingBagBackwardParams<uint32_t> params;
for (const auto dim : c10::irange(2)) {
params.output_grad_strides[dim] = output_grad.stride(dim);
params.weight_grad_strides[dim] = weight_grad.stride(dim);
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.use_per_sample_weights = use_per_sample_weights;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.feature_size = output_grad.size(1);
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_indices = offset2bag.numel();
auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
MPSStream* stream = getCurrentMPSStream();
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
mps::scalarToMetalTypeString(output_grad),
mps::scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(
pipeline_state, "embedding_bag", {output_grad, indices, offset2bag, bag_size});
[computeEncoder setComputePipelineState:pipeline_state];
mps::mtl_setArgs(computeEncoder,
output_grad,
indices,
offset2bag,
bag_size,
max_indices,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
weight_grad,
params);
mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::move(weight_grad);
}
Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const Tensor& offset2bag,
int64_t mode,
int64_t padding_idx) {
TORCH_INTERNAL_ASSERT(static_cast<EmbeddingBagMode>(mode) == EmbeddingBagMode::SUM);
int64_t num_indices = indices.size(0);
int64_t feature_size = output_grad.size(1);
auto per_sample_weights_grad = at::zeros({num_indices}, output_grad.options());
EmbeddingBagPerSampleWeightsBackwardParams params;
for (const auto dim : c10::irange(2)) {
params.output_grad_strides[dim] = output_grad.stride(dim);
params.weight_strides[dim] = weight.stride(dim);
}
params.per_sample_weights_grad_stride = per_sample_weights_grad.stride(0);
params.feature_size = feature_size;
params.padding_idx = padding_idx;
auto num_threads = num_indices * feature_size;
MPSStream* stream = getCurrentMPSStream();
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
mps::scalarToMetalTypeString(output_grad),
mps::scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(
pipeline_state, "embedding_bag_per_sample_weights_backward", {output_grad, weight, indices, offset2bag});
[computeEncoder setComputePipelineState:pipeline_state];
mps::mtl_setArgs(computeEncoder, output_grad, weight, indices, offset2bag, per_sample_weights_grad, params);
mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::move(per_sample_weights_grad);
}
} // namespace at::native

View File

@ -2379,7 +2379,7 @@
- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
CPU, CUDA, MPS: _embedding_bag_backward_symint
CPU, CUDA: _embedding_bag_backward_symint
- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
@ -2389,14 +2389,12 @@
dispatch:
CPU: _embedding_bag_dense_backward_cpu
CUDA: _embedding_bag_dense_backward_cuda
MPS: _embedding_bag_dense_backward_mps
autogen: _embedding_bag_dense_backward.out
- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor
dispatch:
CPU: _embedding_bag_per_sample_weights_backward_cpu
CUDA: _embedding_bag_per_sample_weights_backward_cuda
MPS: _embedding_bag_per_sample_weights_backward_mps
autogen: _embedding_bag_per_sample_weights_backward.out
- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
@ -10258,7 +10256,6 @@
structured: True
dispatch:
CPU, CUDA: all_all_out
MTIA: all_all_out_mtia
MPS: all_all_out_mps
- func: any(Tensor self) -> Tensor

View File

@ -101,9 +101,6 @@ __device__ inline bool isinf_device(float v) {
__device__ inline bool isinf_device(c10::BFloat16 v) {
return ::isinf(static_cast<float>(v));
}
__device__ inline bool isinf_device(at::Half v) {
return ::isinf(static_cast<float>(v));
}
// CUDA kernel to compute Moving Average Min/Max of the tensor.
// It uses the running_min and running_max along with averaging const, c.
@ -163,8 +160,8 @@ void _calculate_moving_average(
std::tie(x_min, x_max) = at::aminmax(x, 1);
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
AT_DISPATCH_FLOATING_TYPES_AND2(
at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
@ -184,8 +181,8 @@ void _calculate_moving_average(
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
std::tie(x_min, x_max) = at::aminmax(x);
AT_DISPATCH_FLOATING_TYPES_AND2(
at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
@ -224,8 +221,8 @@ void _calc_moving_avg_qparams_helper(
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
if (per_row_fq) {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
int num_threads = std::min(size, (int64_t)512);
@ -247,8 +244,8 @@ void _calc_moving_avg_qparams_helper(
});
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::kBFloat16, at::kHalf, x.scalar_type(), "aminmax_kernel", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(

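The dispatch change above narrows the covered dtypes: AT_DISPATCH_FLOATING_TYPES_AND instantiates the lambda for kDouble, kFloat plus the one extra type listed, so switching away from the _AND2 form drops the at::kHalf instantiation while keeping BFloat16. A small usage sketch of that macro; the function and its body are assumptions for illustration, only the macro itself is the ATen one used above:

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

// Illustrative sketch: fill a CPU tensor with ones for double, float and
// bfloat16. A half tensor would now hit the macro's "not implemented" error,
// mirroring the narrowed dispatch in the kernel above.
void fill_with_one(at::Tensor& t) {
  AT_DISPATCH_FLOATING_TYPES_AND(
      at::kBFloat16, t.scalar_type(), "fill_with_one", [&] {
        scalar_t* data = t.data_ptr<scalar_t>();
        for (int64_t i = 0; i < t.numel(); ++i) {
          data[i] = static_cast<scalar_t>(1);
        }
      });
}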
View File

@ -68,6 +68,29 @@ c10::MaybeOwned<Tensor> prepare_dense_matrix_for_cusparse(
}
}
// This function is used for old CUDA Toolkit versions that don't support the new cuSPARSE Generic API
void addmm_out_legacy(
const at::sparse_csr::SparseCsrTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
const Tensor& result) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr());
auto nnz = mat1._nnz();
auto m = mat1.size(0);
auto k = mat1.size(1);
auto n = mat2.size(1);
auto crow_indices = mat1.crow_indices().to(kInt);
auto col_indices = mat1.col_indices().to(kInt);
auto values = mat1.values();
auto mat2_ = at::native::expect_resolved_conj(mat2);
auto result_ = at::native::expect_resolved_conj(result);
at::native::s_addmm_out_csr_sparse_dense_cuda_worker(nnz, m, n, k, result, beta, *result_, alpha, crow_indices, col_indices, values, *mat2_);
if (!result.is_same(*result_)) {
result.copy_(*result_);
}
}
c10::MaybeOwned<Tensor> inline prepare_dense_vector_for_cusparse(
const Tensor& tensor) {
if (tensor.is_non_overlapping_and_dense()) {
@ -559,6 +582,9 @@ void spmm(
const Scalar& beta,
const Scalar& alpha,
const Tensor& result) {
#if !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API())
addmm_out_legacy(mat1, mat2, beta, alpha, result);
#else
c10::MaybeOwned<Tensor> result_ = prepare_dense_matrix_for_cusparse(result);
c10::MaybeOwned<Tensor> mat2_ = prepare_dense_matrix_for_cusparse(mat2);
@ -657,6 +683,7 @@ void spmm(
if (!result.is_same(*result_)) {
result.copy_(*result_);
}
#endif // !(AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API())
}
void spgemm(

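The legacy path added above routes addmm through s_addmm_out_csr_sparse_dense_cuda_worker, i.e. it computes result = beta * result + alpha * mat1_csr @ mat2 directly from the CSR triplet. A dense reference of that product for comparison, with row-major buffers and all names assumed for illustration:

#include <cstdint>
#include <vector>

// Reference sketch (illustrative only): CSR-times-dense with addmm scaling,
//   result[r, c] = beta * result[r, c] + alpha * sum_k csr[r, k] * mat2[k, c]
void csr_addmm_ref(
    int64_t m, int64_t n,
    const std::vector<int32_t>& crow_indices,  // [m + 1]
    const std::vector<int32_t>& col_indices,   // [nnz]
    const std::vector<float>& values,          // [nnz]
    const std::vector<float>& mat2,            // [k, n], row-major
    std::vector<float>& result,                // [m, n], row-major
    float beta, float alpha) {
  for (int64_t r = 0; r < m; ++r) {
    for (int64_t c = 0; c < n; ++c) {
      result[r * n + c] *= beta;
    }
    for (int32_t p = crow_indices[r]; p < crow_indices[r + 1]; ++p) {
      const int64_t k = col_indices[p];
      const float av = alpha * values[p];
      for (int64_t c = 0; c < n; ++c) {
        result[r * n + c] += av * mat2[k * n + c];
      }
    }
  }
}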
View File

@ -672,7 +672,7 @@ Tensor bmm_sparse_cuda(const SparseTensor& self, const Tensor& mat2) {
return bmm_out_sparse_cuda(self, mat2, result);
}
#if defined(USE_ROCM) || defined(CUSPARSE_VERSION)
#if defined(USE_ROCM) || !(defined(_MSC_VER) && CUSPARSE_VERSION < 11000)
__global__ void search_end_matrix_indices_cuda_kernel(
int64_t* mat_el_end_indices,
int64_t num_matrices,
@ -745,6 +745,10 @@ cudaDataType getTensorCudaDataType(Tensor self) {
#endif
Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor& result) {
#if defined(_MSC_VER) && (CUSPARSE_VERSION < 11000)
TORCH_CHECK(false, "bmm sparse-dense CUDA is not supported on Windows with cuda before 11.0");
#elif defined(USE_ROCM) || (defined(CUDART_VERSION) && (CUDART_VERSION >= 10010)) // linux cuda >= 10.1 or windows cuda >= 11.0
TORCH_CHECK(!mat2.is_sparse(), "bmm_sparse: Tensor 'mat2' must be dense");
TORCH_CHECK(self.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self.dense_dim());
TORCH_CHECK(self.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self.sparse_dim());
@ -940,6 +944,10 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
// them in column-major order in memory
result.transpose_(1,2);
#else
TORCH_CHECK(false, "bmm sparse-dense requires CUDA 10.1 or greater");
#endif
return result;
}

View File

@ -40,7 +40,7 @@
#include <thrust/iterator/discard_iterator.h>
#if defined(__CUDACC__) && (defined(CUSPARSE_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
#if defined(__CUDACC__) && ((CUSPARSE_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
#define IS_CUSPARSE11_AVAILABLE() 1
#else
#define IS_CUSPARSE11_AVAILABLE() 0
@ -689,6 +689,13 @@ void sparse_sparse_matmul_cuda_kernel(
std::is_same_v<c10::complex<double>, scalar_t>,
"sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double.");
// older versions of cusparse on Windows segfault for complex128 dtype
#if defined(_WIN32) && defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11400
TORCH_CHECK(
!(mat1.scalar_type() == ScalarType::ComplexDouble),
"Sparse multiplication with complex128 dtype inputs is not supported with current CUDA version. Please upgrade to CUDA Toolkit 11.2.1+");
#endif
Tensor mat1_indices_ = mat1._indices().contiguous();
Tensor mat1_values = mat1._values().contiguous();

View File

@ -316,12 +316,6 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
return false;
#endif
#else
if (!at::cuda::is_available()) {
if (debug) {
TORCH_WARN("flash attention requires a CUDA device, which is not available.");
}
return false;
}
auto dprops = at::cuda::getCurrentDeviceProperties();
if (!check_sm_version<sm80, sm121>(dprops)) {
if (debug) {
@ -373,12 +367,6 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
return false;
#endif
#else
if (!at::cuda::is_available()) {
if (debug) {
TORCH_WARN("Mem Efficient attention requires a CUDA device, which is not available.");
}
return false;
}
auto dprops = at::cuda::getCurrentDeviceProperties();
if (!check_sm_version<sm50, sm121>(dprops)) {
if (debug) {
@ -609,12 +597,6 @@ bool check_cudnn_layout(sdp_params const& params, bool debug) {
bool check_cudnn_hardware_support(sdp_params const& params, bool debug) {
using sm80 = SMVersion<8, 0>;
using sm121 = SMVersion<12, 1>;
if (!at::cuda::is_available()) {
if (debug) {
TORCH_WARN("cuDNN SDPA requires a CUDA device, which is not available.");
}
return false;
}
auto dprops = at::cuda::getCurrentDeviceProperties();
if (!check_sm_version<sm80, sm121>(dprops)) {
if (debug) {

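The hunks above drop the at::cuda::is_available() early-outs from the flash-attention, memory-efficient, and cuDNN SDPA checks; what remains is the check_sm_version<low, high> gate on the device's compute capability. A stand-alone sketch of that range predicate, with the struct and function names assumed for illustration:

// Illustrative only: a device qualifies when its SM version lies in an
// inclusive range, e.g. SM 8.0 through SM 12.1 for the flash-attention path.
struct SmVersion {
  int major;
  int minor;
};

inline bool sm_in_range(SmVersion dev, SmVersion lo, SmVersion hi) {
  const auto key = [](SmVersion v) { return v.major * 100 + v.minor; };
  return key(dev) >= key(lo) && key(dev) <= key(hi);
}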
View File

@ -5,7 +5,6 @@
#include <torch/csrc/profiler/orchestration/vulkan.h>
#endif // USE_KINETO
#include <algorithm>
#include <cmath>
#include <iomanip>
#include <iostream>

View File

@ -1,83 +1,10 @@
#include <gtest/gtest.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/test/allocator_clone_test.h>
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
std::unordered_map<void*, size_t> allocation_sizes;
void* logging_malloc(size_t size, int device, cudaStream_t stream) {
void* ptr;
cudaMalloc(&ptr, size);
allocation_sizes[ptr] = size;
return ptr;
}
void logging_free(void* ptr, size_t size, int device, cudaStream_t stream) {
if (allocation_sizes.find(ptr) != allocation_sizes.end()) {
if (allocation_sizes[ptr] != size) {
throw std::runtime_error("free mismatch");
}
} else {
throw std::runtime_error("free of unknown ptr");
}
cudaFree(ptr);
allocation_sizes.erase(ptr);
}
TEST(TestTorchUnique, UniqueComparisonTest) {
if (!at::cuda::is_available()) return;
auto custom_allocator =
torch::cuda::CUDAPluggableAllocator::createCustomAllocator(logging_malloc, logging_free);
torch::cuda::CUDAPluggableAllocator::changeCurrentAllocator(custom_allocator);
// Run the command 3 times; the first 2 will pass and the third invocation will have
// different sizes in alloc and free if the test fails.
for (int i = 0; i < 3; ++i) {
// Initialize simple sorted tensor with repeats
at::Tensor sorted_tensor =
at::tensor({0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 5},
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
// This operation will call malloc/free with different sizes on the same pointer
auto unique_dim_result = at::unique_consecutive(sorted_tensor, false, true, 0);
// Everything below is only there to validate correct results
auto unique_dim_values = std::get<0>(unique_dim_result);
auto unique_dim_counts = std::get<2>(unique_dim_result);
// Check tensor sizes
EXPECT_EQ(unique_dim_values.size(0), 5);
EXPECT_EQ(unique_dim_counts.size(0), 5);
// Copy to CPU before accessing elements
at::Tensor cpu_values = unique_dim_values.cpu();
at::Tensor cpu_counts = unique_dim_counts.cpu();
// Use accessors on the CPU tensors
auto values_accessor = cpu_values.accessor<float, 1>();
auto counts_accessor = cpu_counts.accessor<int64_t, 1>();
// Check individual values using accessors
EXPECT_EQ(values_accessor[0], 0.0f);
EXPECT_EQ(values_accessor[1], 1.0f);
EXPECT_EQ(values_accessor[2], 2.0f);
EXPECT_EQ(values_accessor[3], 3.0f);
EXPECT_EQ(values_accessor[4], 5.0f);
// Check count values using accessors
EXPECT_EQ(counts_accessor[0], 3);
EXPECT_EQ(counts_accessor[1], 2);
EXPECT_EQ(counts_accessor[2], 1);
EXPECT_EQ(counts_accessor[3], 4);
EXPECT_EQ(counts_accessor[4], 1);
}
}
TEST(AllocatorTestCUDA, test_clone) {
if (!at::cuda::is_available()) return;
test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
}

View File

@ -19,10 +19,10 @@ TEST(CachingHostAllocatorTest, check_stats) {
// Clear the stats and ensure they are zero.
size_t round_size = c10::llvm::PowerOf2Ceil(N);
auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
ASSERT_EQ(stats.allocations.current, 0);
ASSERT_EQ(stats.allocations.peak, 0);
ASSERT_EQ(stats.allocations.allocated, 0);
ASSERT_EQ(stats.allocations.freed, 0);
ASSERT_EQ(stats.allocation.current, 0);
ASSERT_EQ(stats.allocation.peak, 0);
ASSERT_EQ(stats.allocation.allocated, 0);
ASSERT_EQ(stats.allocation.freed, 0);
void* ptr{nullptr};
void* ctx{nullptr};
@ -32,10 +32,14 @@ TEST(CachingHostAllocatorTest, check_stats) {
ptr = pinned_tensor.data_ptr();
ctx = pinned_tensor.storage().data_ptr().get_context();
auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
ASSERT_EQ(stats.allocations.current, 1);
ASSERT_EQ(stats.allocations.peak, 1);
ASSERT_EQ(stats.allocations.allocated, 1);
// We don't track active bytes as free blocks are added in process_events
ASSERT_EQ(stats.allocation.current, 1);
ASSERT_EQ(stats.allocation.peak, 1);
ASSERT_EQ(stats.allocation.allocated, 1);
ASSERT_EQ(stats.allocation.freed, 0);
ASSERT_EQ(stats.segment.allocated, 1);
ASSERT_EQ(stats.segment.freed, 0);
ASSERT_EQ(stats.reserved_bytes.current, round_size);
ASSERT_EQ(stats.allocated_bytes.current, round_size);
ASSERT_EQ(stats.host_alloc_time.max, stats.host_alloc_time.min);
ASSERT_EQ(stats.host_free_time.total, 0);
}
@ -46,9 +50,13 @@ TEST(CachingHostAllocatorTest, check_stats) {
auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
ASSERT_EQ(ptr, pinned_tensor.data_ptr());
ASSERT_EQ(ctx, pinned_tensor.storage().data_ptr().get_context());
ASSERT_EQ(stats.allocations.current, 1);
ASSERT_EQ(stats.allocations.peak, 1);
ASSERT_EQ(stats.allocations.allocated, 1);
ASSERT_EQ(stats.allocation.current, 1);
ASSERT_EQ(stats.allocation.peak, 1);
ASSERT_EQ(stats.allocation.allocated, 2);
ASSERT_EQ(stats.allocation.freed, 1);
ASSERT_EQ(stats.segment.allocated, 1);
ASSERT_EQ(stats.segment.freed, 0);
ASSERT_EQ(stats.reserved_bytes.current, round_size);
ASSERT_EQ(stats.allocated_bytes.current, round_size);
}
// Ensure we don't reuse the allocation, due to size mismatch.
@ -60,10 +68,14 @@ TEST(CachingHostAllocatorTest, check_stats) {
auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
ASSERT_NE(ptr, pinned_tensor.data_ptr());
ASSERT_NE(ctx, pinned_tensor.storage().data_ptr().get_context());
ASSERT_EQ(stats.allocations.current, 2);
ASSERT_EQ(stats.allocations.peak, 2);
ASSERT_EQ(stats.allocations.allocated, 2);
ASSERT_EQ(stats.allocated_bytes.current, new_round_size + round_size);
ASSERT_EQ(stats.allocation.current, 1);
ASSERT_EQ(stats.allocation.peak, 2);
ASSERT_EQ(stats.allocation.allocated, 3);
ASSERT_EQ(stats.allocation.freed, 2);
ASSERT_EQ(stats.segment.allocated, 2);
ASSERT_EQ(stats.segment.freed, 0);
ASSERT_EQ(stats.reserved_bytes.current, round_size + new_round_size);
ASSERT_EQ(stats.allocated_bytes.current, new_round_size);
ASSERT_NE(stats.host_alloc_time.total, stats.host_alloc_time.min);
}
@ -71,10 +83,13 @@ TEST(CachingHostAllocatorTest, check_stats) {
{
at::getHostAllocator(at::kCUDA)->empty_cache();
auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
ASSERT_EQ(stats.allocations.current, 0);
ASSERT_EQ(stats.allocation.current, 0);
ASSERT_EQ(stats.allocated_bytes.current, 0);
ASSERT_EQ(stats.allocations.peak, 2);
ASSERT_EQ(stats.allocations.allocated, 2);
ASSERT_EQ(stats.allocation.peak, 2);
ASSERT_EQ(stats.allocation.allocated, 3);
ASSERT_EQ(stats.allocation.freed, 3);
ASSERT_EQ(stats.segment.allocated, 2);
ASSERT_EQ(stats.segment.freed, 2);
ASSERT_EQ(stats.num_host_alloc, 2);
ASSERT_EQ(stats.num_host_free, 2);
ASSERT_NE(stats.host_free_time.total, stats.host_free_time.min);
@ -85,9 +100,9 @@ TEST(CachingHostAllocatorTest, check_stats) {
at::getHostAllocator(at::kCUDA)->reset_accumulated_stats();
at::getHostAllocator(at::kCUDA)->reset_peak_stats();
auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
ASSERT_EQ(stats.allocations.peak, 0);
ASSERT_EQ(stats.allocations.allocated, 0);
ASSERT_EQ(stats.allocations.freed, 0);
ASSERT_EQ(stats.allocation.peak, 0);
ASSERT_EQ(stats.allocation.allocated, 0);
ASSERT_EQ(stats.allocation.freed, 0);
ASSERT_EQ(stats.allocated_bytes.peak, 0);
ASSERT_EQ(stats.num_host_alloc, 0);
ASSERT_EQ(stats.num_host_free, 0);

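The assertions above follow the renamed host-allocator counters: per-allocation statistics now live under allocation (current/peak/allocated/freed), pinned host segments under segment, and the byte counters keep their previous names. A minimal sketch of reading them on a CUDA build; the include path and function name here are assumptions, the accessors and field names follow the updated test:

#include <ATen/ATen.h>
#include <iostream>

// Print a few of the renamed counters exercised by the test above.
void print_pinned_host_stats() {
  auto stats = at::getHostAllocator(at::kCUDA)->get_stats();
  std::cout << "live allocations: " << stats.allocation.current
            << " (peak " << stats.allocation.peak << ")\n"
            << "segments allocated: " << stats.segment.allocated
            << ", reserved bytes: " << stats.reserved_bytes.current << "\n";
}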
View File

@ -50,7 +50,6 @@ run_if_exists cuda_complex_test
run_if_exists cuda_complex_math_test
run_if_exists cuda_cub_test
run_if_exists cuda_atomic_ops_test
run_if_exists cuda_allocator_test
if [ "$VALGRIND" == "ON" ]; then
# NB: As these tests are invoked by valgrind, let's leave them for now as it's

View File

@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
# for targets in subfolders
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
# a dictionary maps third party library name to fbsource and oss target
THIRD_PARTY_LIBS = {

View File

@ -897,7 +897,6 @@ libtorch_python_core_sources = [
"torch/csrc/Stream.cpp",
"torch/csrc/Event.cpp",
"torch/csrc/TypeInfo.cpp",
"torch/csrc/acc/Module.cpp",
"torch/csrc/api/src/python/init.cpp",
"torch/csrc/autograd/functions/init.cpp",
"torch/csrc/autograd/init.cpp",

View File

@ -1,16 +1,100 @@
#pragma once
// This is directly synchronized with caffe2/proto/caffe2.proto, but
// doesn't require me to figure out how to get Protobuf headers into
// ATen/core (which would require a lot more build system hacking.)
// If you modify me, keep me synchronized with that file.
#include <c10/macros/Export.h>
// If you modified DeviceType in caffe2/proto/caffe2.proto, please also sync
// your changes into torch/headeronly/core/DeviceType.h.
#include <torch/headeronly/core/DeviceType.h>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <ostream>
#include <string>
namespace c10 {
// These contain all device types that also have a BackendComponent
// and therefore participate in per-backend functionality dispatch keys.
// This is most backends except PrivateUse2 and PrivateUse3
#define C10_FORALL_BACKEND_DEVICE_TYPES(_, extra) \
_(CPU, extra) \
_(CUDA, extra) \
_(HIP, extra) \
_(XLA, extra) \
_(MPS, extra) \
_(IPU, extra) \
_(XPU, extra) \
_(HPU, extra) \
_(VE, extra) \
_(Lazy, extra) \
_(Meta, extra) \
_(MTIA, extra) \
_(PrivateUse1, extra)
enum class DeviceType : int8_t {
CPU = 0,
CUDA = 1, // CUDA.
MKLDNN = 2, // Reserved for explicit MKLDNN
OPENGL = 3, // OpenGL
OPENCL = 4, // OpenCL
IDEEP = 5, // IDEEP.
HIP = 6, // AMD HIP
FPGA = 7, // FPGA
MAIA = 8, // ONNX Runtime / Microsoft
XLA = 9, // XLA / TPU
Vulkan = 10, // Vulkan
Metal = 11, // Metal
XPU = 12, // XPU
MPS = 13, // MPS
Meta = 14, // Meta (tensors with no data)
HPU = 15, // HPU / HABANA
VE = 16, // SX-Aurora / NEC
Lazy = 17, // Lazy Tensors
IPU = 18, // Graphcore IPU
MTIA = 19, // Meta training and inference devices
PrivateUse1 = 20, // PrivateUse1 device
// NB: If you add more devices:
// - Change the implementations of DeviceTypeName and isValidDeviceType
// in DeviceType.cpp
// - Change the number below
COMPILE_TIME_MAX_DEVICE_TYPES = 21,
};
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kHIP = DeviceType::HIP;
constexpr DeviceType kFPGA = DeviceType::FPGA;
constexpr DeviceType kMAIA = DeviceType::MAIA;
constexpr DeviceType kXLA = DeviceType::XLA;
constexpr DeviceType kMPS = DeviceType::MPS;
constexpr DeviceType kMeta = DeviceType::Meta;
constexpr DeviceType kVulkan = DeviceType::Vulkan;
constexpr DeviceType kMetal = DeviceType::Metal;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kHPU = DeviceType::HPU;
constexpr DeviceType kVE = DeviceType::VE;
constexpr DeviceType kLazy = DeviceType::Lazy;
constexpr DeviceType kIPU = DeviceType::IPU;
constexpr DeviceType kMTIA = DeviceType::MTIA;
constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1;
// define explicit int constant
constexpr int COMPILE_TIME_MAX_DEVICE_TYPES =
static_cast<int>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
static_assert(
COMPILE_TIME_MAX_DEVICE_TYPES <= 21,
"Hey! You seem to be adding a lot of new DeviceTypes. The intent was "
"for this constant to reflect the actual number of DeviceTypes we support "
"in PyTorch; it's important that this number is not too large as we "
"use this to allocate stack arrays in some places in our code. If you "
"are indeed just adding the 20th device type, feel free to change "
"the check to 32; but if you are adding some sort of extensible device "
"types registration, please be aware that you are affecting code that "
"thinks this number is small. Try auditing uses of this constant.");
C10_API std::string DeviceTypeName(DeviceType d, bool lower_case = false);
C10_API bool isValidDeviceType(DeviceType d);
@ -24,6 +108,15 @@ C10_API bool is_privateuse1_backend_registered();
} // namespace c10
namespace std {
template <>
struct hash<c10::DeviceType> {
std::size_t operator()(c10::DeviceType k) const {
return std::hash<int>()(static_cast<int>(k));
}
};
} // namespace std
namespace torch {
// NOLINTNEXTLINE(misc-unused-using-decls)
using c10::DeviceType;

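The C10_FORALL_BACKEND_DEVICE_TYPES X-macro shown above takes a caller-supplied macro plus an extra argument and expands the pair once per backend device type. A hedged sketch of one way to consume it; the include path, helper macro, and function name are assumptions for illustration:

#include <c10/core/DeviceType.h>
#include <string>
#include <vector>

// Expand the X-macro to collect the name of every backend device type.
#define COLLECT_NAME(dev, out) out.push_back(#dev);

inline std::vector<std::string> backend_device_type_names() {
  std::vector<std::string> names;
  C10_FORALL_BACKEND_DEVICE_TYPES(COLLECT_NAME, names)
  return names;
}
#undef COLLECT_NAME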
View File

@ -9,22 +9,16 @@ std::array<
static_cast<size_t>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)>
device_guard_impl_registry;
void registerDeviceGuard(
DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
DeviceType type,
const DeviceGuardImplInterface* impl) {
device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
}
DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
DeviceType type,
const DeviceGuardImplInterface* impl) {
registerDeviceGuard(type, impl);
}
namespace {
thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
nullptr;
} // namespace
}
void ensureCUDADeviceGuardSet() {
constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);

View File

@ -368,9 +368,6 @@ inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
return p;
}
void C10_API
registerDeviceGuard(DeviceType type, const DeviceGuardImplInterface* impl);
inline bool hasDeviceGuardImpl(DeviceType type) {
return device_guard_impl_registry[static_cast<size_t>(type)].load();
}

View File

@ -1,21 +0,0 @@
#include <c10/core/impl/HermeticPyObjectTLS.h>
namespace c10::impl {
thread_local static std::atomic<bool> hermeticPyObjectState{false};
std::atomic<bool> HermeticPyObjectTLS::haveState_{false};
void HermeticPyObjectTLS::set_state(bool state) {
hermeticPyObjectState = state;
}
bool HermeticPyObjectTLS::get_tls_state() {
return hermeticPyObjectState;
}
void HermeticPyObjectTLS::init_state() {
haveState_ = true;
}
} // namespace c10::impl

View File

@ -1,62 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <atomic>
namespace c10::impl {
// This TLS controls whether or not we permanently associate PyObject
// with Tensor the first time it is allocated. When hermetic PyObject
// TLS is enabled (state is true), we DO NOT save PyObjects to Tensor,
// meaning you get a distinct PyObject whenever you execute the code in
// question.
struct C10_API HermeticPyObjectTLS {
static void set_state(bool state);
static bool get_state() {
// Hypothetical fastpath if torchdeploy/multipy // codespell:ignore multipy
// isn't used. Per
// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// this qualifies relaxed access because it is a single-location data
// structure (only the boolean here).
//
// Forgetting about data races for a moment, is there a logical race?
//
// - Boolean only ever transitions from false to true. So the
// critical situation is when one interpreter is already running
// when a second interpreter switches haveState from false to true.
//
// - The first interpreter is indifferent whether or not it sees
// hasState true/false; obviously false works (this is what the
// interpreter was previously using; more directly, the interpreter
// calls into itself as the handler, so being hermetic is not
// required), and true simply means serviced python operator calls will
// be hermetic; in these cases it is expected to be functionally
// equivalent.
//
// - The second interpreter MUST see hasState true (as its requests will
// be forwarded to the first interpreter), but it is assumed that there
// is a synchronization between the interpreter initialization, and
// when we actually perform operations, so it is guaranteed to see
// hasState true.
//
// QED.
//
// This fastpath is currently disabled so that we can more easily test that
// hermetic mode works correctly even on stock build of PyTorch.
if (false && !haveState_.load(std::memory_order_relaxed))
return false;
return get_tls_state();
}
// Call this from the multipy/torchdeploy // codespell:ignore multipy
// top level
static void init_state();
private:
// This only flipped once from false to true during
// torchdeploy/multipy initialization, // codespell:ignore multipy
// and never again.
static std::atomic<bool> haveState_;
static bool get_tls_state();
};
} // namespace c10::impl
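The deleted header's comment above argues that haveState_ can be read with relaxed ordering because it is a single boolean that only ever transitions from false to true, so there is no multi-location invariant to protect. A minimal stand-alone sketch of that one-shot flag pattern; the class and method names are assumptions, not PyTorch API:

#include <atomic>

// Illustrative only: a flag that flips once from false to true during
// initialization and is then read with relaxed ordering on hot paths.
class OneShotFlag {
 public:
  void set() { flag_.store(true, std::memory_order_relaxed); }
  bool get() const { return flag_.load(std::memory_order_relaxed); }

 private:
  std::atomic<bool> flag_{false};
};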

View File

@ -1,6 +1,5 @@
#pragma once
#include <c10/core/impl/HermeticPyObjectTLS.h>
#include <c10/core/impl/PyInterpreter.h>
#include <c10/core/impl/PyInterpreterHooks.h>
#include <c10/util/python_stub.h>
@ -42,32 +41,15 @@ struct C10_API PyObjectSlot {
PyObject* _unchecked_untagged_pyobj() const;
// Test the interpreter tag. If tagged for the current interpreter, return
// a non-nullopt (but possibly null) PyObject. If (possibly) untagged,
// returns a nullopt. If it is definitely invalid, raises an error.
//
// If `ignore_hermetic_tls` is false and this function is called from a
// hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
// nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
// context is ignored, allowing you to check the interpreter tag of a
// nonhermetic PyObject from within a hermetic context. This is necessary
// because there are some cases where the deallocator function of a
// nonhermetic PyObject is called from within a hermetic context, so it must
// be properly treated as a nonhermetic PyObject.
//
// Test the interpreter / PyObj as they may be null
// NB: this lives in header so that we can avoid actually creating the
// std::optional
// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj() const {
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
if (interpreter == nullptr || pyobj_ == nullptr) {
return std::nullopt;
}
if (c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
}
return _unchecked_untagged_pyobj();
}

View File

@ -382,7 +382,6 @@ struct ExpandableSegment {
peers_(std::move(peers)) {
cudaDeviceProp prop{};
C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_));
mapped_size_ = 0;
// we allocate enough address space for 1 1/8 the total memory on the GPU.
// This allows for some cases where we have to unmap pages earlier in the
// segment to put them at the end.
@ -494,7 +493,6 @@ struct ExpandableSegment {
return SegmentRange{range.ptr, 0};
}
unmapHandles(begin, end);
mapped_size_ -= (end - begin) * segment_size_;
return rangeFromHandles(begin, end);
}
@ -634,18 +632,6 @@ struct ExpandableSegment {
return max_handles_ * segment_size_;
}
cudaStream_t getStream() {
return *stream_;
}
size_t getMappedSize() {
return mapped_size_;
}
size_t getSegmentSize() {
return segment_size_;
}
void addPeer(c10::DeviceIndex device) {
peers_.push_back(device);
forEachAllocatedRange(
@ -680,7 +666,6 @@ struct ExpandableSegment {
handles_.at(i).value().handle,
0ULL));
}
mapped_size_ += (end - begin) * segment_size_;
setAccess(device_, begin, end);
for (auto p : peers_) {
setAccess(p, begin, end);
@ -749,7 +734,6 @@ struct ExpandableSegment {
std::optional<cudaStream_t> stream_;
CUdeviceptr ptr_{};
size_t segment_size_;
size_t mapped_size_;
size_t max_handles_;
struct Handle {
CUmemGenericAllocationHandle handle;
@ -795,17 +779,6 @@ struct ExpandableSegment {
size_t size() const {
return 0;
}
cudaStream_t getStream() {
return nullptr;
}
size_t getMappedSize() {
return 0;
}
size_t getSegmentSize() {
return 0;
}
void addPeer(c10::DeviceIndex device) {}
};
#endif
@ -1210,16 +1183,6 @@ class DeviceCachingAllocator {
// ends.
ska::flat_hash_map<Block*, std::vector<cudaGraphNode_t>> deferred_blocks;
// Incremental reverse-traversal state cached per graph.
// We never re-traverse nodes we've already seen
struct GraphReuseContext {
ska::flat_hash_map<cudaStream_t, ska::flat_hash_set<cudaGraphNode_t>>
visited;
};
ska::flat_hash_map<MempoolId_t, CaptureId_t, MempoolIdHash>
mempool_to_capture_id;
ska::flat_hash_map<CaptureId_t, GraphReuseContext> graph_reuse_context;
// outstanding cuda events
ska::flat_hash_map<
cuda::CUDAStream,
@ -1675,70 +1638,44 @@ class DeviceCachingAllocator {
return block;
}
struct CaptureInfo {
cudaGraph_t graph{};
CaptureId_t capture_id{0};
const cudaGraphNode_t* terminals{nullptr};
size_t num_terminals{0};
cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
};
inline CaptureInfo stream_get_capture_info(cudaStream_t stream) {
CaptureInfo info{};
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
stream,
&info.status,
&info.capture_id,
&info.graph,
&info.terminals,
nullptr,
&info.num_terminals));
#else
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
stream,
&info.status,
&info.capture_id,
&info.graph,
&info.terminals,
&info.num_terminals));
#endif
TORCH_INTERNAL_ASSERT(
info.status != cudaStreamCaptureStatusInvalidated,
"Invalid stream capture status");
return info;
}
// Record "free marker" of the CUDA graph for all streams that
// Insert "free marker" (empty nodes) into the CUDA graph for all streams that
// have used the block, including the allocation stream. These nodes mark the
// last use of the block in the capture graph. Returns a vector of the
// inserted nodes, or an empty vector if any stream is not capturing.
std::vector<cudaGraphNode_t> record_free_markers(Block* block) {
// It is possible to have the same marker recorded multiple times, so we use
// a set to avoid duplicates
ska::flat_hash_set<cudaGraphNode_t> markers;
cudaGraph_t owning_graph = nullptr;
std::vector<cudaGraphNode_t> insert_free_marker(Block* block) {
std::vector<cudaGraphNode_t> empty_nodes;
auto try_record = [&](cudaStream_t s) -> bool {
auto info = stream_get_capture_info(s);
if (info.status == cudaStreamCaptureStatusNone) {
return false; // not capturing on this stream -> must defer
}
auto try_add_empty_node = [&](cudaStream_t stream) -> bool {
cudaStreamCaptureStatus status{};
cudaGraph_t graph{};
const cudaGraphNode_t* deps = nullptr;
size_t num_deps = 0;
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
stream, &status, nullptr, &graph, &deps, nullptr, &num_deps));
#else
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
stream, &status, nullptr, &graph, &deps, &num_deps));
#endif
if (owning_graph == nullptr) {
owning_graph = info.graph;
}
TORCH_INTERNAL_ASSERT(
info.graph == owning_graph,
"All streams in the same capture should agree on the graph");
status != cudaStreamCaptureStatusInvalidated,
"Invalid stream capture status");
// Use current terminals as the free markers for the stream
for (size_t i = 0; i < info.num_terminals; ++i) {
auto terminal = info.terminals[i];
markers.insert(terminal);
if (status == cudaStreamCaptureStatusNone) {
return false;
}
owning_graph = info.graph; // all streams in the same capture should agree
cudaGraphNode_t node{};
C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps));
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies));
#else
C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
stream, &node, 1, cudaStreamSetCaptureDependencies));
#endif
empty_nodes.push_back(node);
return true;
};
@ -1746,34 +1683,81 @@ class DeviceCachingAllocator {
// An empty vector indicates that the block should be deferred for freeing
// until after capture.
// Allocation stream
if (!try_record(block->stream)) {
// Attempt to add an empty node for the allocation stream.
if (!try_add_empty_node(block->stream)) {
return {};
}
// Any extra streams that used this block
// Attempt to add empty nodes for all streams that have used the block.
for (const auto& s : block->stream_uses) {
if (!try_record(s.stream())) {
if (!try_add_empty_node(s.stream())) {
return {};
}
}
return std::vector<cudaGraphNode_t>(markers.begin(), markers.end());
return empty_nodes;
}
// Returns the set of "reusable" free markers in the current
// Returns the current set of "terminal" nodes in the CUDA graph for a given
// stream. These represent the current endpoints of the stream, and may
// include additional nodes if the graph branches. Any new work captured will
// be attached after one or more of these terminals.
std::vector<cudaGraphNode_t> get_terminals(cudaStream_t stream) {
std::vector<cudaGraphNode_t> result;
cudaStreamCaptureStatus status{};
cudaGraph_t graph{};
const cudaGraphNode_t* dependencies = nullptr;
size_t num_dependencies = 0;
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
stream,
&status,
nullptr,
&graph,
&dependencies,
nullptr,
&num_dependencies));
#else
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
stream, &status, nullptr, &graph, &dependencies, &num_dependencies));
#endif
TORCH_INTERNAL_ASSERT(
status == cudaStreamCaptureStatusActive,
"Invalid stream capture status");
for (size_t i = 0; i < num_dependencies; i++) {
auto node = dependencies[i];
if (node != nullptr) {
result.push_back(node);
}
}
return result;
}
// Returns the set of "reusable" free markers (empty nodes) in the current
// CUDA graph capture. A free marker is considered reusable if it is a
// predecessor of every terminal node.
// This ensures that all future captured work will occur after the free
// marker, making it safe to reuse.
void update_visited(
const CaptureInfo& info,
ska::flat_hash_set<cudaGraphNode_t>& visited) {
// This is the versioned cudaGraphNodeGetDependencies helper function.
auto node_get_dependencies =
[](cudaGraphNode_t n, cudaGraphNode_t* deps, size_t* count) -> void {
ska::flat_hash_set<cudaGraphNode_t> get_reusable_empty_nodes(
cudaStream_t stream) {
auto terminals = get_terminals(stream);
if (terminals.empty()) {
// No terminal nodes found; nothing to free.
return {};
}
auto get_dependencies = [](cudaGraphNode_t node,
cudaGraphNode_t* pDependencies,
size_t* pNumDependencies) -> void {
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaGraphNodeGetDependencies(n, deps, nullptr, count));
C10_CUDA_CHECK(cudaGraphNodeGetDependencies(
node, pDependencies, nullptr, pNumDependencies));
#else
C10_CUDA_CHECK(cudaGraphNodeGetDependencies(n, deps, count));
C10_CUDA_CHECK(
cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies));
#endif
};
@ -1781,43 +1765,62 @@ class DeviceCachingAllocator {
auto get_parents =
[&](cudaGraphNode_t node) -> std::vector<cudaGraphNode_t> {
size_t count = 0;
node_get_dependencies(node, nullptr, &count);
get_dependencies(node, nullptr, &count);
std::vector<cudaGraphNode_t> out(count);
if (count) {
node_get_dependencies(node, out.data(), &count);
get_dependencies(node, out.data(), &count);
out.resize(count);
}
return out;
};
// For each terminal node, perform a reverse DFS to count, for each free
// marker, how many terminals it can reach (i.e., for how many terminals it
// is a predecessor). A free marker is reusable if it is a predecessor of
// all terminal nodes.
std::deque<cudaGraphNode_t> dfs;
for (size_t i = 0; i < info.num_terminals; ++i) {
dfs.push_back(info.terminals[i]);
// Helper to determine if a node is an empty node (used as a free marker).
auto is_empty_node = [](cudaGraphNode_t n) -> bool {
cudaGraphNodeType type{};
C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type));
return type == cudaGraphNodeTypeEmpty;
};
// For each terminal node, perform a reverse DFS to count, for each empty
// node, how many terminals it can reach (i.e., for how many terminals it is
// a predecessor). An empty node is reusable if it is a predecessor of all
// terminal nodes.
ska::flat_hash_map<cudaGraphNode_t, size_t> num_terminals_reachable;
for (auto terminal : terminals) {
ska::flat_hash_set<cudaGraphNode_t> visited;
ska::flat_hash_set<cudaGraphNode_t> empty_nodes;
std::function<void(cudaGraphNode_t)> reverse_dfs =
[&](cudaGraphNode_t node) {
if (!visited.insert(node).second)
return;
if (is_empty_node(node)) {
num_terminals_reachable[node]++;
empty_nodes.insert(node);
}
auto parents = get_parents(node);
for (auto p : parents) {
reverse_dfs(p);
}
};
reverse_dfs(terminal);
}
while (!dfs.empty()) {
auto v = dfs.back();
dfs.pop_back();
if (visited.count(v)) {
continue;
}
visited.insert(v);
auto parents = get_parents(v);
for (auto p : parents) {
dfs.push_back(p);
ska::flat_hash_set<cudaGraphNode_t> reusable_empty_nodes;
for (auto [node, count] : num_terminals_reachable) {
if (count == terminals.size()) {
reusable_empty_nodes.insert(node);
}
}
return reusable_empty_nodes;
}
// A block is considered reusable during CUDA graph capture if every free
// marker associated with the block is a predecessor of every
// marker (empty node) associated with the block is a predecessor of every
// terminal node.
//
// This ensures that any new operation added to the graph will be attached
@ -1826,52 +1829,36 @@ class DeviceCachingAllocator {
// on every stream, so the block's previous lifetime ends before any new
// lifetime begins. This check relies solely on the DAG topology and does not
// require event queries, making it safe to use during capture.
//
// This function iterates over all deferred blocks, determines if their empty
// nodes are reusable according to the above criteria, and frees the block if
// so.
void free_safe_blocks_in_capture(
const std::shared_ptr<GatheredContext>& context,
cudaStream_t stream) {
auto info = stream_get_capture_info(stream);
auto reusable_empty_nodes = get_reusable_empty_nodes(stream);
// If there are no reusable empty nodes (e.g., not currently capturing),
// there is nothing to do.
if (info.status == cudaStreamCaptureStatusNone || info.num_terminals == 0) {
if (reusable_empty_nodes.empty()) {
return;
}
if (graph_reuse_context.find(info.capture_id) ==
graph_reuse_context.end()) {
bool found = false;
for (auto& entry : captures_underway) {
if (entry.second(stream)) {
auto graph_pool = graph_pools.find(entry.first);
TORCH_INTERNAL_ASSERT(
graph_pool != graph_pools.end(),
"Could not find graph pool for capture.");
auto mempool_id = graph_pool->first;
graph_reuse_context[info.capture_id] = GraphReuseContext{};
mempool_to_capture_id[mempool_id] = info.capture_id;
found = true;
break;
}
}
TORCH_INTERNAL_ASSERT(
found, "Could not find memory pool id for capture.");
}
auto& graph_context = graph_reuse_context[info.capture_id];
auto& visited = graph_context.visited[stream];
update_visited(info, visited);
std::vector<Block*> blocks_to_erase;
for (auto& [block, markers] : deferred_blocks) {
// Skip this block if it has no markers, as we defer its freeing until
for (auto& [block, inserted_empty_nodes] : deferred_blocks) {
// Skip this block if it has no empty nodes, as we defer its freeing until
// after graph capture. Also skip if the block was not allocated on the
// current stream; such blocks will be freed when
// free_safe_blocks_in_capture is attempted on that stream.
if (markers.empty() || block->stream != stream) {
if (inserted_empty_nodes.empty() || block->stream != stream) {
continue;
}
bool is_reusable = true;
for (auto m : markers) {
if (!visited.count(m)) {
for (const auto& node : inserted_empty_nodes) {
if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) {
is_reusable = false;
break;
}
@ -1932,11 +1919,11 @@ class DeviceCachingAllocator {
if (!block->stream_uses.empty()) {
if (C10_UNLIKELY(!captures_underway.empty())) {
if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
// record_free_markers returns a vector of free markers,
// insert_free_marker returns a vector of free markers,
// or an empty vector if any associated stream is not currently
// capturing. The empty vector means that we will defer the free until
// capture is finished.
deferred_blocks.emplace(block, record_free_markers(block));
deferred_blocks.emplace(block, insert_free_marker(block));
} else {
// If graph_capture_record_stream_reuse is not enabled, always defer
// the free until capture is finished.
@ -2038,22 +2025,6 @@ class DeviceCachingAllocator {
set_fraction = true;
}
/** get expandable segment size for all the streams on device **/
std::vector<StreamSegmentSize> getExpandableSegmentSizes() {
std::lock_guard<std::recursive_mutex> lock(mutex);
std::vector<StreamSegmentSize> sizes;
for (auto& segment : expandable_segments_) {
if (!segment->getStream()) {
continue;
}
sizes.emplace_back(
segment->getStream(),
segment->getSegmentSize() == kSmallBuffer,
segment->getMappedSize());
}
return sizes;
}
/** returns cached blocks to the system allocator **/
void emptyCache(MempoolId_t mempool_id) {
auto context = maybeGatherContext(RecordContext::ALL);
@ -2540,21 +2511,6 @@ class DeviceCachingAllocator {
// Called by CUDAGraph::capture_end
void endAllocateToPool(MempoolId_t mempool_id) {
std::lock_guard<std::recursive_mutex> lock(mutex);
if (CUDAAllocatorConfig::graph_capture_record_stream_reuse() &&
!graph_reuse_context.empty()) {
auto capture_id = mempool_to_capture_id[mempool_id];
auto graph_context = graph_reuse_context[capture_id];
for (auto& [stream, _] : graph_context.visited) {
TORCH_INTERNAL_ASSERT(
stream_get_capture_info(stream).status ==
cudaStreamCaptureStatusNone,
"This stream should not be capturing when the capture is ended");
}
graph_reuse_context.erase(capture_id);
mempool_to_capture_id.erase(mempool_id);
}
for (auto it = captures_underway.begin(); it != captures_underway.end();
++it) {
if (it->first == mempool_id) {
@ -3881,16 +3837,6 @@ class NativeCachingAllocator : public CUDAAllocator {
device_allocator[device]->setMemoryFraction(fraction);
}
std::vector<StreamSegmentSize> getExpandableSegmentSizes(
c10::DeviceIndex device) override {
TORCH_INTERNAL_ASSERT(
0 <= device && static_cast<size_t>(device) < device_allocator.size(),
"Allocator not initialized for device ",
device,
": did you call init?");
return device_allocator[device]->getExpandableSegmentSizes();
}
void recordHistory(
bool enabled,
CreateContextFn context_recorder,

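Taken together, the allocator rewrite above replaces the per-capture visited-set bookkeeping with a direct reachability test: a deferred block may be reused during capture only if every free marker (empty node) inserted for it is a predecessor of every current terminal of the capture graph. A CUDA-free sketch of that predicate on a generic DAG, with all names assumed for illustration:

#include <cstddef>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Illustrative only: walk backwards from each terminal and count how many
// terminals can reach each marker; a marker is "reusable" when that count
// equals the number of terminals, i.e. all future work is ordered after it.
using Node = int;

bool markers_precede_all_terminals(
    const std::unordered_map<Node, std::vector<Node>>& parents,  // node -> direct predecessors
    const std::vector<Node>& terminals,
    const std::unordered_set<Node>& markers) {
  std::unordered_map<Node, std::size_t> reach_count;
  for (Node t : terminals) {
    std::unordered_set<Node> visited;
    std::vector<Node> stack{t};
    while (!stack.empty()) {
      Node v = stack.back();
      stack.pop_back();
      if (!visited.insert(v).second) {
        continue;
      }
      if (markers.count(v)) {
        ++reach_count[v];
      }
      auto it = parents.find(v);
      if (it != parents.end()) {
        for (Node p : it->second) {
          stack.push_back(p);
        }
      }
    }
  }
  for (Node m : markers) {
    if (reach_count[m] != terminals.size()) {
      return false;
    }
  }
  return true;
}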
Some files were not shown because too many files have changed in this diff.