Compare commits


3 Commits

83349ae64d  [async_tp] Base support ag-transpose-mm(mat_B) case  (2025-09-19 08:35:51 -07:00)
    ghstack-source-id: edd51b9c46e46e8eca0c45e0ea53c1b26b375c01
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/163069

bf08b164dc  [async_tp] Support ag+mm with gather_dim lastdim of mat_A  (2025-09-16 04:42:16 -07:00)
    ghstack-source-id: 8de8acdc31566643d4b8370f27006002b05cdd61
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/163068

da0b6aea11  [async_tp] Support mm+rs with scatter_dim matmul K by sharding B  (2025-09-12 04:34:10 -07:00)
    ghstack-source-id: dee5390f82c6899af543adc6b91b5954097077ad
    Pull Request resolved: https://github.com/pytorch/pytorch/pull/162794
788 changed files with 4275 additions and 12808 deletions

View File

@@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

 # Set CUDA architecture lists to match x86 build_cuda.sh
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
+    export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
 elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+    export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
 elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
 fi
@@ -42,6 +42,9 @@ else
     echo "Bundling CUDA libraries with wheel for aarch64."
 else
     echo "Using nvidia libs from pypi for aarch64."
+    # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
+    # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
+    export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
     echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
     export USE_NVIDIA_PYPI_LIBS=1
 fi
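
For reference, the `${VAR//pattern/replacement}` expansion added above is bash's global substring replace. A minimal sketch of the same rewrite in Python (the requirement string here is illustrative, not taken from a real build):

    reqs = "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
    fixed = reqs.replace("platform_machine == 'x86_64'", "platform_machine == 'aarch64'")
    print(fixed)  # ...; platform_system == 'Linux' and platform_machine == 'aarch64'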

View File

@@ -138,8 +138,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
     folder = os.path.dirname(wheel_path)
     os.mkdir(f"{folder}/tmp")
     os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Delete original wheel since it will be repackaged
-    os.system(f"rm {wheel_path}")

     # Check if we should use PyPI NVIDIA libraries or bundle system libraries
     use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
@@ -213,8 +211,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
         ]

         # CUDA version-specific libraries
-        if "13" in desired_cuda:
-            minor_version = desired_cuda[-1]
+        if "130" in desired_cuda:
             version_specific_libs = [
                 "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
                 "/usr/local/cuda/lib64/libcublas.so.13",
@@ -224,7 +221,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
                 "/usr/local/cuda/lib64/libcusolver.so.12",
                 "/usr/local/cuda/lib64/libnvJitLink.so.13",
                 "/usr/local/cuda/lib64/libnvrtc.so.13",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
+                "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
             ]
         elif "12" in desired_cuda:
             # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
@@ -240,8 +237,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
                 "/usr/local/cuda/lib64/libnvrtc.so.12",
                 f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
             ]
-        else:
-            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

         # Combine all libraries
         libs_to_copy = common_libs + version_specific_libs
@@ -280,7 +275,14 @@ def complete_wheel(folder: str) -> str:
             f"/{folder}/dist/{repaired_wheel_name}",
         )
     else:
-        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
+        repaired_wheel_name = wheel_name.replace(
+            "linux_aarch64", "manylinux_2_28_aarch64"
+        )
+        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
+        os.rename(
+            f"/{folder}/dist/{wheel_name}",
+            f"/{folder}/dist/{repaired_wheel_name}",
+        )

     print(f"Copying {repaired_wheel_name} to artifacts")
     shutil.copy2(
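
The left-hand side of this diff derives the libnvrtc-builtins suffix from the last character of desired_cuda (per the in-code comment, "129" -> "9"); the right-hand side hardcodes "13.0". A small sketch of that derivation, with a hypothetical helper name:

    def nvrtc_builtins_so(desired_cuda: str) -> str:
        # hypothetical helper: "130" -> ".../libnvrtc-builtins.so.13.0",
        # "129" -> ".../libnvrtc-builtins.so.12.9"
        major, minor = desired_cuda[:-1], desired_cuda[-1]
        return f"/usr/local/cuda/lib64/libnvrtc-builtins.so.{major}.{minor}"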

View File

@@ -214,7 +214,8 @@ case "$tag" in
     TRITON=yes
     ;;
   pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
-    ANACONDA_PYTHON_VERSION=3.10
+    # TODO (huydhn): Upgrade this to Python >= 3.10
+    ANACONDA_PYTHON_VERSION=3.9
     GCC_VERSION=11
     VISION=yes
     KATEX=yes

View File

@@ -56,13 +56,9 @@ ENV INSTALLED_VISION ${VISION}

 # Install rocm
 ARG ROCM_VERSION
-RUN mkdir ci_commit_pins
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
 COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
-RUN rm install_rocm.sh common_utils.sh
-RUN rm -r ci_commit_pins
+RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
 RUN rm install_rocm_magma.sh

View File

@@ -1 +0,0 @@
-7fe50dc3da2069d6645d9deb8c017a876472a977

View File

@@ -1 +1 @@
-5ae38bdb0dc066c5823e34dc9797afb9de42c866
+fccfc522864cf8bc172abe0cd58ae5581e2d44b9

View File

@@ -2,11 +2,6 @@

 set -ex

-# for pip_install function
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"
-
 ver() {
     printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
 }
@@ -118,8 +113,6 @@ EOF
         rm -rf HIP clr
     fi

-    pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
-
     # Cleanup
     apt-get autoclean && apt-get clean
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@@ -183,8 +176,6 @@ install_centos() {
         sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
     done

-    pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
-
     # Cleanup
     yum clean all
     rm -rf /var/cache/yum

View File

@@ -52,13 +52,9 @@ ENV INSTALLED_VISION ${VISION}

 # Install rocm
 ARG ROCM_VERSION
-RUN mkdir ci_commit_pins
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
 COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
-RUN rm install_rocm.sh common_utils.sh
-RUN rm -r ci_commit_pins
+RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
 RUN rm install_rocm_magma.sh

View File

@@ -66,11 +66,6 @@ class VllmBuildParameters:
         "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
     )

-    # the cleaning script to remove torch dependencies from pip
-    cleaning_script: Path = env_path_field(
-        "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
-    )
-
     # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
     output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
@@ -165,7 +160,6 @@
         logger.info("Running vllm build with inputs: %s", inputs)
         vllm_commit = clone_vllm()

-        self.cp_torch_cleaning_script(inputs)
         self.cp_dockerfile_if_exist(inputs)
         # cp torch wheels from root direct to vllm workspace if exist
         self.cp_torch_whls_if_exist(inputs)
@@ -211,11 +205,6 @@
             copy(inputs.torch_whls_path, tmp_dir)
         return tmp_dir

-    def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
-        script = get_path(inputs.cleaning_script, resolve=True)
-        vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
-        copy(script, vllm_script)
-
     def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
         if not inputs.use_local_dockerfile:
             logger.info("using vllm default dockerfile.torch_nightly for build")

View File

@@ -11,7 +11,7 @@ from typing import Any

 from cli.lib.common.cli_helper import BaseRunner
 from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
-from cli.lib.common.path_helper import copy, get_path, remove_dir
+from cli.lib.common.path_helper import copy, remove_dir
 from cli.lib.common.pip_helper import (
     pip_install_first_match,
     pip_install_packages,
@@ -43,10 +43,6 @@ class VllmTestParameters:

     torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

-    cleaning_script: Path = env_path_field(
-        "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
-    )
-
     def __post_init__(self):
         if not self.torch_whls_path.exists():
             raise ValueError("missing torch_whls_path")
@@ -96,13 +92,11 @@
         self._set_envs(params)

         clone_vllm(dst=self.work_directory)
-        self.cp_torch_cleaning_script(params)
         with working_directory(self.work_directory):
             remove_dir(Path("vllm"))
             self._install_wheels(params)
             self._install_dependencies()
-
         # verify the torches are not overridden by test dependencies
         check_versions()

     def run(self):
@@ -131,11 +125,6 @@
         # double check the torches are not overridden by other packages
         check_versions()

-    def cp_torch_cleaning_script(self, params: VllmTestParameters):
-        script = get_path(params.cleaning_script, resolve=True)
-        vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
-        copy(script, vllm_script)
-
     def _install_wheels(self, params: VllmTestParameters):
         logger.info("Running vllm test with inputs: %s", params)
         if not pkg_exists("torch"):

View File

@@ -258,19 +258,11 @@ function install_torchrec_and_fbgemm() {
     git clone --recursive https://github.com/pytorch/fbgemm
     pushd fbgemm/fbgemm_gpu
     git checkout "${fbgemm_commit}" --recurse-submodules
-    # until the fbgemm_commit includes the tbb patch
-    patch <<'EOF'
---- a/FbgemmGpu.cmake
-+++ b/FbgemmGpu.cmake
-@@ -184,5 +184,6 @@ gpu_cpp_library(
-    fbgemm_gpu_tbe_cache
-    fbgemm_gpu_tbe_optimizers
-    fbgemm_gpu_tbe_utils
-+    tbb
-  DESTINATION
-    fbgemm_gpu)
-EOF
-    python setup.py bdist_wheel --build-variant=rocm
+    python setup.py bdist_wheel \
+        --build-variant=rocm \
+        -DHIP_ROOT_DIR="${ROCM_PATH}" \
+        -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
+        -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
     popd
     # Save the wheel before cleaning up

View File

@@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:

 def smoke_test_nvshmem() -> None:
-    if not torch.cuda.is_available() or target_os == "windows":
-        print("Windows platform or CUDA is not available, skipping NVSHMEM test")
+    if not torch.cuda.is_available():
+        print("CUDA is not available, skipping NVSHMEM test")
         return

     # Check if NVSHMEM is compiled in current build
@@ -396,9 +396,7 @@ def smoke_test_nvshmem() -> None:
     except ImportError:
         # Not built with NVSHMEM support.
         # torch is not compiled with NVSHMEM prior to 2.9
-        from torch.torch_version import TorchVersion
-
-        if TorchVersion(torch.__version__) < (2, 9):
+        if torch.__version__ < "2.9":
             return
         else:
             # After 2.9: NVSHMEM is expected to be compiled in current build
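
The TorchVersion comparison on the left-hand side matters because the plain string comparison on the right is lexicographic; a quick illustration of the difference:

    # String comparison is lexicographic, so a two-digit minor sorts wrong:
    print("2.10.0" < "2.9")                                    # True, but 2.10 is newer than 2.9
    # Comparing parsed integer components gives the intended ordering:
    print(tuple(map(int, "2.10.0".split(".")[:2])) < (2, 9))   # False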

View File

@@ -1721,6 +1721,11 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
   install_torchvision
   test_inductor_shard "${SHARD_NUMBER}"
+  if [[ "${SHARD_NUMBER}" == 1 ]]; then
+    if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
+      test_inductor_distributed
+    fi
+  fi
 elif [[ "${TEST_CONFIG}" == *einops* ]]; then
   test_einops
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then

View File

@@ -1,9 +1,9 @@
-set WIN_DRIVER_VN=580.88
-set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore
-curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe
+set WIN_DRIVER_VN=528.89
+set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore
+curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe
 if errorlevel 1 exit /b 1
-start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot
+start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot
 if errorlevel 1 exit /b 1
-del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL
+del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL

View File

@@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 # Create an isolated directory to store this builds pytorch checkout and conda
 # installation
 if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
-    MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
+    MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
 fi
 mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
 if [[ -n ${GITHUB_ACTIONS} ]]; then
@@ -96,11 +96,11 @@ fi
 whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
 mkdir -p "$whl_tmp_dir"

-mac_version='macosx-11_0-arm64'
+mac_version='macosx_11_0_arm64'
 libtorch_arch='arm64'

 # Create a consistent wheel package name to rename the wheel to
-wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"
+wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"

 ###########################################################
@@ -125,6 +125,7 @@ popd
 export TH_BINARY_BUILD=1
 export INSTALL_TEST=0 # dont install test binaries into site-packages
 export MACOSX_DEPLOYMENT_TARGET=11.0
+export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}

 EXTRA_CONDA_INSTALL_FLAGS=""
 CONDA_ENV_CREATE_FLAGS=""
@@ -132,19 +133,25 @@ RENAME_WHEEL=true
 case $desired_python in
     3.14t)
         echo "Using 3.14 deps"
-        mac_version='macosx-11.0-arm64'
         NUMPY_PINNED_VERSION="==2.1.0"
+        CONDA_ENV_CREATE_FLAGS="python-freethreading"
+        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+        desired_python="3.14.0rc1"
         RENAME_WHEEL=false
         ;;
     3.14)
         echo "Using 3.14t deps"
-        mac_version='macosx-11.0-arm64'
         NUMPY_PINNED_VERSION="==2.1.0"
+        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+        desired_python="3.14.0rc1"
         RENAME_WHEEL=false
         ;;
     3.13t)
         echo "Using 3.13 deps"
         NUMPY_PINNED_VERSION="==2.1.0"
+        CONDA_ENV_CREATE_FLAGS="python-freethreading"
+        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+        desired_python="3.13"
         RENAME_WHEEL=false
         ;;
     3.13)
@@ -169,12 +176,17 @@ case $desired_python in
         ;;
 esac

+# Install into a fresh env
+tmp_env_name="wheel_py$python_nodot"
+conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
+source activate "$tmp_env_name"
+
 PINNED_PACKAGES=(
     "numpy${NUMPY_PINNED_VERSION}"
 )
-python -mvenv ~/${desired_python}-build
-source ~/${desired_python}-build/bin/activate
-retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
+retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
+pip install requests ninja typing-extensions
+retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
 retry brew install libomp

 # For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
@@ -188,7 +200,7 @@ export BUILD_TEST=OFF

 pushd "$pytorch_rootdir"
 echo "Calling setup.py bdist_wheel at $(date)"
-_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"
+python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
 echo "Finished setup.py bdist_wheel at $(date)"

View File

@@ -73,7 +73,7 @@ exclude =
     ./docs/src,
     ./functorch/docs,
     ./functorch/examples,
-    ./functorch/docs/source/tutorials,
+    ./functorch/notebooks,
     ./scripts,
     ./test/generated_type_hints_smoketest.py,
     ./third_party,

View File

@@ -1 +1 @@
-caba63f0fa29ef9e3d566699f32f11c07c8bda4e
+27fc2493d383354a008106f22f3be232badee9a1

View File

@@ -1 +1 @@
-08ae0af1395c8d8471f4025deb6af9aef90b342f
+7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8

View File

@@ -1 +1 @@
-f510715882304796a96e33028b4f6de1b026c2c7
+e10fef08838612b4560e9c72e5cb1414a5edfa13

View File

@@ -1,17 +0,0 @@
-import glob
-
-requires_files = glob.glob("requirements/*.txt")
-requires_files += ["pyproject.toml"]
-for file in requires_files:
-    print(f">>> cleaning {file}")
-    with open(file) as f:
-        lines = f.readlines()
-    if "torch" in "".join(lines).lower():
-        print("removed:")
-        with open(file, "w") as f:
-            for line in lines:
-                if "torch" not in line.lower():
-                    f.write(line)
-    print(f"<<< done cleaning {file}")
-    print()

View File

@@ -15,7 +15,7 @@ optree==0.13.0
 packaging==23.1
 parameterized==0.8.1
 pillow==10.3.0
-protobuf==5.29.5
+protobuf==5.29.4
 psutil==5.9.8
 pygments==2.15.0
 pytest-cpp==2.3.0
@@ -26,7 +26,7 @@ pytest-xdist==3.3.1
 pytest==7.3.2
 pyyaml==6.0.2
 scipy==1.12.0
-setuptools==78.1.1
+setuptools==72.1.0
 sympy==1.13.3
 tlparse==0.4.0
 tensorboard==2.13.0

View File

@@ -39,9 +39,7 @@ def main() -> None:
     pull_request_label_names = [label.name for label in pull_request_labels]
     issue_label_names = [label.name for label in issue_labels]
     labels_to_add = [
-        label
-        for label in issue_label_names
-        if label not in pull_request_label_names and label != "actionable"
+        label for label in issue_label_names if label not in pull_request_label_names
     ]
     if not labels_to_add:
         print("The pull request already has the same labels.")

View File

@@ -43,55 +43,55 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]

 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "12.6": (
-        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
-        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
-        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
-        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
-        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
-        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
-        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
-        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
     "12.8": (
-        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
-        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
-        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
-        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
-        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
-        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
-        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
-        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
     "13.0": (
-        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
-        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
-        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
-        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
-        "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
-        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
-        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
-        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
-        "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
-        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
-        "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
+        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
     "xpu": (
         "intel-cmplr-lib-rt==2025.2.1 | "

View File

@@ -1,91 +0,0 @@
-#!/usr/bin/env bash
-set -eux
-
-torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-nightly=$(echo ${torch_version} | cut -d'.' -f4)
-
-# Copied from .ci/manywheel/build_common.sh
-make_wheel_record() {
-    fpath=$1
-    if echo $fpath | grep RECORD >/dev/null 2>&1; then
-        echo "$fpath,,"
-    else
-        fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
-        fsize=$(ls -nl $fpath | awk '{print $5}')
-        echo "$fpath,sha256=$fhash,$fsize"
-    fi
-}
-
-change_wheel_version() {
-    local package=$1
-    local wheel=$2
-    local f_version=$3
-    local t_version=$4
-
-    # Extract the wheel
-    ${PYTHON_EXECUTABLE} -mwheel unpack $wheel
-
-    mv "${package}-${f_version}" "${package}-${t_version}"
-    # Change the version from f_version to t_version in the dist-info dir
-    pushd "${package}-${t_version}"
-    mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info"
-
-    pushd "${package}-${t_version}.dist-info"
-    sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD
-
-    # Update the version in METADATA and its SHA256 hash
-    sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA
-    # then add PyTorch nightly dependency of vLLM
-    if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then
-        sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA
-    fi
-    sed -i '/METADATA,sha256/d' RECORD
-    popd
-
-    make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD"
-    popd
-
-    # Repack the wheel
-    ${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
-
-    # Clean up
-    rm -rf "${package}-${t_version}"
-}
-
-repackage_wheel() {
-    local package=$1
-    pushd $package
-
-    local orig_wheel=$(find . -name *${package//-/_}*)
-    local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-
-    local version=""
-    if [[ "${package}" == vllm ]]; then
-        # Copied from vllm/.buildkite/scripts/upload-wheels.sh
-        version=1.0.0
-    else
-        version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3)
-    fi
-    local nightly_version=$version.$nightly
-
-    # Use nightly version
-    change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version
-
-    # Clean up
-    rm "${orig_wheel}"
-
-    auditwheel repair --plat $PLATFORM *.whl \
-        --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
-    local repair_wheel=$(find wheelhouse -name *${PLATFORM}*)
-    local repair_wheel=$(basename ${repair_wheel})
-    popd
-
-    cp ${package}/wheelhouse/${repair_wheel} .
-    rm -rf $package
-}
-
-pushd externals/vllm/wheels
-for package in xformers flashinfer-python vllm; do
-    repackage_wheel $package
-done
-popd
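
The make_wheel_record helper deleted above emits wheel RECORD entries: path, unpadded URL-safe base64 SHA-256 digest, and file size, which is exactly what the openssl/sed pipeline computes. A sketch of the equivalent in Python, assuming the same inputs:

    import base64, hashlib, os

    def make_wheel_record(fpath: str) -> str:
        # The RECORD file itself is listed without a hash or size.
        if "RECORD" in fpath:
            return f"{fpath},,"
        digest = hashlib.sha256(open(fpath, "rb").read()).digest()
        fhash = base64.urlsafe_b64encode(digest).rstrip(b"=").decode()
        return f"{fpath},sha256={fhash},{os.path.getsize(fpath)}"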

View File

@@ -22,16 +22,6 @@ name: !{{ build_environment }}
   echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
 {%- endmacro %}

-{%- macro setup_python(py_ver) -%}
-  - name: Setup Python
-    uses: actions/setup-python@v6
-    with:
-      # TODO: Removeme once 3.14 is out
-      # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-      python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
-      freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
-{%- endmacro %}
-
 on:
 # TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
   push:
@@ -71,13 +61,23 @@ jobs:
 {%- endif %}
     steps:
       !{{ set_runner_specific_vars() }}
-      !{{ setup_python(config.get("python_version", "3.10")) }}
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
       !{{ common.checkout(deep_clone=False, directory="pytorch") }}
       - name: Populate binary env
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -94,6 +94,8 @@ jobs:
 {%- if config["package_type"] == "wheel" %}
       - name: Test PyTorch wheel
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
          set -eux -o pipefail
          # shellcheck disable=SC1090
          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -104,9 +106,33 @@ jobs:
           SMOKE_TEST_PARAMS=""

+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
+
           # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
           # shellcheck disable=SC2086

View File

@@ -59,6 +59,20 @@ jobs:
         run: |
           set -eux

+          # Keep PyTorch nightly wheel here so that we can install it later during
+          # vLLM build process
+          mkdir -p "${RUNNER_TEMP}/artifacts/"
+
+          container_name=$(docker run \
+            --tty \
+            --detach \
+            -e PLATFORM \
+            -v "${GITHUB_WORKSPACE}:/pytorch" \
+            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
+            -w /artifacts/ \
+            "${MANYLINUX_IMAGE}"
+          )
+
           # Determine python executable for given version (copied from build-triton-wheel)
           case $PY_VERS in
             3.10)
@@ -88,21 +102,6 @@ jobs:
               ;;
           esac

-          # Keep PyTorch nightly wheel here so that we can install it later during
-          # vLLM build process
-          mkdir -p "${RUNNER_TEMP}/artifacts/"
-
-          container_name=$(docker run \
-            --tty \
-            --detach \
-            -e PLATFORM \
-            -e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
-            -v "${GITHUB_WORKSPACE}:/pytorch" \
-            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
-            -w /artifacts/ \
-            "${MANYLINUX_IMAGE}"
-          )
-
           docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
             --pre torch torchvision torchaudio \
             --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
@@ -114,6 +113,7 @@ jobs:
             --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

           # Save this for later
+          echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV"
           echo "container_name=${container_name}" >> "$GITHUB_ENV"

       - name: Build vLLM wheel
@@ -131,7 +131,36 @@ jobs:
           set -eux

           # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh
-          docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh
+          docker exec -t "${container_name}" bash -c "
+            set -eux
+            nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4)
+
+            pushd externals/vllm/wheels
+            for package in xformers flashinfer-python vllm; do
+              pushd \$package
+              auditwheel repair --plat \$PLATFORM *.whl \
+                --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
+              repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*)
+              repair_wheel=\$(basename \${repair_wheel})
+              popd
+
+              cp \${package}/wheelhouse/\${repair_wheel} .
+              version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+
+              if [[ \$package == vllm ]]; then
+                new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly}
+              else
+                major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3)
+                new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly}
+              fi
+
+              mv -- \$repair_wheel \$new_wheel
+              rm -rf \$package
+            done
+            popd
+          "
           docker exec -t "${container_name}" chown -R 1000:1000 /artifacts

       - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874  # v4.4.0
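
The \${repair_wheel/\$version/...} expansion in the inline script is bash's first-match replace, used to splice the nightly suffix into the wheel filename. A Python sketch of the same renaming (the filenames and versions below are illustrative):

    repair_wheel = "xformers-0.0.32+5d4b92a5.d20250910-cp39-abi3-manylinux_2_28_x86_64.whl"
    version = "0.0.32+5d4b92a5.d20250910"
    nightly = "dev20250910"   # fourth dot-field of the torch nightly version
    # tr '.+' '.' | cut -d'.' -f1-3
    major_version = ".".join(version.replace("+", ".").split(".")[:3])
    new_wheel = repair_wheel.replace(version, f"{major_version}.{nightly}", 1)
    print(new_wheel)  # xformers-0.0.32.dev20250910-cp39-abi3-manylinux_2_28_x86_64.whl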

View File

@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6 build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8 build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0 build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -335,7 +335,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6 build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8 build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0 build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -538,7 +538,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -584,7 +584,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -630,7 +630,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -741,7 +741,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -787,7 +787,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -833,7 +833,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -944,7 +944,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -990,7 +990,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1036,7 +1036,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1147,7 +1147,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1193,7 +1193,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1239,7 +1239,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1350,7 +1350,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1396,7 +1396,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1442,7 +1442,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
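All of the jobs above carry the same |-delimited requirement string. Whatever consumes PYTORCH_EXTRA_INSTALL_REQUIREMENTS has to split it into individual PEP 508 requirements before they reach pip; a minimal sketch under the assumption that '|' is a plain separator (the helper name is hypothetical, not taken from the build scripts):

import os

from packaging.requirements import Requirement

def installable_requirements(spec: str) -> list[str]:
    # Split the '|'-delimited string and keep only the entries whose
    # environment marker matches the current interpreter's platform.
    parts = [p.strip() for p in spec.split("|") if p.strip()]
    reqs = [Requirement(p) for p in parts]
    return [str(r) for r in reqs if r.marker is None or r.marker.evaluate()]

spec = os.environ.get("PYTORCH_EXTRA_INSTALL_REQUIREMENTS", "")
for req in installable_requirements(spec):
    print(req)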

View File

@@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing

View File

@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8 build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
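A second change rides along in the cu12 jobs: the nvidia-nvshmem-cu12 pin moves from 3.3.24 down to 3.3.20, while the cu13 jobs keep nvidia-nvshmem-cu13==3.3.24. One way to confirm which pin a built environment actually resolved, using only stock pip commands (pip index is still marked experimental but has shipped since pip 21.2):

    pip show nvidia-nvshmem-cu12 | grep -i '^Version'   # expect: Version: 3.3.20
    pip index versions nvidia-nvshmem-cu12              # list the pins available on PyPI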
manywheel-py3_11-cuda12_8-test: # Testing manywheel-py3_11-cuda12_8-test: # Testing
@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0 build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing manywheel-py3_11-cuda13_0-test: # Testing
@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6 build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing manywheel-py3_12-cuda12_6-test: # Testing
@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8 build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing manywheel-py3_12-cuda12_8-test: # Testing
@ -1443,7 +1443,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0 build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing manywheel-py3_12-cuda13_0-test: # Testing
@ -1903,7 +1903,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6 build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing manywheel-py3_13-cuda12_6-test: # Testing
@ -1969,7 +1969,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8 build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing manywheel-py3_13-cuda12_8-test: # Testing
@ -2035,7 +2035,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0 build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing manywheel-py3_13-cuda13_0-test: # Testing
@ -2495,7 +2495,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6 build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing manywheel-py3_13t-cuda12_6-test: # Testing
@ -2561,7 +2561,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8 build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing manywheel-py3_13t-cuda12_8-test: # Testing
@ -2627,7 +2627,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0 build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing manywheel-py3_13t-cuda13_0-test: # Testing
@ -3087,7 +3087,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6 build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing manywheel-py3_14-cuda12_6-test: # Testing
@ -3153,7 +3153,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8 build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing manywheel-py3_14-cuda12_8-test: # Testing
@ -3219,7 +3219,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0 build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing manywheel-py3_14-cuda13_0-test: # Testing
@ -3679,7 +3679,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6 build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing manywheel-py3_14t-cuda12_6-test: # Testing
@ -3745,7 +3745,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8 build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing manywheel-py3_14t-cuda12_8-test: # Testing
@ -3811,7 +3811,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0 build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing manywheel-py3_14t-cuda13_0-test: # Testing
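The | characters in PYTORCH_EXTRA_INSTALL_REQUIREMENTS appear to delimit individual requirement strings rather than belonging to any one requirement; assuming that reading, a small bash sketch for inspecting the value one entry per line (the variable name comes from the workflow above, the splitting itself is an illustration only):

    # Hypothetical inspection helper: split the |-delimited value for review,
    # assuming '|' is purely a delimiter as the workflow lines suggest.
    echo "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS}" | tr '|' '\n' | sed 's/^ *//'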


@ -60,13 +60,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129 # shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python - name: Install conda and dependencies
uses: actions/setup-python@v6 run: |
with: # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
# TODO: Removeme once 3.14 is out curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 chmod +x "${RUNNER_TEMP}/conda.sh"
python-version: "3.10.4" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
freethreaded: false echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch - name: Checkout PyTorch
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
@ -81,9 +81,13 @@ jobs:
working-directory: pytorch working-directory: pytorch
- name: Populate binary env - name: Populate binary env
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary - name: Build PyTorch binary
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail set -eux -o pipefail
# shellcheck disable=SC1090 # shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
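The macOS hunks above replace the actions/setup-python step with a scripted Miniconda install, then prepend source "${RUNNER_TEMP}/anaconda/bin/activate" to each later run: block. The repetition is deliberate: every run: block in GitHub Actions starts a fresh shell, so an activation done in one step does not carry into the next; only entries appended to $GITHUB_PATH persist across steps. Condensed, the pattern from the diff is:

    # Bootstrap step (runs once):
    curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" \
      "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
    /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
    echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"   # PATH entry survives into later steps

    # Start of every subsequent step (each one is a new shell):
    source "${RUNNER_TEMP}/anaconda/bin/activate"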


@ -56,13 +56,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129 # shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python - name: Install conda and dependencies
uses: actions/setup-python@v6 run: |
with: # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
# TODO: Removeme once 3.14 is out curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 chmod +x "${RUNNER_TEMP}/conda.sh"
python-version: "3.10.4" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
freethreaded: false echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch - name: Checkout PyTorch
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
@ -77,9 +77,13 @@ jobs:
working-directory: pytorch working-directory: pytorch
- name: Populate binary env - name: Populate binary env
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary - name: Build PyTorch binary
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail set -eux -o pipefail
# shellcheck disable=SC1090 # shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -95,6 +99,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel - name: Test PyTorch wheel
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail set -eux -o pipefail
# shellcheck disable=SC1090 # shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -105,9 +111,33 @@ jobs:
SMOKE_TEST_PARAMS="" SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086 # shellcheck disable=SC2086
python -mvenv test_venv conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
source test_venv/bin/activate conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086 # shellcheck disable=SC2086
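The new test block maps DESIRED_PYTHON onto conda flags: a trailing t selects the free-threaded (no-GIL) interpreter via the python-freethreading package, and the 3.14 variants pull release candidates from the conda-forge/label/python_rc channel. Expanding the case arms above for DESIRED_PYTHON=3.14t, the conda create line becomes:

    conda create -yn "test_conda_env" python="3.14.0rc1" python-freethreading \
      -c conda-forge/label/python_rc -c conda-forge
    conda activate test_conda_env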
@ -166,13 +196,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129 # shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python - name: Install conda and dependencies
uses: actions/setup-python@v6 run: |
with: # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
# TODO: Removeme once 3.14 is out curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 chmod +x "${RUNNER_TEMP}/conda.sh"
python-version: "3.11.4" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
freethreaded: false echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch - name: Checkout PyTorch
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
@ -187,9 +217,13 @@ jobs:
working-directory: pytorch working-directory: pytorch
- name: Populate binary env - name: Populate binary env
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary - name: Build PyTorch binary
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail set -eux -o pipefail
# shellcheck disable=SC1090 # shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -205,6 +239,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel - name: Test PyTorch wheel
run: | run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail set -eux -o pipefail
# shellcheck disable=SC1090 # shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -215,9 +251,33 @@ jobs:
SMOKE_TEST_PARAMS="" SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086 # shellcheck disable=SC2086
python -mvenv test_venv conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
source test_venv/bin/activate conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086 # shellcheck disable=SC2086
@@ -276,13 +336,13 @@ jobs:
           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
           # shellcheck disable=SC2129
           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          # TODO: Removeme once 3.14 is out
-          # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "3.12.4"
-          freethreaded: false
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -297,9 +357,13 @@ jobs:
         working-directory: pytorch
       - name: Populate binary env
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -315,6 +379,8 @@ jobs:
           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
       - name: Test PyTorch wheel
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -325,9 +391,33 @@ jobs:
           SMOKE_TEST_PARAMS=""
+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
           # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
           # shellcheck disable=SC2086
@@ -386,13 +476,13 @@ jobs:
           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
           # shellcheck disable=SC2129
           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          # TODO: Removeme once 3.14 is out
-          # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "3.13.4"
-          freethreaded: false
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -407,9 +497,13 @@ jobs:
         working-directory: pytorch
       - name: Populate binary env
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -425,6 +519,8 @@ jobs:
           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
       - name: Test PyTorch wheel
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -435,9 +531,33 @@ jobs:
           SMOKE_TEST_PARAMS=""
+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
           # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
           # shellcheck disable=SC2086
@@ -496,13 +616,13 @@ jobs:
           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
           # shellcheck disable=SC2129
           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          # TODO: Removeme once 3.14 is out
-          # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "3.13.4"
-          freethreaded: true
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -517,9 +637,13 @@ jobs:
         working-directory: pytorch
       - name: Populate binary env
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -535,6 +659,8 @@ jobs:
           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
       - name: Test PyTorch wheel
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -545,9 +671,33 @@ jobs:
           SMOKE_TEST_PARAMS=""
+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
           # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
           # shellcheck disable=SC2086
@@ -606,13 +756,13 @@ jobs:
           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
           # shellcheck disable=SC2129
           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          # TODO: Removeme once 3.14 is out
-          # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "3.14.0-rc.2"
-          freethreaded: false
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -627,9 +777,13 @@ jobs:
         working-directory: pytorch
       - name: Populate binary env
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -645,6 +799,8 @@ jobs:
           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
       - name: Test PyTorch wheel
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -655,9 +811,33 @@ jobs:
           SMOKE_TEST_PARAMS=""
+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
           # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
           # shellcheck disable=SC2086
@@ -716,13 +896,13 @@ jobs:
           echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
           # shellcheck disable=SC2129
           echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          # TODO: Removeme once 3.14 is out
-          # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-          python-version: "3.14.0-rc.2"
-          freethreaded: true
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -737,9 +917,13 @@ jobs:
         working-directory: pytorch
       - name: Populate binary env
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -755,6 +939,8 @@ jobs:
           "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
       - name: Test PyTorch wheel
         run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
           set -eux -o pipefail
           # shellcheck disable=SC1090
           source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -765,9 +951,33 @@ jobs:
           SMOKE_TEST_PARAMS=""
+          EXTRA_CONDA_INSTALL_FLAGS=""
+          CONDA_ENV_CREATE_FLAGS=""
+          # shellcheck disable=SC2153
+          case $DESIRED_PYTHON in
+            3.14t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.14)
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
+              desired_python="3.14.0rc1"
+              ;;
+            3.13t)
+              CONDA_ENV_CREATE_FLAGS="python-freethreading"
+              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
+              desired_python="3.13"
+              ;;
+            *)
+              # shellcheck disable=SC2153
+              desired_python=${DESIRED_PYTHON}
+              ;;
+          esac
           # shellcheck disable=SC2086
-          python -mvenv test_venv
-          source test_venv/bin/activate
+          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          conda activate test_conda_env
           pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
           # shellcheck disable=SC2086
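
The five jobs above repeat one template; as a reading aid, here is a rough Python model of how the case statements resolve DESIRED_PYTHON into a `conda create` invocation. It mirrors the workflow rather than defining it, and the helper name is hypothetical.

```python
# Hypothetical helper mirroring the case statement in the test step above.
SPECS = {
    "3.14t": ("3.14.0rc1", ["python-freethreading"],
              ["-c", "conda-forge/label/python_rc", "-c", "conda-forge"]),
    "3.14":  ("3.14.0rc1", [],
              ["-c", "conda-forge/label/python_rc", "-c", "conda-forge"]),
    "3.13t": ("3.13", ["python-freethreading"], ["-c", "conda-forge"]),
}

def conda_create_cmd(desired_python: str) -> str:
    version, extra_pkgs, channels = SPECS.get(desired_python,
                                              (desired_python, [], []))
    return " ".join(["conda", "create", "-yn", "test_conda_env",
                     f"python={version}", *extra_pkgs, *channels])

print(conda_create_cmd("3.14t"))  # rc build pulled from the python_rc channel
print(conda_create_cmd("3.12"))   # default arm: plain python=3.12
```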

View File

@@ -37,7 +37,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-default-label-prefix
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
       test-matrix: |
@@ -56,7 +56,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: nightly-dynamo-benchmarks-build
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
       test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
       timeout-minutes: 720

View File

@@ -75,7 +75,7 @@ jobs:
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -101,7 +101,7 @@ jobs:
     needs: inductor-build
     if: github.event.schedule == '0 7 * * *'
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@@ -118,7 +118,7 @@ jobs:
     needs: inductor-build
     if: github.event_name == 'workflow_dispatch'
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@@ -80,7 +80,7 @@ jobs:
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -107,7 +107,7 @@ jobs:
     needs: inductor-build
     if: github.event.schedule == '0 7 * * *'
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@@ -124,7 +124,7 @@ jobs:
     needs: inductor-build
     if: github.event_name == 'workflow_dispatch'
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@@ -154,7 +154,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-default-label-prefix
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
       test-matrix: |
@@ -200,7 +200,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: periodic-dynamo-benchmarks-cpu-build
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
       test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
     secrets: inherit

View File

@@ -110,7 +110,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
@@ -127,7 +127,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-cpu-build
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
     secrets: inherit

View File

@@ -79,7 +79,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
@@ -101,7 +101,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-cpu-build
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
     secrets: inherit

View File

@@ -54,7 +54,7 @@ jobs:
       - get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.10-gcc11
+      build-environment: linux-jammy-py3.9-gcc11
       docker-image: ${{ needs.docs-build.outputs.docker-image }}
       push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
       run-doxygen: true

View File

@@ -14,10 +14,6 @@ on:
   schedule:
     # Run at 07:00 UTC every Sunday
     - cron: 0 7 * * 0
-  pull_request:
-    paths:
-      - benchmarks/operator_benchmark/**
-      - .github/workflows/operator_benchmark.yml
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -33,7 +29,7 @@ jobs:
     name: opbenchmark-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -46,7 +42,7 @@ jobs:
     name: opbenchmark-on-demand-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -59,7 +55,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: opbenchmark-build
     with:
-      build-environment: linux-jammy-py3.10-gcc11-build
+      build-environment: linux-jammy-py3.9-gcc11-build
       docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
     secrets: inherit

View File

@@ -240,7 +240,7 @@ jobs:
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.10-gcc11
+      build-environment: linux-jammy-py3.9-gcc11
       docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
       test-matrix: |
         { include: [
@@ -255,7 +255,7 @@ jobs:
       - verify-cachebench-cpu-build
       - target-determination
     with:
-      build-environment: linux-jammy-py3.10-gcc11
+      build-environment: linux-jammy-py3.9-gcc11
       docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
       test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
     secrets: inherit

View File

@@ -13,7 +13,7 @@ exclude_patterns = [
     '**/fb/**',
     'functorch/docs/**',
     'functorch/examples/**',
-    'functorch/docs/source/tutorials/**',
+    'functorch/notebooks/**',
     'torch/_inductor/fx_passes/serialized_patterns/**',
     'torch/_inductor/autoheuristic/artifacts/**',
     'scripts/**',
@@ -1568,6 +1568,7 @@ include_patterns = [
 exclude_patterns = [
     'caffe2/**',
     'functorch/docs/**',
+    'functorch/notebooks/**',
     'torch/_inductor/fx_passes/serialized_patterns/**',
     'torch/_inductor/autoheuristic/artifacts/**',
     'test/dynamo/cpython/**',

View File

@@ -810,7 +810,7 @@ cc_library(
     name = "torch_python",
     srcs = libtorch_python_core_sources
         + if_cuda(libtorch_python_cuda_sources)
-        + libtorch_python_distributed_sources
+        + if_cuda(libtorch_python_distributed_sources)
        + GENERATED_AUTOGRAD_PYTHON,
     hdrs = glob([
         "torch/csrc/generic/*.cpp",

View File

@@ -234,7 +234,6 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on"
 option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
 option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
-option(USE_LSAN "Use Leak Sanitizer" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
 option(USE_XPU "Use XPU" ON)
@@ -874,7 +873,7 @@
     "Whether to build the flash_attention kernel for scaled dot product attention.\
     Will be disabled if not supported by the platform"
     ON
-    "USE_CUDA OR USE_ROCM"
+    "USE_CUDA OR USE_ROCM;NOT MSVC"
     OFF)
 cmake_dependent_option(
@@ -890,9 +889,9 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
   set(USE_FBGEMM_GENAI off)
 endif()
-# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
-if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
-  message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
+# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100
+if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a")
+  message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100")
   set(USE_FBGEMM_GENAI ON)
 endif()
@@ -909,7 +908,7 @@ cmake_dependent_option(
 # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
 #
 if(USE_ROCM)
-  if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
+  if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
     include(cmake/External/aotriton.cmake)
   endif()
 endif()
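
One subtlety in the FBGEMM_GENAI hunk: CMake's MATCHES performs a regex search, so the two guards accept different TORCH_CUDA_ARCH_LIST values. A quick Python approximation, illustrative only (`.` is a wildcard in both regex dialects):

```python
import re

# "10.0" matches any list containing 10.0 or 10.0a; "10.0a" is stricter.
for arch_list in ("8.0;9.0;10.0", "10.0a", "9.0;12.0"):
    print(arch_list,
          bool(re.search("10.0", arch_list)),    # guard on the left side
          bool(re.search("10.0a", arch_list)))   # guard on the right side
```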

View File

@@ -50,7 +50,6 @@ Following is the Release Compatibility Matrix for PyTorch releases:
 | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
 | --- | --- | --- | --- | --- | --- |
-| 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 |
 | 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
 | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
 | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |

View File

@@ -16,8 +16,6 @@ However, if you believe you have found a security vulnerability in PyTorch, we e
 Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
-All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
 Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
 https://www.facebook.com/whitehat

View File

@@ -265,14 +265,6 @@ IF(USE_FBGEMM_GENAI)
     "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
   list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX})
-  # PyTorch is not built for 10.0a in CI, due to lack of portability,
-  # so we need to explicitly build these files for 10.0a.
-  foreach(cu_file ${fbgemm_genai_native_cuda_cu})
-    _BUILD_FOR_ADDITIONAL_ARCHS(
-      "${cu_file}"
-      "100a")
-  endforeach()
   file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp
     "${FBGEMM_GENAI_SRCS}/common/*.cpp"
   )

View File

@@ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
         "resize_ called on tensor with symbolic shape")
     TORCH_CHECK(
         sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
-        "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
-        size.size(),
-        ", sparse_dim = ",
+        "number of dimensions must be sparse_dim (",
         sparse_dim,
-        ", dense_dim = ",
-        dense_dim);
+        ") + dense_dim (",
+        dense_dim,
+        "), but got ",
+        size.size());
     if (nnz() > 0) {
       [[maybe_unused]] auto constexpr alt_options_msg =
           "You could try the following options:\n\
@@ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
         "resize_and_clear_ called on tensor with symbolic shape")
     TORCH_CHECK(
         sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
-        "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
-        size.size(),
-        ", sparse_dim = ",
+        "number of dimensions must be sparse_dim (",
        sparse_dim,
-        ", dense_dim = ",
-        dense_dim);
+        ") + dense_dim (",
+        dense_dim,
+        "), but got ",
+        size.size());
     set_sizes_and_strides(size, std::vector<int64_t>(size.size()));
     sparse_dim_ = sparse_dim;
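
For readers unfamiliar with the invariant these checks enforce: a COO sparse tensor's rank splits into leading sparse dimensions and trailing dense dimensions, and any resize must keep `sparse_dim + dense_dim == len(size)`. A minimal sketch:

```python
import torch

i = torch.tensor([[0, 1]])    # indices: (sparse_dim=1, nnz=2)
v = torch.randn(2, 3)         # values: (nnz=2, one dense dim of size 3)
t = torch.sparse_coo_tensor(i, v, (4, 3))
print(t.sparse_dim(), t.dense_dim())   # 1 1, and 1 + 1 == len((4, 3))

# A rank-3 target with sparse_dim=1, dense_dim=1 violates the invariant
# and would trip the TORCH_CHECK shown in the hunk above:
# t.sparse_resize_((4, 3, 2), 1, 1)
```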

View File

@@ -644,8 +644,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
   void * beta_ptr = &fbeta;
 #ifdef USE_ROCM
   int flag = 0;
-  rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
-  rocblas_datatype d_type = c_type;
 #if USE_GEMM_FLAGS_FP16_ALT_IMPL
   flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
 #endif
@@ -654,8 +652,8 @@
       hipOperationToRocOperation(opb), (int)m, (int)n, (int)k,
       (void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea,
       b, rocblas_datatype_f16_r, (int)ldb, strideb,
-      (void*)beta_ptr, c, c_type, (int)ldc, stridec,
-      c, d_type, (int)ldc, stridec,
+      (void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec,
+      c, rocblas_datatype_f16_r, (int)ldc, stridec,
       (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard,
       0, flag)));
 #else
@@ -1098,8 +1096,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
   GEMM_CHECK_ARGVALUES(at::Half);
 #ifdef USE_ROCM
   int flag = 0;
-  rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
-  rocblas_datatype d_type = c_type;
 #if USE_GEMM_FLAGS_FP16_ALT_IMPL
   flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
 #endif
@@ -1119,10 +1115,10 @@
       ldb,
       beta_ptr,
       c,
-      c_type,
+      rocblas_datatype_f16_r,
       ldc,
       c,
-      d_type,
+      rocblas_datatype_f16_r,
       ldc,
       rocblas_datatype_f32_r,
       rocblas_gemm_algo_standard,

View File

@@ -45,24 +45,6 @@ struct OffsetCalculator {
   C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
     offset_type offsets;
-#if defined(USE_ROCM)
-    if ((dims > 0) && (dims <= 2)) {
-      auto divmod = sizes_[0].divmod(linear_idx);
-      #pragma unroll
-      for (int arg = 0; arg < NARGS; arg++)
-        offsets[arg] = divmod.mod * strides_[0][arg];
-      if (dims >= 2) {
-        divmod = sizes_[1].divmod(divmod.div);
-        #pragma unroll
-        for (int arg = 0; arg < NARGS; arg++)
-          offsets[arg] += divmod.mod * strides_[1][arg];
-      }
-      // [...]
-      return offsets;
-    }
-#endif
     #pragma unroll
     for (int arg = 0; arg < NARGS; arg++) {
       offsets[arg] = 0;
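
As context for the deleted fast path: `OffsetCalculator::get` peels one dimension at a time with divmod (innermost dimension first) and accumulates each operand's stride contribution; the removed block merely unrolled the first two dimensions by hand. A rough Python model of the general loop (names illustrative, not the real API):

```python
def get_offsets(linear_idx, sizes, strides_per_arg):
    # Decompose a linear index into per-dim coordinates, innermost dim first,
    # then dot the coordinates with each operand's strides.
    offsets = [0] * len(strides_per_arg)
    for dim, size in enumerate(sizes):
        linear_idx, coord = divmod(linear_idx, size)
        for arg, strides in enumerate(strides_per_arg):
            offsets[arg] += coord * strides[dim]
    return offsets

# Element 5 of a 4x3 iteration space, two operands with different strides:
print(get_offsets(5, [4, 3], [[1, 4], [3, 1]]))   # [5, 4]
```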

View File

@@ -457,9 +457,24 @@ void gemm(
     return;
   }
 #endif
+  // for the fallback path, first compute gemm with beta = 0,
+  // and then add c in full precision.
+  int64_t c_size = n * m;
+  std::vector<float> float_c(c_size, 0.f);
   gemm_no_downcast_stub(
       at::kCPU, at::kBFloat16,
-      transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
+  for (const auto j : c10::irange(n)) {
+    for (const auto i : c10::irange(m)) {
+      auto offset = j * ldc + i;
+      // beta == 0 won't propagate NaN from C
+      if (beta == 0.f) {
+        c[offset] = float_c[j * m + i];
+      } else {
+        c[offset] = beta * c[offset] + float_c[j * m + i];
+      }
+    }
+  }
 }
 void gemm(
@@ -478,9 +493,24 @@ void gemm(
     return;
   }
 #endif
+  // for the fallback path, first compute gemm with beta = 0,
+  // and then add c in full precision.
+  int64_t c_size = n * m;
+  std::vector<float> float_c(c_size, 0.f);
   gemm_no_downcast_stub(
       at::kCPU, at::kHalf,
-      transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
+  for (const auto j : c10::irange(n)) {
+    for (const auto i : c10::irange(m)) {
+      auto offset = j * ldc + i;
+      // beta == 0 won't propagate NaN from C
+      if (beta == 0.f) {
+        c[offset] = float_c[j * m + i];
+      } else {
+        c[offset] = beta * c[offset] + float_c[j * m + i];
+      }
+    }
+  }
 }
 void gemm(
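
The `beta == 0` special case exists because BLAS semantics require C to be ignored, not scaled, when beta is zero; otherwise a NaN in an uninitialized output buffer would poison the result. A NumPy illustration of the distinction:

```python
import numpy as np

a = np.random.rand(2, 3).astype(np.float32)
b = np.random.rand(3, 2).astype(np.float32)
c = np.full((2, 2), np.nan, dtype=np.float32)  # e.g. uninitialized output

beta = 0.0
acc = a @ b                                    # gemm computed with beta = 0
out = acc if beta == 0.0 else beta * c + acc   # naive beta * c would be NaN
assert not np.isnan(out).any()
```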

View File

@@ -1360,8 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
 #endif
-#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
-// Used by default on x86 platforms and on AArch64+ACL
+#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
 static inline int64_t get_mkldnn_matmul_min_dim() {
   static auto value = [&] {
     const int64_t default_min_dim = [&] {
@@ -1396,6 +1395,8 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
   return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
 }
 #endif
 static void addmm_impl_cpu_(
     Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
   TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
@@ -1771,8 +1772,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
     return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
            (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
   };
-#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
-  // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
+#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
   bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
   if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
     try {
@@ -1784,6 +1785,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
     }
   }
 #endif
   if (contraction_size * res_rows * res_cols < 400) {
     if (is_bmm_out) {
       AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] {
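
For orientation, the heuristic referenced above gates the mkldnn/ACL matmul on every dimension clearing a minimum and on the total work being large enough. A sketch with placeholder thresholds (the real values come from environment-tunable defaults):

```python
MIN_DIM, MIN_SIZE = 8, 8 * 1024   # placeholders, not PyTorch's defaults

def apply_mkldnn_matmul_heur(m: int, k: int, n: int) -> bool:
    return m > MIN_DIM and k > MIN_DIM and n > MIN_DIM and m * k * n > MIN_SIZE

print(apply_mkldnn_matmul_heur(64, 64, 64))   # True: large enough on all axes
print(apply_mkldnn_matmul_heur(4, 512, 512))  # False: m below the minimum
```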

View File

@@ -624,9 +624,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
   if (backend == BatchNormBackend::Miopen) {
     return std::tuple_cat(
         at::miopen_batch_norm(
-            input.contiguous(input.suggest_memory_format()),
-            weight.contiguous(),
-            bias.contiguous(),
+            input.contiguous(), weight.contiguous(), bias.contiguous(),
             running_mean.defined() ? running_mean.contiguous() : running_mean,
             running_var.defined() ? running_var.contiguous() : running_var,
             training, momentum, eps),

View File

@@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) {
         [zero, one_sixth, three, six] GPU_LAMBDA(
             scalar_t self_val) -> scalar_t {
           opmath_t x = static_cast<opmath_t>(self_val);
-          return std::min<opmath_t>(std::max<opmath_t>(x + three, zero), six) * one_sixth;
+          return std::min(std::max(x + three, zero), six) * one_sixth;
         });
   });
 }
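
The change above only pins which min/max overload is used; the math is unchanged: hardsigmoid(x) = min(max(x + 3, 0), 6) / 6. A quick check against the public API:

```python
import torch
import torch.nn.functional as F

x = torch.linspace(-5, 5, 11)
manual = torch.clamp(x + 3, 0, 6) / 6   # min(max(x + 3, 0), 6) * 1/6
torch.testing.assert_close(manual, F.hardsigmoid(x))
```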

View File

@@ -1080,6 +1080,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals
 #endif
 }
+static bool _grouped_mm_allowed_device() {
+#ifdef USE_ROCM
+  return false;
+#else
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  // CUDA capability 8.0 and greater
+  return dprops->major >= 8;
+#endif
+}
 #ifdef USE_ROCM
 static bool _scaled_mm_is_fnuz() {
   return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
@@ -1776,19 +1786,14 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
     const std::optional<at::Tensor>& offs,
     const std::optional<at::Tensor>& bias,
     std::optional<c10::ScalarType> out_dtype) {
+#ifndef USE_ROCM
   _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
   bool a_b_and_out_are_bf16 = (
       mat_a.dtype() == at::kBFloat16 &&
       mat_b.dtype() == at::kBFloat16 &&
       out_dtype.value_or(at::kBFloat16) == at::kBFloat16
   );
-#ifndef USE_ROCM
   bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16;
-#else
-  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
-  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
-  bool use_fast_path = false;
-#endif
   const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
   Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
   if (use_fast_path) {
@@ -1798,6 +1803,9 @@ std::optional<c10::ScalarType> out_dtype) {
     _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
   }
   return out;
+#else
+  TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
+#endif
 }
 Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
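
Shape-wise, the fallback taken when the fast path is unavailable behaves like per-group matmuls over row slices of mat_a delimited by cumulative offsets. A reference sketch of those semantics (not the actual kernel):

```python
import torch

def grouped_mm_ref(mat_a, mat_b, offs):
    out, start = [], 0
    for g, end in enumerate(offs.tolist()):
        out.append(mat_a[start:end] @ mat_b[g])  # group g gets its own B
        start = end
    return torch.cat(out)

mat_a = torch.randn(6, 4)        # rows split into groups of 2, 1, 3
mat_b = torch.randn(3, 4, 5)     # one 4x5 matrix per group
offs = torch.tensor([2, 3, 6])   # cumulative row boundaries
print(grouped_mm_ref(mat_a, mat_b, offs).shape)  # torch.Size([6, 5])
```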

View File

@@ -7,7 +7,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/empty.h>
-#include <ATen/ops/empty_like.h>
 #include <ATen/ops/miopen_batch_norm_native.h>
 #include <ATen/ops/miopen_batch_norm_backward_native.h>
 #endif
@@ -103,7 +102,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
     mode = miopenBNSpatial;
   }
-  auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
+  auto output_t = at::empty(input->sizes(), input->options());
   TensorArg output{ output_t, "output", 0 };
   auto handle = getMiopenHandle();
@@ -171,15 +170,20 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
     const std::optional<Tensor>& save_var_t_opt,
     double epsilon) {
   // See [Note: hacky wrapper removal for optional tensor]
-  const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
-  const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
-  auto grad_output_contig =
-      grad_output_t.contiguous(input_t.suggest_memory_format());
-  TensorArg input{input_t, "input", 1},
-      grad_output{grad_output_contig, "grad_output", 2},
-      weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
-      save_var{save_var_t, "save_var", 5};
+  const Tensor& running_mean =
+      running_mean_opt.value_or(Tensor());
+  const Tensor& running_var =
+      running_var_opt.value_or(Tensor());
+  const Tensor& save_mean_t =
+      save_mean_t_opt.value_or(Tensor());
+  const Tensor& save_var_t =
+      save_var_t_opt.value_or(Tensor());
+  TensorArg input{ input_t, "input", 1 },
+            grad_output{ grad_output_t, "grad_output", 2 },
+            weight{ weight_t, "weight", 3 },
+            save_mean{ save_mean_t, "save_mean", 4 },
+            save_var{ save_var_t, "save_var", 5 };
   CheckedFrom c = "miopen_batch_norm_backward";
   checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
@@ -191,11 +195,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
   }
   checkAllSameType(c, {input, grad_output});
   checkAllSameType(c, {weight, save_mean, save_var});
-  // TODO: is weight required to be contiguous?
-  checkAllContiguous(c, {save_mean, save_var});
-  // TODO: TensorArg check should start handle memory format
-  TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
-  TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
+  checkAllContiguous(c, {input, grad_output, save_mean, save_var});
   checkDimRange(c, input, 2, 6 /* exclusive */);
   checkSameSize(c, input, grad_output);
   auto num_features = input->size(1);
@@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
     mode = miopenBNSpatial;
   }
-  auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
+  auto grad_input_t = at::empty(input->sizes(), input->options());
   auto grad_weight_t = at::empty(weight->sizes(), weight->options());
   auto grad_bias_t = at::empty(weight->sizes(), weight->options());
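
The recurring `suggest_memory_format()` calls in this hunk are about keeping channels-last (NHWC) inputs in their native layout instead of forcing NCHW. A small demonstration of what that preserves:

```python
import torch

x = torch.randn(2, 3, 4, 4).to(memory_format=torch.channels_last)
print(x.is_contiguous())                                   # False (NCHW sense)
print(x.is_contiguous(memory_format=torch.channels_last))  # True
y = x.contiguous(memory_format=torch.channels_last)        # no repacking
print(y.stride())                                          # (48, 1, 12, 3)
```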

View File

@@ -617,7 +617,6 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) {
   // we allocate 1 here due to MacOS13 bug for gather MPSGraph op, look below for the error
   Tensor output_t = at::empty({1}, input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
   if (output_t.numel() == 0 || num_in_elements == 0) {
-    output_t.fill_(std::numeric_limits<float>::quiet_NaN());
     return output_t;
   }

View File

@@ -1414,7 +1414,7 @@
 - func: cat(Tensor[] tensors, int dim=0) -> Tensor
   structured_delegate: cat.out
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: cat_sparse
+    SparseCPU, SparseCUDA: cat_sparse
     QuantizedCPU: cat_quantized_cpu
     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
   tags: core
@@ -1798,7 +1798,7 @@
   device_guard: False
   dispatch:
     MkldnnCPU: copy_mkldnn_
-    SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
+    SparseCPU, SparseCUDA: copy_sparse_wrapper_
     CompositeExplicitAutograd: copy_
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
@@ -2160,7 +2160,7 @@
   variants: function, method
   structured_delegate: div.out
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: div_sparse
+    SparseCPU, SparseCUDA: div_sparse
     ZeroTensor: div_zerotensor
     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
   tags: [core, pointwise]
@@ -2170,7 +2170,7 @@
   variants: method
   structured_delegate: div.out
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: div_sparse_
+    SparseCPU, SparseCUDA: div_sparse_
   tags: pointwise
 - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -2179,7 +2179,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA, MPS, MTIA: div_out
-    SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
+    SparseCPU, SparseCUDA: div_out_sparse_zerodim
   tags: pointwise
 - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@@ -2187,7 +2187,7 @@
   variants: function, method
   structured_delegate: div.out_mode
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: div_sparse
+    SparseCPU, SparseCUDA: div_sparse
   tags: [core, pointwise]
 - func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
@@ -2195,7 +2195,7 @@
   variants: method
   structured_delegate: div.out_mode
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: div_sparse_
+    SparseCPU, SparseCUDA: div_sparse_
   tags: pointwise
 - func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
@@ -2204,7 +2204,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA, MPS: div_out_mode
-    SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
+    SparseCPU, SparseCUDA: div_out_sparse_zerodim
   tags: pointwise
 # For C++ only, until we have conversion from C++ numbers to Tensor
@@ -2768,20 +2768,20 @@
   variants: function, method
   dispatch:
     CPU, CUDA, MPS, MTIA: floor_divide
-    SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
+    SparseCPU, SparseCUDA: floor_divide_sparse
 - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: method
   dispatch:
     CPU, CUDA, MPS: floor_divide_
-    SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
+    SparseCPU, SparseCUDA: floor_divide_sparse_
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   dispatch:
     CPU, CUDA, MPS: floor_divide_out
-    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
+    SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -4273,7 +4273,7 @@
   structured_delegate: mul.out
   variants: function, method
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: mul_sparse
+    SparseCPU, SparseCUDA: mul_sparse
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
@@ -4285,7 +4285,7 @@
   structured_delegate: mul.out
   variants: method
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
+    SparseCPU, SparseCUDA: mul_sparse_
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
     MkldnnCPU: mkldnn_mul_
     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
@@ -4299,7 +4299,6 @@
     CPU, CUDA, MPS, MTIA: mul_out
     SparseCPU: mul_out_sparse_cpu
     SparseCUDA: mul_out_sparse_cuda
-    SparseMPS: mul_out_sparse_mps
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
     MkldnnCPU: mkldnn_mul_out
   tags: pointwise
@@ -5849,7 +5848,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: sum
-    SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
+    SparseCPU, SparseCUDA, SparseMeta: sum_coo
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
   autogen: sum.out
@@ -5860,7 +5859,7 @@
   variants: function, method
   dispatch:
     NestedTensorCPU: NestedTensor_sum_dim_CPU
-    SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
+    SparseCPU, SparseCUDA: sum_sparse_coo
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
   tags: core
@@ -6976,7 +6975,7 @@
     CPU, CUDA: sub_out
     MPS: sub_out_mps
     MTIA: sub_out_mtia
-    SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
+    SparseCPU, SparseCUDA: sub_out_sparse
   tags: pointwise
 - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@@ -6984,7 +6983,7 @@
   variants: function, method
   structured_delegate: sub.out
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: sub_sparse
+    SparseCPU, SparseCUDA: sub_sparse
     ZeroTensor: sub_zerotensor
     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
   tags: [core, pointwise]
@@ -6994,7 +6993,7 @@
   variants: method
   structured_delegate: sub.out
   dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
+    SparseCPU, SparseCUDA: sub_sparse_
   tags: pointwise
 # For C++ only, until we have conversion from C++ numbers to Tensor
@@ -10343,7 +10342,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: pow_Tensor_Scalar_out
-    SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
+    SparseCPU, SparseCUDA: pow_out_sparse_scalar
     MPS: pow_tensor_scalar_out_mps
   tags: pointwise
@@ -10352,7 +10351,7 @@
   structured_delegate: pow.Tensor_Scalar_out
   variants: function, method
dispatch: dispatch:
SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar SparseCPU, SparseCUDA: pow_sparse_scalar
tags: [core, pointwise] tags: [core, pointwise]
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)


@@ -2,7 +2,6 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 #include <ATen/Dispatch.h>
-#include <ATen/AccumulateType.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/native/sparse/ParamUtils.h>
 #include <ATen/native/SparseTensorUtils.h>
@@ -296,7 +295,6 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t dim
     to exp functions as well as reuse of softmax implementation for
     log_softmax.
   */
-  using accscalar_t = at::acc_type<scalar_t, false>;
   auto sparse_dim = input.sparse_dim();
   auto indices = input._indices().contiguous();
   auto values = input._values().contiguous();
@@ -342,14 +340,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t dim
       continue;

     /* Prepare scratch space */
-    std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
-    std::vector<accscalar_t> exp_sums_row(nvalues, 0);
+    std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
+    std::vector<scalar_t> exp_sums_row(nvalues, 0);

     /* Compute mx */
     for (int64_t i : pool_indices) {
       auto values_row = values_accessor[i];
       for (const auto j : c10::irange(nvalues)) {
-        mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
+        mx_row[j] = std::max(mx_row[j], values_row[j]);
       }
     }


@@ -391,13 +391,13 @@ void _validate_sparse_coo_tensor_args(
   int64_t sparse_dim = indices.size(0);
   int64_t dense_dim = values.dim() - 1;
   TORCH_CHECK(
-      sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
-      "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
-      size.size(),
-      ", sparse_dim = ",
-      sparse_dim,
-      ", dense_dim = ",
-      dense_dim);
+      static_cast<int64_t>(size.size()) == sparse_dim + dense_dim,
+      "number of dimensions must be sparse_dim (",
+      sparse_dim,
+      ") + dense_dim (",
+      dense_dim,
+      "), but got ",
+      size.size());
   if (check_pinning) {
     TORCH_CHECK(
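
Both wordings of the check enforce the same COO construction invariant: indices has shape (sparse_dim, nnz), values has shape (nnz, *dense_sizes), so the declared size must supply exactly sparse_dim + dense_dim entries. A small libtorch sketch of a size vector that satisfies it (the shapes here are made up for illustration):

#include <torch/torch.h>

int main() {
  // sparse_dim = 2 (two index rows), dense_dim = 1 (trailing values dim)
  auto indices = torch::tensor({{0, 1}, {2, 0}});  // shape (2, nnz=2)
  auto values = torch::rand({2, 4});               // shape (nnz=2, 4)
  // len(size) must be 2 + 1 = 3; {3, 3} or {3, 3, 4, 4} would trip the check.
  auto sp = torch::sparse_coo_tensor(indices, values, {3, 3, 4});
  return 0;
}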


@@ -10,7 +10,6 @@
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
 #include <ATen/ops/cat.h>
 #include <ATen/ops/add_native.h>
-#include <ATen/ops/mul_native.h>
 #include <ATen/ops/empty_native.h>
 #include <ATen/ops/zeros_native.h>
 #include <ATen/ops/result_type.h>
@@ -21,265 +20,10 @@
 namespace at::native {
 using namespace at::sparse;
-using namespace mps;
-
-#ifndef PYTORCH_JIT_COMPILE_SHADERS
-static auto& lib = MetalShaderLibrary::getBundledLibrary();
-#else
-#include <ATen/native/mps/Mul_metallib.h>
-#endif
-
-static SparseTensor& mul_out_dense_sparse_mps(
-    const Tensor& dense,
-    const Tensor& sparse,
-    SparseTensor& out) {
-  TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
-  TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
-  TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());
-
-  const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
-  TORCH_CHECK(dense.is_mps() || scalar_like,
-              "mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());
-
-  const int64_t nnz = sparse._nnz();
-  out.resize_as_(sparse);
-
-  auto commonDtype = at::result_type(dense, sparse);
-  TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
-              "Can't convert result type ", commonDtype, " to output ", out.scalar_type());
-
-  auto indices = sparse._indices().contiguous();
-  auto values = sparse._values().to(commonDtype).contiguous();
-
-  if (nnz == 0) {
-    auto empty_vals = values.narrow(0, 0, 0);
-    alias_into_sparse(out,
-                      indices.narrow(1, 0, 0),
-                      (out.scalar_type() == commonDtype) ? empty_vals
-                                                         : empty_vals.to(out.scalar_type()));
-    out._coalesced_(sparse.is_coalesced());
-    return out;
-  }
-
-  if (scalar_like) {
-    auto scalar = dense;
-    if (dense.numel() == 1 && dense.dim() > 0) {
-      scalar = dense.view({});
-    }
-    scalar = scalar.to(values.options());
-    auto out_vals = values.mul(scalar);
-    if (out.scalar_type() != commonDtype) {
-      out_vals = out_vals.to(out.scalar_type());
-    }
-    alias_into_sparse(out, indices, out_vals);
-    out._coalesced_(sparse.is_coalesced());
-    return out;
-  }
-
-  TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
-              "mul(dense, sparse): sizes must match exactly (no broadcasting): ",
-              dense.sizes(), " vs ", sparse.sizes());
-
-  const int64_t ndim_i = sparse.sparse_dim();
-  const int64_t ndim = dense.dim();
-  TORCH_CHECK(
-      ndim_i <= ndim,
-      "mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);
-
-  // Prepare shapes
-  int64_t view_rows = 1, view_cols = 1;
-  for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
-  for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);
-
-  auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
-  auto out_vals = at::empty_like(values, values.options());
-
-  const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
-  const uint32_t u_nnz = static_cast<uint32_t>(nnz);
-  const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);
-
-  auto stream = getCurrentMPSStream();
-  dispatch_sync_with_rethrow(stream->queue(), ^() {
-    @autoreleasepool {
-      auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
-      auto computeEncoder = stream->commandEncoder();
-      [computeEncoder setComputePipelineState:pso];
-
-      const uint32_t gridWidth = u_view_cols;
-      const uint32_t gridDepth = u_nnz;
-      MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);
-
-      const uint32_t maxThreadsPerGroup = pso.maxTotalThreadsPerThreadgroup;
-      const uint32_t tew = pso.threadExecutionWidth;
-      uint32_t tgWidth = std::min(gridWidth, tew);
-      MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);
-
-      mtl_setArgs(
-          computeEncoder,
-          dense_mps,
-          values,
-          out_vals,
-          indices,
-          sparse.sizes(),
-          std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
-      );
-
-      [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
-    }
-  });
-
-  Tensor final_vals = out_vals;
-  if (out.scalar_type() != commonDtype) {
-    final_vals = final_vals.to(out.scalar_type());
-  }
-
-  alias_into_sparse(out, indices, final_vals);
-  out._coalesced_(sparse.is_coalesced());
-  return out;
-}
-
-SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
-  TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());
-
-  // Dense x sparse fallback (keep dense first)
-  if (!t_.is_sparse() || !src_.is_sparse()) {
-    const Tensor& dense = t_.is_sparse() ? src_ : t_;
-    const Tensor& sparse = t_.is_sparse() ? t_ : src_;
-    return mul_out_dense_sparse_mps(dense, sparse, r_);
-  }
-
-  TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
-  TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
-  TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
-              "mul(sparse, sparse): must have same sparse_dim, got ",
-              t_.sparse_dim(), " vs ", src_.sparse_dim());
-  TORCH_CHECK(t_.sizes().equals(src_.sizes()),
-              "mul(sparse, sparse): sizes must match exactly (no broadcasting).");
-
-  // Coalesce and early-exit on structurally empty operands
-  auto lhs = t_.coalesce();
-  auto rhs = src_.coalesce();
-  const int64_t lhs_nnz = lhs._nnz();
-  const int64_t rhs_nnz = rhs._nnz();
-  if (!lhs_nnz || !rhs_nnz) {
-    r_.resize_as_(lhs);
-    return r_.zero_();
-  }
-
-  // dtype checks and promotion
-  auto commonDtype = at::result_type(lhs, rhs);
-  TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
-              "Can't convert result type ", commonDtype, " to output ", r_.scalar_type());
-
-  const int64_t ndim_i = lhs.sparse_dim();
-
-  // ndim_i == 0, at most one structural entry
-  if (ndim_i == 0) {
-    r_.resize_as_(lhs);
-    const bool has = (lhs_nnz && rhs_nnz);
-    auto out_indices = lhs._indices().narrow(1, 0, has ? 1 : 0);
-    Tensor lhs_vals = lhs._values().to(commonDtype);
-    Tensor rhs_vals = rhs._values().to(commonDtype);
-    lhs_vals = lhs_vals.narrow(0, 0, has ? 1 : 0);
-    rhs_vals = rhs_vals.narrow(0, 0, has ? 1 : 0);
-    Tensor out_values = lhs_vals.mul(rhs_vals);
-    if (r_.scalar_type() != commonDtype) {
-      out_values = out_values.to(r_.scalar_type());
-    }
-    alias_into_sparse(r_, out_indices, out_values);
-    r_._coalesced_(true);
-    return r_;
-  }
-
-  // General path, intersect keys, then gather + multiply on GPU
-  const auto device = r_.device();
-  auto stream = getCurrentMPSStream();
-
-  auto lhs_indices = lhs._indices();
-  auto rhs_indices = rhs._indices();
-  auto lhs_values = lhs._values().to(commonDtype);
-  auto rhs_values = rhs._values().to(commonDtype);
-
-  // Flatten sparse indices to keys
-  auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
-  auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());
-
-  // Intersect sorted keys (search the shorter in the longer)
-  const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
-  const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
-  const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
-
-  auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
-  auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;
-
-  auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
-  auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
-  auto counter = at::zeros({1}, at::device(device).dtype(kInt));
-
-  dispatch_sync_with_rethrow(stream->queue(), ^() {
-    @autoreleasepool {
-      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
-      auto enc = stream->commandEncoder();
-      [enc setComputePipelineState:pso];
-      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
-                  static_cast<uint32_t>(lenB), A_is_lhs);
-      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
-    }
-  });
-
-  const uint32_t M = counter.item<int32_t>(); // number of structural matches
-
-  r_.resize_as_(lhs);
-
-  auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
-  auto lhs_match = outA_idx.narrow(0, 0, M);
-  auto rhs_match = outB_idx.narrow(0, 0, M);
-  auto out_val_sizes = lhs_values.sizes().vec();
-  out_val_sizes[0] = static_cast<int64_t>(M);
-  auto out_values = at::empty(out_val_sizes, lhs_values.options());
-
-  const uint32_t cols = static_cast<uint32_t>(
-      lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));
-
-  dispatch_sync_with_rethrow(stream->queue(), ^() {
-    @autoreleasepool {
-      auto pso = lib.getPipelineStateForFunc(
-          "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
-      auto enc = stream->commandEncoder();
-      [enc setComputePipelineState:pso];
-
-      const uint32_t tew = pso.threadExecutionWidth;
-      uint32_t tgW = std::min(cols, tew);
-      MTLSize grid = MTLSizeMake(cols, 1, M);
-      MTLSize tgs = MTLSizeMake(tgW, 1, 1);
-
-      mtl_setArgs(enc,
-                  lhs_values, rhs_values,
-                  lhs_match, rhs_match,
-                  lhs_indices, out_indices,
-                  out_values,
-                  std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
-                  std::array<uint32_t, 2>{M, cols});
-      [enc dispatchThreads:grid threadsPerThreadgroup:tgs];
-    }
-  });
-
-  if (r_.scalar_type() != commonDtype) {
-    out_values = out_values.to(r_.scalar_type());
-  }
-
-  alias_into_sparse(r_, out_indices, out_values);
-  r_._coalesced_(true);
-  return r_;
-}
-
-static Tensor& add_out_dense_sparse_mps(
+Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
+
+Tensor& add_out_dense_sparse_mps(
     Tensor& out,
     const Tensor& dense,
     const SparseTensor& sparse,


@@ -1,150 +0,0 @@
-#include <metal_stdlib>
-#include <c10/metal/indexing.h>
-using namespace metal;
-
-template <typename T>
-kernel void dense_sparse_mul_kernel(
-    device const T* dense [[buffer(0)]],
-    device const T* values [[buffer(1)]],
-    device T* out_values [[buffer(2)]],
-    device const long* indices [[buffer(3)]],
-    device const long* sizes [[buffer(4)]],
-    constant uint3& sparse_params [[buffer(5)]],
-    uint3 gid [[thread_position_in_grid]])
-{
-  uint col = gid.x;
-  uint i = gid.z;
-
-  uint nnz = sparse_params.x;
-  uint ndim_i = sparse_params.y;
-  uint view_cols = sparse_params.z;
-
-  long key = 0;
-  for (uint d = 0; d < ndim_i; ++d) {
-    long idx_d = indices[(ulong)d * (ulong)nnz + (ulong)i];
-    const auto sz_d = sizes[d];
-    key = key * sz_d + idx_d;
-  }
-
-  ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
-  ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
-
-  const auto a = static_cast<float>(values[val_idx]);
-  const auto b = static_cast<float>(dense[dense_idx]);
-  out_values[val_idx] = static_cast<T>(a * b);
-}
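
The deleted kernel's addressing scheme is worth spelling out: each nonzero's first ndim_i coordinates are folded into a single row-major key (key = key * sizes[d] + idx[d]), which selects the matching row of the dense tensor viewed as [view_rows, view_cols]. A host-side C++ sketch of that linearization (illustrative, not part of the deleted shader):

#include <cstdint>
#include <vector>

// Row-major fold of one nonzero's sparse coordinates, as in
// dense_sparse_mul_kernel above: the resulting key is the row of the
// dense tensor reshaped to [view_rows, view_cols].
int64_t flatten_sparse_index(const std::vector<int64_t>& coord,
                             const std::vector<int64_t>& sizes) {
  int64_t key = 0;
  for (size_t d = 0; d < coord.size(); ++d) {
    key = key * sizes[d] + coord[d];
  }
  return key;
}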
-kernel void intersect_binary_search(
-    device const long* keysA [[buffer(0)]],
-    device const long* keysB [[buffer(1)]],
-    device long* outA_idx [[buffer(2)]],
-    device long* outB_idx [[buffer(3)]],
-    device atomic_uint* counter [[buffer(4)]],
-    constant uint& lenB [[buffer(5)]],
-    constant bool& A_is_lhs [[buffer(6)]],
-    uint3 tid_in_grid [[thread_position_in_grid]])
-{
-  uint gid = tid_in_grid.x;
-  long key = keysA[gid];
-
-  // lower_bound in B
-  uint lo = 0;
-  uint hi = lenB;
-  while (lo < hi) {
-    uint mid = (lo + hi) >> 1;
-    long v = keysB[mid];
-    if (v < key) lo = mid + 1;
-    else hi = mid;
-  }
-
-  if (lo < lenB && keysB[lo] == key) {
-    uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
-    if (A_is_lhs) {
-      outA_idx[pos] = (long)gid;
-      outB_idx[pos] = (long)lo;
-    } else {
-      outA_idx[pos] = (long)lo;
-      outB_idx[pos] = (long)gid;
-    }
-  }
-}
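
The deleted search kernel is a plain sorted-set intersection: one thread per key of the shorter array A does a lower_bound over the longer array B and, on a hit, appends the matching pair of positions through an atomically bumped cursor. A sequential C++ sketch of the same logic (names are illustrative):

#include <algorithm>
#include <cstdint>
#include <vector>

// CPU analogue of intersect_binary_search: each loop iteration plays the
// role of one GPU thread; push_back stands in for the atomic counter.
void intersect_sorted_keys(const std::vector<int64_t>& A,
                           const std::vector<int64_t>& B,
                           std::vector<int64_t>& outA,
                           std::vector<int64_t>& outB) {
  for (size_t i = 0; i < A.size(); ++i) {
    auto it = std::lower_bound(B.begin(), B.end(), A[i]);
    if (it != B.end() && *it == A[i]) {
      outA.push_back(static_cast<int64_t>(i));
      outB.push_back(static_cast<int64_t>(it - B.begin()));
    }
  }
}

Because both operands are coalesced before the kernel runs, each flattened key is unique on its side, so at most min(|A|, |B|) matches can be produced and the lenA-sized output buffers cannot overflow; the atomic counter only makes the write order nondeterministic, not the set of matches.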
-template <typename T>
-kernel void fused_gather_mul_kernel(
-    device const T* lhs_vals [[buffer(0)]],
-    device const T* rhs_vals [[buffer(1)]],
-    device const long* lhs_sel [[buffer(2)]],
-    device const long* rhs_sel [[buffer(3)]],
-    device const long* lhs_indices [[buffer(4)]],
-    device long* out_indices [[buffer(5)]],
-    device T* out_vals [[buffer(6)]],
-    constant uint2& dims_input [[buffer(7)]],
-    constant uint2& dims_output [[buffer(8)]],
-    uint3 gid [[thread_position_in_grid]])
-{
-  const uint col = gid.x;
-  const uint k = gid.z;
-
-  const uint n_dim_i = dims_input.x;
-  const uint L = dims_input.y;
-  const uint M = dims_output.x;
-  const uint view_cols = dims_output.y;
-
-  const long iL = lhs_sel[k];
-  const long iR = rhs_sel[k];
-
-  if (col < view_cols) {
-    const ulong offL = (ulong)iL * (ulong)view_cols + (ulong)col;
-    const ulong offR = (ulong)iR * (ulong)view_cols + (ulong)col;
-    const ulong offO = (ulong)k * (ulong)view_cols + (ulong)col;
-    const float a = (float)lhs_vals[offL];
-    const float b = (float)rhs_vals[offR];
-    out_vals[offO] = (T)(a * b);
-  }
-
-  // One thread per match copies the indices column
-  if (col == 0) {
-    const ulong uL = (ulong)L;
-    const ulong uM = (ulong)M;
-    const ulong src_col = (ulong)iL; // gather from lhs
-    for (uint d = 0; d < n_dim_i; ++d) {
-      const long v = lhs_indices[(ulong)d * uL + src_col];
-      out_indices[(ulong)d * uM + (ulong)k] = v;
-    }
-  }
-}
-
-#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE)                              \
-  template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void  \
-  dense_sparse_mul_kernel<DTYPE>(                                        \
-      device const DTYPE* dense [[buffer(0)]],                           \
-      device const DTYPE* values [[buffer(1)]],                          \
-      device DTYPE* out_values [[buffer(2)]],                            \
-      device const long* indices [[buffer(3)]],                          \
-      device const long* sizes [[buffer(4)]],                            \
-      constant uint3& sparse_params [[buffer(5)]],                       \
-      uint3 gid [[thread_position_in_grid]]);
-
-INSTANTIATE_DENSE_SPARSE_MUL(float);
-INSTANTIATE_DENSE_SPARSE_MUL(half);
-INSTANTIATE_DENSE_SPARSE_MUL(bfloat);
-
-#define INSTANTIATE_FUSED_GATHER_MUL(DTYPE)                              \
-  template [[host_name("fused_gather_mul_kernel_" #DTYPE)]] kernel void  \
-  fused_gather_mul_kernel<DTYPE>(                                        \
-      device const DTYPE* lhs_vals [[buffer(0)]],                        \
-      device const DTYPE* rhs_vals [[buffer(1)]],                        \
-      device const long* lhs_sel [[buffer(2)]],                          \
-      device const long* rhs_sel [[buffer(3)]],                          \
-      device const long* lhs_indices [[buffer(4)]],                      \
-      device long* out_indices [[buffer(5)]],                            \
-      device DTYPE* out_vals [[buffer(6)]],                              \
-      constant uint2& dims_input [[buffer(7)]],                          \
-      constant uint2& dims_output [[buffer(8)]],                         \
-      uint3 gid [[thread_position_in_grid]]);
-
-INSTANTIATE_FUSED_GATHER_MUL(float);
-INSTANTIATE_FUSED_GATHER_MUL(half);
-INSTANTIATE_FUSED_GATHER_MUL(bfloat);


@@ -95,72 +95,6 @@
 #endif
 #endif

-#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
-namespace pytorch_flash
-{
-std::tuple<
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor>
-mha_fwd(
-    const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
-    const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
-    const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
-    std::optional<at::Tensor>&
-        out_, // batch_size x seqlen_q x num_heads x head_size
-    std::optional<at::Tensor>&
-        alibi_slopes_, // num_heads or batch_size x num_heads
-    const float p_dropout,
-    const float softmax_scale,
-    bool is_causal,
-    std::optional<int64_t> window_size_left,
-    std::optional<int64_t> window_size_right,
-    const float softcap,
-    const bool return_softmax,
-    std::optional<at::Generator> gen_) {
-#if defined(USE_ROCM_CK_SDPA)
-  if (at::globalContext().getROCmFAPreferredBackend() ==
-      at::ROCmFABackend::Ck) {
-    const int non_null_window_left = window_size_left.value_or(-1);
-    const int non_null_window_right = window_size_right.value_or(-1);
-    std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
-    return mha_fwd_ck(
-        q,
-        k,
-        v,
-        out_,
-        p_dropout,
-        softmax_scale,
-        is_causal,
-        non_null_window_left,
-        non_null_window_right,
-        return_softmax,
-        gen_,
-        dummy_attn_bias); // Not used in flash attention
-  }
-#endif
-  return mha_fwd_aot(
-      q,
-      k,
-      v,
-      out_,
-      alibi_slopes_,
-      p_dropout,
-      softmax_scale,
-      is_causal,
-      window_size_left,
-      window_size_right,
-      return_softmax,
-      gen_);
-}
-}
-#endif
-
 namespace at {
 namespace cuda::philox {


@@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
 #endif

 TORCH_API
-std::tuple<
+inline std::tuple<
     at::Tensor,
     at::Tensor,
     at::Tensor,
@@ -294,7 +294,42 @@ mha_fwd(
     std::optional<int64_t> window_size_right,
     const float softcap,
     const bool return_softmax,
-    std::optional<at::Generator> gen_);
+    std::optional<at::Generator> gen_) {
+#if defined(USE_ROCM_CK_SDPA)
+  if (at::globalContext().getROCmFAPreferredBackend() ==
+      at::ROCmFABackend::Ck) {
+    const int non_null_window_left = window_size_left.value_or(-1);
+    const int non_null_window_right = window_size_right.value_or(-1);
+    std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
+    return mha_fwd_ck(
+        q,
+        k,
+        v,
+        out_,
+        p_dropout,
+        softmax_scale,
+        is_causal,
+        non_null_window_left,
+        non_null_window_right,
+        return_softmax,
+        gen_,
+        dummy_attn_bias); // Not used in flash attention
+  }
+#endif
+  return mha_fwd_aot(
+      q,
+      k,
+      v,
+      out_,
+      alibi_slopes_,
+      p_dropout,
+      softmax_scale,
+      is_causal,
+      window_size_left,
+      window_size_right,
+      return_softmax,
+      gen_);
+}
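
Since the definition now lives in a header, the added inline above is load-bearing: without it, every translation unit including this file would emit its own external definition of mha_fwd and the link would fail with duplicate symbols. A minimal illustration of the rule (hypothetical header):

// my_ops.h -- included from many .cpp files
#pragma once

// 'inline' permits identical definitions in multiple translation units;
// the linker folds them into one (the ODR carve-out for inline functions).
inline int answer() { return 42; }

// A non-inline definition here would instead produce a
// "multiple definition of `answer`" link error once two .cpp files
// include this header.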
 inline std::tuple<
     at::Tensor,


@@ -98,11 +98,11 @@ dlrm,pass,0
-doctr_det_predictor,pass,3
-doctr_reco_predictor,pass,1
+doctr_det_predictor,pass,5
+doctr_reco_predictor,pass,4


@@ -98,11 +98,11 @@ dlrm,pass,0
-doctr_det_predictor,pass,3
-doctr_reco_predictor,pass,1
+doctr_det_predictor,pass,5
+doctr_reco_predictor,pass,4


@@ -98,11 +98,11 @@ dlrm,pass,0
-doctr_det_predictor,pass,3
-doctr_reco_predictor,pass,1
+doctr_det_predictor,pass,5
+doctr_reco_predictor,pass,4


@@ -82,11 +82,11 @@ dlrm,pass,0
-doctr_det_predictor,pass,3
-doctr_reco_predictor,pass,1
+doctr_det_predictor,pass,5
+doctr_reco_predictor,pass,4


@@ -98,11 +98,11 @@ dlrm,pass,0
-doctr_det_predictor,pass,3
-doctr_reco_predictor,pass,1
+doctr_det_predictor,pass,5
+doctr_reco_predictor,pass,4


@@ -106,11 +106,11 @@ dlrm,pass,0
-doctr_det_predictor,eager_fail_to_run,3
-doctr_reco_predictor,eager_fail_to_run,1
+doctr_det_predictor,eager_fail_to_run,5
+doctr_reco_predictor,eager_fail_to_run,4


@@ -106,11 +106,11 @@ dlrm,pass,0
-doctr_det_predictor,eager_fail_to_run,3
-doctr_reco_predictor,eager_fail_to_run,1
+doctr_det_predictor,eager_fail_to_run,5
+doctr_reco_predictor,eager_fail_to_run,4


@@ -106,11 +106,11 @@ dlrm,pass,0
-doctr_det_predictor,eager_fail_to_run,3
-doctr_reco_predictor,eager_fail_to_run,1
+doctr_det_predictor,eager_fail_to_run,5
+doctr_reco_predictor,eager_fail_to_run,4


@@ -106,11 +106,11 @@ dlrm,pass,0
-doctr_det_predictor,eager_fail_to_run,3
-doctr_reco_predictor,eager_fail_to_run,1
+doctr_det_predictor,eager_fail_to_run,5
+doctr_reco_predictor,eager_fail_to_run,4


@@ -106,11 +106,11 @@ dlrm,pass,0
-doctr_det_predictor,eager_fail_to_run,3
-doctr_reco_predictor,eager_fail_to_run,1
+doctr_det_predictor,eager_fail_to_run,5
+doctr_reco_predictor,eager_fail_to_run,4


@@ -219,7 +219,9 @@ skip:
   - timm_regnet
   - timm_nfnet

-  cuda: []
+  cuda:
+    # Temporary until https://github.com/pytorch/pytorch/issues/162282 is fixed
+    - sam_fast

 test:
   training:


@@ -4,7 +4,6 @@ import csv
 import functools
 import json
 import os
-import platform
 import timeit
 from collections import namedtuple
 from dataclasses import asdict, dataclass
@@ -192,11 +191,6 @@ class BenchmarkRunner:
         self.predefined_minimum_secs = 1
         self.max_iters = 1e6
         self.use_jit = args.use_jit
-        self.use_compile = args.use_compile
-        if self.use_jit and self.use_compile:
-            raise ValueError(
-                "use_jit and use_compile are mutually exclusive, please specify one."
-            )
         self.num_runs = args.num_runs
         self.print_per_iter = False
         self.output_csv = args.output_csv
@@ -228,7 +222,7 @@
         if self.args.operators:
             print(f"# {self.args.operators}")

-    def _print_perf_result(self, results, test_case):
+    def _print_perf_result(self, reported_run_time_us, test_case):
         if self.args.report_aibench:
             # Output for AIBench
             # Print out per iteration execution time instead of avg time
@@ -242,14 +236,12 @@
                             "type": test_name,
                             "metric": "latency",
                             "unit": "us",
-                            "value": str(results["reported_run_time_us"[run]]),
+                            "value": str(reported_run_time_us[run]),
                         }
                     )
                 )
         else:
-            print(
-                f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}"
-            )
+            print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}")
             print(
                 f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}"
             )
@@ -258,33 +250,25 @@
             if self.num_runs > 1:
                 for run in range(self.num_runs):
                     print(
-                        f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}"
+                        f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}"
                     )
                 print()
             else:
-                print(
-                    f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}"
-                )
-                print(f"Peak Memory (KB) : {results['peak_memory']}\n")
+                print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n")

-    def _perf_result_to_dict(self, results, test_case):
+    def _perf_result_to_dict(self, reported_run_time_us, test_case):
         """This function is the parallel of _print_perf_result, which instead of
         writing information to terminal, returns a dictionary.
         """
         if self.args.report_aibench:
             return {}
         out = {
             "test_name": test_case.test_config.test_name,
             "input_config": test_case.test_config.input_config,
-            "runtime": (
-                "JIT" if self.use_jit else "Compile" if self.use_compile else "Eager"
-            ),
+            "mode": "JIT" if self.use_jit else "Eager",
             "run": "Backward" if test_case.test_config.run_backward else "Forward",
-            "latency": round(results["reported_run_time_us"][0], 3),
+            "latency": round(reported_run_time_us[0], 3),
             "latency unit": "us",
-            "peak memory": results["peak_memory"],
-            "memory unit": "KB",
         }

         # parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary
@@ -346,8 +330,6 @@
         func = test_case.run_forward
         if self.use_jit:
             func = test_case.run_jit_forward
-        if self.use_compile:
-            func = test_case.run_compile_forward
         forward_time = timeit.timeit(
             functools.partial(func, iters, print_per_iter, cuda_sync), number=1
         )
@@ -364,7 +346,7 @@
         )
         return backward_time

-    def _measure_metrics(self, launch_test, test_case, iters, print_per_iter):
+    def _measure_time(self, launch_test, test_case, iters, print_per_iter):
         """
         This function execute the operator for <iters> iterations then look at the time.
         If it's not significant, the number of iterations will be increased before rerun.
@@ -372,25 +354,8 @@
         """
         curr_test_total_time = 0
         time_trace = []
-        peak_memory = 0
-        input_values = test_case.op_bench.inputs.values()
-        device, device_module = None, None
-        if input_values and isinstance(next(iter(input_values)), torch.Tensor):
-            # The device and device module information are crucial for memory metric calculation,
-            # In case of ops where inputs are integers (not tensor), memory metrics need not be calculated.
-            sample_input = next(iter(input_values))
-            device = sample_input.device
-            device_module = torch.get_device_module(device.type)
-        # TODO: add support for cpu memory measurement
         while True:
-            if hasattr(device_module, "reset_peak_memory_stats"):
-                device_module.reset_peak_memory_stats(device)
             run_time_sec = launch_test(test_case, iters, print_per_iter)
-            if hasattr(device_module, "synchronize"):
-                device_module.synchronize(device)
-            # Memory measurement process
-            if hasattr(device_module, "max_memory_allocated"):
-                peak_memory = device_module.max_memory_allocated(device)
             curr_test_total_time += run_time_sec
             # Analyze time after each run to decide if the result is stable
             results_are_significant = self._iteration_result_is_significant(
@@ -404,13 +369,7 @@
             time_trace.append(report_run_time)
             # Print out the time spent in each epoch in ms
             if self.args.report_aibench:
-                mode = (
-                    "JIT"
-                    if self.use_jit
-                    else "Compile"
-                    if self.use_compile
-                    else "Eager"
-                )
+                mode = "JIT" if self.use_jit else "Eager"
                 test_name = "_".join(
                     [test_case.framework, test_case.test_config.test_name, mode]
                 )
@@ -422,7 +381,7 @@
                             "metric": "latency",
                             "unit": "ms",
                             "value": str(report_run_time / 1e3),
-                        },
+                        }
                     )
                 )
             if results_are_significant:
@@ -432,7 +391,7 @@
             # iteration count, and run the benchmark again...
             iters = self._predict_num_iter_needed(iters)
         reported_run_time_us = np.percentile(np.array(time_trace), 50)
-        return reported_run_time_us, peak_memory / 1024
+        return reported_run_time_us
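
Both the removed _measure_metrics and the surviving _measure_time implement the same adaptive loop: run the op for a trial iteration count, record per-iteration time, and grow the count (via _predict_num_iter_needed) until _iteration_result_is_significant accepts the trace, then report the median. A compact C++ sketch of that control flow, with the significance rule and growth factor as stand-in assumptions:

#include <algorithm>
#include <chrono>
#include <functional>
#include <vector>

// Adaptive micro-benchmark loop in the spirit of _measure_time. The
// 1-second threshold and the doubling are illustrative stand-ins for
// _iteration_result_is_significant / _predict_num_iter_needed.
double measure_time_us(const std::function<void(int)>& run, int iters) {
  std::vector<double> trace_us;
  double total_sec = 0.0;
  while (true) {
    auto t0 = std::chrono::steady_clock::now();
    run(iters);
    std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    total_sec += dt.count();
    trace_us.push_back(dt.count() * 1e6 / iters);
    if (total_sec >= 1.0) {  // "result is significant"
      break;
    }
    iters *= 2;  // predict a larger iteration count and rerun
  }
  // Median, mirroring np.percentile(time_trace, 50).
  std::nth_element(trace_us.begin(), trace_us.begin() + trace_us.size() / 2,
                   trace_us.end());
  return trace_us[trace_us.size() / 2];
}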

     def _check_keep(self, test_flag, cmd_flag):
         return cmd_flag is None or test_flag == cmd_flag
@@ -519,7 +478,6 @@
         self,
         perf_list,
         output_file,
-        benchmark_name="PyTorch operator benchmark",
     ):
         """
         Write the result into JSON format, so that it can be uploaded to the benchmark database
@@ -537,10 +495,8 @@
             input_config = perf_item.get("input_config", "")
             run_type = perf_item.get("run")
             latency = perf_item.get("latency", 0)
-            peak_memory = perf_item.get("peak memory", 0)
-            device = perf_item.get("device", "unknown")
-            dtype = perf_item.get("dtype", "torch.float").split(".")[1]
-            runtime = perf_item.get("runtime", None)
+            dtype = "float32"  # default

             # Extract mode based on run_type
             mode = None
@@ -549,22 +505,6 @@
             elif run_type == "Backward":
                 mode = "training"

-            # Extract use_compile from it
-            if runtime == "Compile":
-                use_compile = True
-            elif runtime == "Eager":
-                use_compile = False
-            else:
-                use_compile = None
-
-            device_arch = (
-                torch.cuda.get_device_name(0)
-                if device == "cuda"
-                else platform.processor()
-                if device == "cpu"
-                else "unknown"
-            )
-
             # Create the record
             @dataclass
             class BenchmarkInfo:
@@ -592,18 +532,12 @@
                 model: ModelInfo
                 metric: MetricInfo

-            # Add record for latency
-            record_latency = BenchmarkRecord(
+            record = BenchmarkRecord(
                 benchmark=BenchmarkInfo(
-                    name=benchmark_name,
+                    name="PyTorch operator benchmark",
                     mode=mode,
                     dtype=dtype,
-                    extra_info={
-                        "input_config": input_config,
-                        "device": device,
-                        "arch": device_arch,
-                        "use_compile": use_compile,
-                    },
+                    extra_info={"input_config": input_config},
                 ),
                 model=ModelInfo(
                     name=test_name, type="micro-benchmark", origins=["pytorch"]
@@ -615,17 +549,8 @@
                     target_value=None,
                 ),
             )
-            records.append(asdict(record_latency))

-            # Add record for peak memory
-            record_memory = copy.deepcopy(record_latency)
-            record_memory.metric = MetricInfo(
-                name="peak memory",
-                unit="KB",
-                benchmark_values=[peak_memory],
-                target_value=None,
-            )
-            records.append(asdict(record_memory))
+            records.append(asdict(record))

         # Write all records to the output file
         with open(output_file, "w", encoding="utf-8") as f:
@@ -641,7 +566,6 @@
             "tag",
             "run_backward",
             "Execution Time",
-            "Peak Memory (KB)",
         ]

         if self.args.output_json or self.args.output_json_for_dashboard:
@@ -679,16 +603,13 @@
                     test_case, self.args.warmup_iterations, print_per_iter=False
                 )
                 # Actual Execution
-                results = [
-                    self._measure_metrics(
+                reported_time = [
+                    self._measure_time(
                         launch_func, test_case, self.iters, self.print_per_iter
                     )
                     for _ in range(self.num_runs)
                 ]
-                result_dict = dict()
-                result_dict["reported_run_time_us"] = [r[0] for r in results]
-                result_dict["peak_memory"] = results[0][1]
-                self._print_perf_result(results=result_dict, test_case=test_case)
+                self._print_perf_result(reported_time, test_case)

                 # output results to csv
                 self._output_csv(
@@ -704,17 +625,16 @@
                         ),
                         test_case.test_config.tag,
                         test_case.test_config.run_backward,
-                        result_dict["reported_run_time_us"][0],
-                        result_dict["peak_memory"],
+                        reported_time[0],
                     ],
                 )
                 if self.args.output_json or self.args.output_json_for_dashboard:
-                    perf_list.append(self._perf_result_to_dict(result_dict, test_case))
+                    perf_list.append(
+                        self._perf_result_to_dict(reported_time, test_case)
+                    )

         if self.args.output_json_for_dashboard:
-            self._output_json(
-                perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name
-            )
+            self._output_json(perf_list, self.args.output_json_for_dashboard)

         if self.args.output_json:
             with open(self.args.output_json, "w") as f:


@@ -4,15 +4,6 @@ import time

 import torch

-# Import the C++ extension to register the _consume operator
-try:
-    import benchmark_cpp_extension  # noqa: F401
-except ImportError as err:
-    # If the extension isn't built, the script must raise an error
-    raise ImportError(
-        "Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ."
-    ) from err
-
 """PyTorch performance microbenchmarks.

 This module contains PyTorch-specific functionalities for performance
@@ -80,16 +71,6 @@
         for _ in range(iters):
             torch.ops.operator_benchmark._consume(self.forward_impl())

-    def forward_impl_eager(self):
-        # This is to supply the inputs to the forward function which
-        # will be called in both the eager and compile mode of local runs
-        return self.forward(*self.get_inputs())
-
-    def forward_consume_eager(self, iters: int):
-        # Eager version of forward_consume without decorators (compilation handled by torch.compile)
-        for _ in range(iters):
-            torch.ops.operator_benchmark._consume(self.forward_impl_eager())
-
     def module_name(self):
         """this is used to label the operator being benchmarked"""
         if self.user_given_name:
@@ -136,32 +117,18 @@
         self.framework = "PyTorch"
         self.time_series = []
         self._jit_forward_graph = None
-        self._compile_forward_graph = None

     def _generate_jit_forward_graph(self):
         """generate a graph for the forward function via scripting"""
         scripted_op_bench = torch.jit.script(self.op_bench)
         return scripted_op_bench.forward_consume

-    def _generate_compile_forward_graph(self):
-        """generate a compiled graph for the forward function via torch.compile"""
-        compiled_forward_consume = torch.compile(
-            self.op_bench.forward_consume_eager, backend="inductor"
-        )
-        return compiled_forward_consume
-
     def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
         """Run the forward path of an op with JIT mode"""
         if self._jit_forward_graph is None:
             self._jit_forward_graph = self._generate_jit_forward_graph()
         self._jit_forward_graph(num_runs)

-    def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
-        """Run the forward path of an op with compile mode"""
-        if self._compile_forward_graph is None:
-            self._compile_forward_graph = self._generate_compile_forward_graph()
-        self._compile_forward_graph(num_runs)
-
     def _print_per_iter(self):
         # print last 50 values
         length = min(len(self.time_series), 50)
@@ -183,14 +150,14 @@
         if print_per_iter:
             for _ in range(num_runs):
                 start_time = time.time()
-                self.output = self.op_bench.forward_impl_eager()
+                self.output = self.op_bench.forward_impl()
                 if cuda_sync:
                     torch.cuda.synchronize(torch.cuda.current_device())
                 end_time = time.time()
                 self.time_series.append((end_time - start_time) * 1e3)
         else:
             for _ in range(num_runs):
-                self.output = self.op_bench.forward_impl_eager()
+                self.output = self.op_bench.forward_impl()
                 if cuda_sync:
                     torch.cuda.synchronize(torch.cuda.current_device())


@@ -62,13 +62,6 @@ def parse_args():
         default=None,
     )

-    parser.add_argument(
-        "--benchmark-name",
-        "--benchmark_name",
-        help="Name of the benchmark to store results to",
-        default="PyTorch operator benchmark",
-    )
-
     parser.add_argument(
         "--list-tests",
         "--list_tests",
@@ -142,16 +135,6 @@
         help="Run operators with PyTorch JIT mode",
     )

-    parser.add_argument(
-        "--use-compile",
-        "--use_compile",
-        type=benchmark_utils.str2bool,
-        nargs="?",
-        const=True,
-        default=False,
-        help="Run operators with PyTorch Compile mode",
-    )
-
     parser.add_argument(
         "--forward-only",
         "--forward_only",
@@ -179,7 +162,7 @@
         "--output-json-for-dashboard",
         "--output_json_for_dashboard",
         help="Save results in JSON format for display on the OSS dashboard",
-        default="benchmark-results.json",
+        default="False",
     )

     args, _ = parser.parse_known_args()


@@ -1,5 +1,5 @@
 Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
-PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459
+PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449
@@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547
-PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375
+PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664
-PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525
+PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728
@@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295
 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189
@@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667
 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333
 PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667

@@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
 # for targets in subfolders
 ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"

-C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
+C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"

 # a dictionary maps third party library name to fbsource and oss target
 THIRD_PARTY_LIBS = {


@@ -638,13 +638,10 @@
     "torch/nativert/kernels/KernelHandlerRegistry.cpp",
     "torch/nativert/kernels/TritonKernel.cpp",
     "torch/nativert/executor/triton/CpuTritonKernelManager.cpp",
-    "torch/nativert/executor/AOTInductorDelegateExecutor.cpp",
-    "torch/nativert/kernels/ETCallDelegateKernel.cpp",
 ]

 libtorch_nativert_cuda_sources = [
     "torch/nativert/executor/triton/CudaTritonKernelManager.cpp",
-    "torch/nativert/executor/AOTInductorModelContainerCudaShim.cpp",
 ]

 torch_mobile_tracer_sources = [


@@ -504,16 +504,7 @@ struct ExpandableSegment {
   SegmentRange share(SegmentRange range, std::ostream& buf) {
     auto begin = segmentLeft(range.ptr);
     auto end = segmentRight(range.ptr + range.size);
-    // header.pid needs to be padded with 4 bytes and initialized with
-    // 0 values to avoid random padding of different bytes each time,
-    // thereby ensuring that the handle can be correctly matched in
-    // ipcMemHandle_to_devptr.
-    ShareHeader header{};
-    header.pid = getpid();
-    header.segment_size = segment_size_;
-    header.num_handles = end - begin;
+    ShareHeader header{getpid(), segment_size_, end - begin};
     buf.write((const char*)&header, sizeof(ShareHeader));
     for (auto i : c10::irange(begin, end)) {
       // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
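
The comment being removed is about struct padding: pid is narrower than the fields that follow it, so ShareHeader contains padding bytes, and buf.write serializes the object raw, padding included. If the padding holds garbage, two logically identical headers serialize to different byte strings and a byte-wise handle lookup (as in ipcMemHandle_to_devptr) can miss. A self-contained sketch of the failure mode and the zero-init fix (field types assumed for illustration):

#include <cstring>
#include <iostream>

struct ShareHeader {
  int pid;              // 4 bytes, then 4 padding bytes before the next field
  size_t segment_size;
  size_t num_handles;
};

int main() {
  // Value-initialization zeroes the whole object in practice, padding
  // included, so identical logical content gives identical raw bytes.
  ShareHeader a{};
  a.pid = 42; a.segment_size = 1 << 20; a.num_handles = 8;
  ShareHeader b{};
  b.pid = 42; b.segment_size = 1 << 20; b.num_handles = 8;
  std::cout << (std::memcmp(&a, &b, sizeof(ShareHeader)) == 0) << "\n";  // 1
  return 0;
}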


@@ -78,7 +78,7 @@ int device_count_impl(bool fail_if_no_driver) {
           "would like to use GPUs, turn off ASAN.");
       break;
 #endif // C10_ASAN_ENABLED
-#if defined(_WIN32) && CUDA_VERSION >= 13000
+#if _WIN32 && CUDA_VERSION >= 13000
     // Workaround for CUDA-13.0 error handling on Windows, see
     // https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585
     case cudaErrorNotSupported:
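
A side note on the two spellings: inside #if, an identifier that is not defined substitutes as 0, so #if _WIN32 behaves the same as #if defined(_WIN32) where the macro is absent, but it trips -Wundef and reads as a value test rather than a feature test. A minimal, compilable illustration:

#include <cstdio>

// Under -Wundef the first test warns when SOME_FEATURE is not defined
// (it silently evaluates to 0); the defined() form is warning-clean and
// states the intent explicitly.
#if SOME_FEATURE
#define MSG "feature on (value test)"
#elif defined(SOME_FEATURE)
#define MSG "feature defined but zero"
#else
#define MSG "feature off"
#endif

int main() {
  std::printf("%s\n", MSG);
  return 0;
}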


@@ -18,9 +18,9 @@ cuda_supported_platforms = [

 def define_c10_ovrsource(name, is_mobile):
     if is_mobile:
-        pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
+        pp_flags = ["-DC10_MOBILE=1"]
     else:
-        pp_flags = ["-DC10_USE_GLOG"]
+        pp_flags = []

     oxx_static_library(
         name = name,


@@ -196,25 +196,20 @@ TTarget* assign_ptr_(TTarget* rhs) {
   }
 }

-// The only requirement for refcount increment is that it happens-before
-// decrement, so no additional memory ordering is needed.
+// Increment needs to be acquire-release to make use_count() and
+// unique() reliable.
 inline uint32_t atomic_refcount_increment(std::atomic<uint32_t>& refcount) {
-  return refcount.fetch_add(1, std::memory_order_relaxed) + 1;
+  return refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
 }

-// weak_use_count() is only used for testing, so we don't need it to
-// be reliable. Relaxed should be fine.
 inline uint32_t atomic_weakcount_increment(std::atomic<uint32_t>& weakcount) {
   return weakcount.fetch_add(1, std::memory_order_relaxed) + 1;
 }

-// The requirement is that all modifications to the managed object happen-before
-// invocation of the managed object destructor, and that allocation of the
-// managed object storage happens-before deallocation of the storage.
-//
-// To get this ordering, all non-final decrements must synchronize-with the
-// final decrement. So all non-final decrements have to store-release while the
-// final decrement has to load-acquire, either directly or with the help of
-// fences. But it's easiest just to have all decrements be acq-rel. And it turns
-// out, on modern architectures and chips, it's also fastest.
+// Both decrements need to be acquire-release for correctness. See
+// e.g. std::shared_ptr implementation.
 inline uint32_t atomic_refcount_decrement(std::atomic<uint32_t>& refcount) {
   return refcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
 }
@ -337,7 +332,7 @@ class intrusive_ptr final {
intrusive_ptr() noexcept intrusive_ptr() noexcept
: intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {} : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
/* implicit */ intrusive_ptr(std::nullptr_t) noexcept intrusive_ptr(std::nullptr_t) noexcept
: intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {} : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
// This constructor will not increase the ref counter for you. // This constructor will not increase the ref counter for you.
@ -450,14 +445,14 @@ class intrusive_ptr final {
if (target_ == NullType::singleton()) { if (target_ == NullType::singleton()) {
return 0; return 0;
} }
return target_->refcount_.load(std::memory_order_relaxed); return target_->refcount_.load(std::memory_order_acquire);
} }
uint32_t weak_use_count() const noexcept { uint32_t weak_use_count() const noexcept {
if (target_ == NullType::singleton()) { if (target_ == NullType::singleton()) {
return 0; return 0;
} }
return target_->weakcount_.load(std::memory_order_relaxed); return target_->weakcount_.load(std::memory_order_acquire);
} }
bool unique() const noexcept { bool unique() const noexcept {
@ -856,14 +851,14 @@ class weak_intrusive_ptr final {
return 0; return 0;
} }
return target_->refcount_.load( return target_->refcount_.load(
std::memory_order_relaxed); // refcount, not weakcount! std::memory_order_acquire); // refcount, not weakcount!
} }
uint32_t weak_use_count() const noexcept { uint32_t weak_use_count() const noexcept {
if (target_ == NullType::singleton()) { if (target_ == NullType::singleton()) {
return 0; return 0;
} }
return target_->weakcount_.load(std::memory_order_relaxed); return target_->weakcount_.load(std::memory_order_acquire);
} }
bool expired() const noexcept { bool expired() const noexcept {
@ -871,22 +866,18 @@ class weak_intrusive_ptr final {
} }
intrusive_ptr<TTarget, NullType> lock() const noexcept { intrusive_ptr<TTarget, NullType> lock() const noexcept {
if (target_ == NullType::singleton()) { if (expired()) {
return intrusive_ptr<TTarget, NullType>(); return intrusive_ptr<TTarget, NullType>();
} else { } else {
auto refcount = target_->refcount_.load(std::memory_order_relaxed); auto refcount = target_->refcount_.load(std::memory_order_seq_cst);
do { do {
if (refcount == 0) { if (refcount == 0) {
// Object already destructed, no strong references left anymore. // Object already destructed, no strong references left anymore.
// Return nullptr. // Return nullptr.
return intrusive_ptr<TTarget, NullType>(); return intrusive_ptr<TTarget, NullType>();
} }
} while (!target_->refcount_.compare_exchange_weak( } while (
refcount, !target_->refcount_.compare_exchange_weak(refcount, refcount + 1));
refcount + 1,
std::memory_order_acquire,
std::memory_order_relaxed));
return intrusive_ptr<TTarget, NullType>( return intrusive_ptr<TTarget, NullType>(
target_, raw::DontIncreaseRefcount{}); target_, raw::DontIncreaseRefcount{});
} }


@@ -550,13 +550,6 @@ if(USE_CUDA OR USE_ROCM)
   append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS)
 endif()
 
-if(USE_CUDA)
-  append_filelist("libtorch_nativert_cuda_sources" Caffe2_GPU_SRCS)
-endif()
-if(USE_ROCM)
-  append_filelist("libtorch_nativert_cuda_sources" Caffe2_HIP_SRCS)
-endif()
-
 if(USE_CUDA)
   list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
   add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})

@@ -1837,12 +1830,6 @@ if(BUILD_TEST)
         target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::undefined)
       endif()
     endif()
-    if(USE_LSAN AND TARGET Sanitizer::leak)
-      target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::leak)
-    endif()
-    if(USE_TSAN AND TARGET Sanitizer::thread)
-      target_link_libraries(${test_name}_${CPU_CAPABILITY} Sanitizer::thread)
-    endif()
   else()
     add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
     target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)


@@ -108,32 +108,24 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_BUILD_MOBILE)
     enable_ubsan()
   endif()
 
-  if(USE_ASAN OR USE_LSAN OR USE_TSAN)
+  if(USE_ASAN OR USE_TSAN)
     find_package(Sanitizer REQUIRED)
     if(USE_ASAN)
       if(TARGET Sanitizer::address)
         list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::address)
       else()
-        message(WARNING "ASAN not found. Suppress this warning with -DUSE_ASAN=OFF.")
+        message(WARNING "Not ASAN found. Suppress this warning with -DUSE_ASAN=OFF.")
         caffe2_update_option(USE_ASAN OFF)
       endif()
       if(TARGET Sanitizer::undefined)
         list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::undefined)
       endif()
     endif()
-    if(USE_LSAN)
-      if(TARGET Sanitizer::leak)
-        list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::leak)
-      else()
-        message(WARNING "LSAN not found. Suppress this warning with -DUSE_LSAN=OFF.")
-        caffe2_update_option(USE_LSAN OFF)
-      endif()
-    endif()
     if(USE_TSAN)
       if(TARGET Sanitizer::thread)
         list(APPEND Caffe2_DEPENDENCY_LIBS Sanitizer::thread)
       else()
-        message(WARNING "TSAN not found. Suppress this warning with -DUSE_TSAN=OFF.")
+        message(WARNING "Not TSAN found. Suppress this warning with -DUSE_TSAN=OFF.")
         caffe2_update_option(USE_TSAN OFF)
       endif()
     endif()


@ -45,88 +45,13 @@ if(NOT __AOTRITON_INCLUDED)
) )
set(__AOTRITON_BASE_URL "https://github.com/ROCm/aotriton/releases/download/") # @lint-ignore set(__AOTRITON_BASE_URL "https://github.com/ROCm/aotriton/releases/download/") # @lint-ignore
set(__AOTRITON_Z "gz") set(__AOTRITON_Z "gz")
# Set the default __AOTRITON_LIB path
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so")
if(WIN32)
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/aotriton_v2.lib")
endif()
function(aotriton_build_windows_dependencies dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
# Windows-specific dependencies - build these first
if(NOT noimage)
message(FATAL_ERROR "noimage must be ON for Windows builds")
endif()
# Build dlfcn-win32
set(__DLFCN_WIN32_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32")
set(__DLFCN_WIN32_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/dlfcn-win32-install")
ExternalProject_Add(${dlfcn-win32_external}
GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
GIT_TAG v1.4.2
PREFIX ${__DLFCN_WIN32_PREFIX}
INSTALL_DIR ${__DLFCN_WIN32_INSTALL_DIR}
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${__DLFCN_WIN32_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=cl
-DCMAKE_CXX_COMPILER=cl
-DBUILD_SHARED_LIBS=ON
-DBUILD_TESTS=OFF
BUILD_BYPRODUCTS
"${__DLFCN_WIN32_INSTALL_DIR}/lib/dl.lib"
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
)
ExternalProject_Add_Step(${dlfcn-win32_external} copy_to_aotriton
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${__DLFCN_WIN32_INSTALL_DIR}/bin/dl.dll"
"${__AOTRITON_INSTALL_DIR}/lib/"
DEPENDEES install
)
set(${dlfcn-win32_DIR} "${__DLFCN_WIN32_INSTALL_DIR}/share/dlfcn-win32" CACHE PATH "Path to dlfcn-win32 CMake config" FORCE)
# Build xz/liblzma
set(__XZ_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xz")
set(__XZ_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/xz-install")
ExternalProject_Add(${xz_external}
GIT_REPOSITORY https://github.com/tukaani-project/xz.git
GIT_TAG v5.8.1
PREFIX ${__XZ_PREFIX}
INSTALL_DIR ${__XZ_INSTALL_DIR}
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${__XZ_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=Release
-DBUILD_SHARED_LIBS=ON
-DENABLE_NLS=OFF
-DXZ_TOOL_LZMAINFO=OFF
-DXZ_TOOL_XZ=OFF
-DXZ_TOOL_XZDEC=OFF
-DXZ_TOOL_LZMADEC=OFF
BUILD_BYPRODUCTS
"${__XZ_INSTALL_DIR}/lib/lzma.lib"
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
)
ExternalProject_Add_Step(${xz_external} copy_to_aotriton
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${__XZ_INSTALL_DIR}/bin/liblzma.dll"
"${__AOTRITON_INSTALL_DIR}/lib/"
DEPENDEES install
)
set(${liblzma_DIR} "${__XZ_INSTALL_DIR}/lib/cmake/liblzma" CACHE PATH "Path to xz/liblzma CMake config" FORCE)
endfunction()
function(aotriton_build_from_source noimage project) function(aotriton_build_from_source noimage project)
if(noimage) if(noimage)
SET(RECURSIVE "OFF") SET(RECURSIVE "OFF")
else() else()
SET(RECURSIVE "ON") SET(RECURSIVE "ON")
endif() endif()
if(WIN32)
message(STATUS "Building AOTriton Windows dependencies")
aotriton_build_windows_dependencies(dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
endif()
message(STATUS "PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}") message(STATUS "PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}")
ExternalProject_Add(${project} ExternalProject_Add(${project}
GIT_REPOSITORY https://github.com/ROCm/aotriton.git GIT_REPOSITORY https://github.com/ROCm/aotriton.git
GIT_SUBMODULES_RECURSE ${RECURSIVE} GIT_SUBMODULES_RECURSE ${RECURSIVE}
@ -140,19 +65,12 @@ if(NOT __AOTRITON_INCLUDED)
-DAOTRITON_GPU_BUILD_TIMEOUT=0 -DAOTRITON_GPU_BUILD_TIMEOUT=0
-DAOTRITON_NO_PYTHON=ON -DAOTRITON_NO_PYTHON=ON
-DAOTRITON_NOIMAGE_MODE=${noimage} -DAOTRITON_NOIMAGE_MODE=${noimage}
-DHIP_PLATFORM=amd BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
$<$<BOOL:${WIN32}>:-Ddlfcn-win32_DIR=${dlfcn-win32_DIR}>
$<$<BOOL:${WIN32}>:-Dliblzma_DIR=${liblzma_DIR}>
BUILD_BYPRODUCTS
"${__AOTRITON_LIB}"
USES_TERMINAL_DOWNLOAD TRUE USES_TERMINAL_DOWNLOAD TRUE
USES_TERMINAL_CONFIGURE TRUE USES_TERMINAL_CONFIGURE TRUE
USES_TERMINAL_BUILD TRUE USES_TERMINAL_BUILD TRUE
USES_TERMINAL_INSTALL TRUE USES_TERMINAL_INSTALL TRUE
) )
if(WIN32)
add_dependencies(${project} dlfcn-win32_external xz_external)
endif()
endfunction() endfunction()
set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
@ -177,7 +95,7 @@ if(NOT __AOTRITON_INCLUDED)
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_runtime" "${CMAKE_CURRENT_BINARY_DIR}/aotriton_runtime"
"${__AOTRITON_INSTALL_DIR}" "${__AOTRITON_INSTALL_DIR}"
BUILD_BYPRODUCTS "${__AOTRITON_LIB}" BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
) )
message(STATUS "Using AOTriton Runtime from pre-compiled binary ${__AOTRITON_URL}.\ message(STATUS "Using AOTriton Runtime from pre-compiled binary ${__AOTRITON_URL}.\
Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.") Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.")
@ -193,35 +111,14 @@ if(NOT __AOTRITON_INCLUDED)
string(CONCAT __AOTRITON_URL string(CONCAT __AOTRITON_URL
"${__AOTRITON_BASE_URL}" "${__AOTRITON_BASE_URL}"
"${__AOTRITON_VER}/${__AOTRITON_FILE}") "${__AOTRITON_VER}/${__AOTRITON_FILE}")
# Set up directories
set(__AOTRITON_DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_download-${image})
set(__AOTRITON_EXTRACT_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image})
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR})
set(__DOWNLOAD_NO_EXTRACT "")
set(__BUILD_COMMANDS "")
# On Windows, we need custom tar extraction with UTF-8 support
if(WIN32)
set(__DOWNLOAD_NO_EXTRACT "DOWNLOAD_NO_EXTRACT;TRUE")
set(__BUILD_COMMANDS
COMMAND ${CMAKE_COMMAND} -E make_directory "${__AOTRITON_EXTRACT_DIR}"
COMMAND tar --options hdrcharset=UTF-8 -xf "${__AOTRITON_DOWNLOAD_DIR}/${__AOTRITON_FILE}" -C "${__AOTRITON_EXTRACT_DIR}"
)
set(__AOTRITON_INSTALL_SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}/aotriton)
endif()
ExternalProject_Add(${project} ExternalProject_Add(${project}
URL "${__AOTRITON_URL}" URL "${__AOTRITON_URL}"
URL_HASH SHA256=${__AOTRITON_SHA256} URL_HASH SHA256=${__AOTRITON_SHA256}
DOWNLOAD_DIR ${__AOTRITON_DOWNLOAD_DIR} SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}
${__DOWNLOAD_NO_EXTRACT}
SOURCE_DIR ${__AOTRITON_EXTRACT_DIR}
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
${__BUILD_COMMANDS}
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
"${__AOTRITON_INSTALL_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/aotriton_image-${image}"
"${__AOTRITON_INSTALL_DIR}" "${__AOTRITON_INSTALL_DIR}"
BUILD_BYPRODUCTS BUILD_BYPRODUCTS
"${__AOTRITON_INSTALL_DIR}/lib/aotriton.images/${image}/__signature__" "${__AOTRITON_INSTALL_DIR}/lib/aotriton.images/${image}/__signature__"
@ -267,7 +164,7 @@ if(NOT __AOTRITON_INCLUDED)
endforeach() endforeach()
endforeach() endforeach()
endif() endif()
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_LIB}) target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so)
target_include_directories(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include) target_include_directories(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
set(AOTRITON_FOUND TRUE) set(AOTRITON_FOUND TRUE)
endif() # __AOTRITON_INCLUDED endif() # __AOTRITON_INCLUDED


@@ -66,7 +66,6 @@ function(caffe2_print_configuration_summary)
     message(STATUS " LAPACK : ${LAPACK_INFO}")
   endif()
   message(STATUS " USE_ASAN : ${USE_ASAN}")
-  message(STATUS " USE_LSAN : ${USE_LSAN}")
   message(STATUS " USE_TSAN : ${USE_TSAN}")
   message(STATUS " USE_CPP_CODE_COVERAGE : ${USE_CPP_CODE_COVERAGE}")
   message(STATUS " USE_CUDA : ${USE_CUDA}")


@@ -1,80 +0,0 @@
-# Autoload Mechanism
-
-The **Autoload** mechanism in PyTorch simplifies the integration of a custom backend by enabling automatic discovery and initialization at runtime. This eliminates the need for explicit imports or manual initialization, allowing developers to seamlessly integrate a new accelerator or backend into PyTorch.
-
-## Background
-
-The **Autoload Device Extension** proposal in PyTorch is centered on improving support for various hardware backend devices, especially those implemented as out-of-the-tree extensions (not part of PyTorch's main codebase). Currently, users must manually import or load these device-specific extensions to use them, which complicates the experience and increases cognitive overhead.
-
-In contrast, in-tree devices (devices officially supported within PyTorch) are seamlessly integrated: users don't need extra imports or steps. The goal of autoloading is to make out-of-the-tree devices just as easy to use, so users can follow the standard PyTorch device programming model without explicit loading or code changes. This allows existing PyTorch applications to run on new devices without any modification, making hardware support more user-friendly and reducing barriers to adoption.
-
-For more information about the background of **Autoload**, please refer to its [RFC](https://github.com/pytorch/pytorch/issues/122468).
-
-## Design
-
-The core idea of **Autoload** is to use Python's plugin discovery (entry points) so PyTorch automatically loads out-of-tree device extensions when torch is imported, with no explicit user import needed.
-
-For more details on the design of **Autoload**, please refer to [**How it works**](https://docs.pytorch.org/tutorials/unstable/python_extension_autoload.html#how-it-works).
-
-## Implementation
-
-This tutorial takes **OpenReg** as a new out-of-the-tree device and guides you through the steps to enable and use the **Autoload** mechanism.
-
-### Entry Point Setup
-
-To enable **Autoload**, register the `_autoload` function as an entry point in the [setup.py](https://github.com/pytorch/pytorch/blob/main/test/cpp_extensions/open_registration_extension/torch_openreg/setup.py) file.
-
-::::{tab-set}
-:::{tab-item} Python
-
-```{eval-rst}
-.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/setup.py
-   :language: python
-   :start-after: LITERALINCLUDE START: SETUP
-   :end-before: LITERALINCLUDE END: SETUP
-   :linenos:
-   :emphasize-lines: 9-13
-```
-
-:::
-::::
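For readers without the literalinclude target at hand, here is a minimal sketch of the registration the deleted section describes. It assumes the `torch.backends` entry-point group documented in the autoload tutorial and the OpenReg package name; both are illustrative, not verbatim from the removed file.

```python
# Sketch of an out-of-tree backend's setup.py (illustrative only).
# PyTorch scans the "torch.backends" entry-point group during
# `import torch` and calls the hook each entry points at.
from setuptools import setup

setup(
    name="torch_openreg",
    version="0.1",
    packages=["torch_openreg"],
    entry_points={
        "torch.backends": [
            "torch_openreg = torch_openreg:_autoload",
        ],
    },
)
```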
-### Backend Setup
-
-Define the initialization hook `_autoload` for backend initialization in [torch_openreg](https://github.com/pytorch/pytorch/blob/main/test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/__init__.py). This hook will be automatically invoked by PyTorch during startup.
-
-::::{tab-set-code}
-
-```{eval-rst}
-.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/__init__.py
-   :language: python
-   :start-after: LITERALINCLUDE START: AUTOLOAD
-   :end-before: LITERALINCLUDE END: AUTOLOAD
-   :linenos:
-```
-
-::::
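The hook itself can be tiny. A minimal sketch, assuming the OpenReg naming; the body is illustrative rather than the literalinclude'd source:

```python
# torch_openreg/__init__.py (illustrative sketch, not the real file).
# PyTorch invokes _autoload() automatically at the end of `import torch`,
# so the hook only needs to run the backend's one-time initialization.

def _autoload():
    # A real backend would load its compiled extension and register its
    # device name here, for example:
    #   import torch_openreg._C  # hypothetical compiled module
    #   torch.utils.rename_privateuse1_backend("openreg")
    pass
```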
-## Result
-
-After setting up the entry point and backend, build and install your backend. Now, we can use the new accelerator without explicitly importing it.
-
-```{eval-rst}
-.. grid:: 2
-
-   .. grid-item-card:: :octicon:`terminal;1em;` Without Autoload
-
-      >>> import torch
-      >>> import torch_openreg
-      >>> torch.tensor(1, device="openreg")
-      tensor(1, device='openreg:0')
-
-   .. grid-item-card:: :octicon:`terminal;1em;` With Autoload
-
-      >>> import torch  # Automatically import torch_openreg
-      >>>
-      >>> torch.tensor(1, device="openreg")
-      tensor(1, device='openreg:0')
-```


@@ -2,10 +2,6 @@
 
 Since PyTorch 2.1, the community has made significant progress in streamlining the process of integrating new accelerators into the PyTorch ecosystem. These improvements include, but are not limited to: refinements to the `PrivateUse1` Dispatch Key, the introduction and enhancement of core subsystem extension mechanisms, and the device-agnostic refactoring of key modules (e.g., `torch.accelerator`, `memory management`). Taken together, these advances provide the foundation for a **robust**, **flexible**, and **developer-friendly** pathway for accelerator integration.
 
-```{note}
-This guide is a work in progress. For more details, please refer to the [roadmap](https://github.com/pytorch/pytorch/issues/158917).
-```
-
 ## Why Does This Matter?
 
 This integration pathway offers several major benefits:

@@ -14,21 +10,9 @@ This integration pathway offers several major benefits:
 * **Future-proofing**: This is the default integration path for all future PyTorch features, meaning that as new modules and features are added, they will automatically support scaling to new accelerators if this path is followed.
 * **Autonomy**: Vendors maintain full control over their accelerator integration timelines, enabling fast iteration cycles and reducing reliance on upstream coordination.
 
-## Target Audience
-
-This document is intended for:
-
-* **Accelerator Developers** who are integrating accelerator into PyTorch;
-* **Advanced PyTorch Users** interested in the inner workings of key modules;
-
 ## About This Document
 
-This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerator in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation, and this series is structured around four major axes:
-
-* **Runtime**: Covers core components such as Event, Stream, Memory, Generator, Guard, Hooks, as well as the supporting C++ scaffolding.
-* **Operators**: Involve the minimum necessary set of operators, forward and backward operators, fallback operators, fallthroughs, STUBs, etc. in both C++ and Python implementations.
-* **Python Frontend**: Focuses on Python bindings for modules and device-agnostic APIs.
-* **High-level Modules**: Explores integration with major subsystems such as `AMP`, `Compiler`, `ONNX`, and `Distributed` and so on.
+This guide aims to provide a **comprehensive overview of the modern integration pathway** for new accelerator in PyTorch. It walks through the full integration surface, from low-level device primitives to higher-level domain modules like compilation and quantization. The structure follows a **modular and scenario-driven approach**, where each topic is paired with corresponding code examples from [torch_openreg][OpenReg URL], an official reference implementation.
 
 The goal is to help developers:

@@ -36,13 +20,32 @@ The goal is to help developers:
 * Follow best practices to quickly launch new accelerators;
 * Avoid common pitfalls through clear, targeted examples.
 
-Next, we will delve into each chapter of this guide. Each chapter focuses on a key aspect of integration, providing detailed explanations and illustrative examples. Since some chapters build upon previous ones, readers are encouraged to follow the sequence to achieve a more coherent understanding.
+## Target Audience
+
+This document is intended for:
+
+* **Accelerator Developers** who are integrating accelerator into PyTorch;
+* **Advanced PyTorch Users** interested in the inner workings of key modules;
+
+## Quick Overview
+
+This document outlines the key processes and practical scenarios involved in integrating new devices into PyTorch, providing developers with a comprehensive and detailed guide for bringing up new backends. The discussion is structured around four major axes:
+
+* **Runtime**: Covers core components such as Event, Stream, Memory, Generator, Guard, Hooks, as well as the supporting C++ scaffolding.
+* **Operators**: Involve the minimum necessary set of operators, forward and backward operators, fallback operators, fallthroughs, STUBs, etc. in both C++ and Python implementations.
+* **Python Frontend**: Focuses on Python bindings for modules and device-agnostic APIs.
+* **High-level Modules**: Explores integration with major subsystems such as `AMP`, `Compiler`, `ONNX`, and `Distributed` and so on.
+
+Next, we will officially embark on the integration journey for a new PyTorch accelerator.
+
+```{note}
+This guide is a work in progress. For more details, please refer to the [roadmap](https://github.com/pytorch/pytorch/issues/158917).
+```
 
 ```{toctree}
 :glob:
 :maxdepth: 1
 
-autoload
 operators
 ```


@@ -169,7 +169,7 @@ Of course, global fallbacks can also be combined with a blacklist of fallbacks,
 
 ### PyTorch STUB
 
-PyTorch also provides another approach for built-in operators: `STUB`. This method is essentially based on the {ref}`Step 1<step-one>` approach, but adds secondary scheduling capabilities (for example, scheduling based on CPU characteristics).
+PyTorch also provides another approach for built-in operators: `STUB`. This method is essentially based on the `Step 1<step-one>` approach, but adds secondary scheduling capabilities (for example, scheduling based on CPU characteristics).
 
 ```{note}
 The `STUB` method currently supports only a limited set of operators. For new accelerator devices, the advantage of the `STUB` method is that it significantly reduces the cost of development at the cost of a small performance overhead. PyTorch currently does not clearly list the set of operators that can be registered through `STUB`. Due to the large number of related operators, only the query method for the supported operator list is provided here.
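As a pointer for the "query method" the note refers to, one way to inspect what an operator has registered is PyTorch's private dispatcher dump helper; this is an internal API whose exact output format may change between releases, so treat the snippet as an assumption-laden sketch rather than the documented query method.

```python
# Illustrative: dump the kernels registered for one operator via a
# private debugging helper (internal API, subject to change).
import torch

# Prints each dispatch key (CPU, CUDA, PrivateUse1, ...) alongside the
# kernel registered for it, which shows how an op is currently dispatched.
print(torch._C._dispatch_dump("aten::add"))
```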


@@ -1,6 +1,6 @@
 # Working with Graph Breaks
 
-As you might remember from [Dynamo Core Concepts](programming_model.dynamo_core_concepts) that Dynamo performs a graph break when
+As you might remember from (Dynamo Core Concepts)[programming_model.dynamo_core_concepts] that Dynamo performs a graph break when
 it encounters code that can't be traced. In the default `torch.compile` settings, Dynamo compiles the FX graph
 that has been determined up to that point, executes the unsupported code in regular Python, and then resumes tracing.
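Since the page discusses graph breaks only in prose here, a self-contained sketch may help; `torch._dynamo.explain` is the stock way to see where breaks occur.

```python
# A print() call is ordinary untraceable Python, so Dynamo compiles the
# graph accumulated so far, runs print() eagerly, then resumes tracing.
import torch


def fn(x):
    x = x * 2
    print("side effect")  # triggers a graph break
    return x + 1


# explain() reports how many graph breaks occurred and why.
explanation = torch._dynamo.explain(fn)(torch.randn(3))
print(explanation.graph_break_count)  # expect 1
```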


@@ -5,7 +5,7 @@
 
 # Tensor Parallelism - torch.distributed.tensor.parallel
 
 Tensor Parallelism(TP) is built on top of the PyTorch DistributedTensor
-([DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md))
+(DTensor)[https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md]
 and provides different parallelism styles: Colwise, Rowwise, and Sequence Parallelism.
 
 :::{warning}

@@ -89,4 +89,4 @@ Parallelized cross-entropy loss computation (loss parallelism), is supported via
 ```
 
 :::{warning}
 The loss_parallel API is experimental and subject to change.
 :::
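For context on the Colwise and Rowwise styles named above, a minimal sketch, assuming a 2-rank process group already initialized elsewhere (e.g. by torchrun); the plan keys "0" and "2" refer to the Sequential's child-module names.

```python
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# One 1-D mesh over 2 GPUs; requires an initialized distributed runtime.
mesh = init_device_mesh("cuda", (2,))
mlp = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))

# Colwise-shard the first Linear and rowwise-shard the second, so the
# intermediate activation stays sharded and only the output is reduced.
parallelize_module(mlp, mesh, {"0": ColwiseParallel(), "2": RowwiseParallel()})
```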


@@ -1,17 +0,0 @@
-torch.nativert
-==============
-
-.. automodule:: torch.nativert
-.. currentmodule:: torch.nativert
-
-.. py:module:: torch.nativert
-   :noindex:
-
-torch.nativert.backends
------------------------
-
-.. automodule:: torch.nativert.backends
-.. currentmodule:: torch.nativert.backends
-
-.. py:module:: torch.nativert.backends
-   :noindex:

@@ -102,7 +102,6 @@ also be interested in reading our [development wiki](https://github.com/pytorch/
 onnx_export
 onnx_ops
 onnx_verification
-onnx_testing
 ```
 
 ### Deprecated APIs


@@ -1,9 +0,0 @@
-# torch.onnx.testing
-
-```{eval-rst}
-.. automodule:: torch.onnx.testing
-```
-
-```{eval-rst}
-.. autofunction:: torch.onnx.testing.assert_onnx_program
-```


@@ -56,7 +56,6 @@ torch.monitor <monitor>
 torch.signal <signal>
 torch.special <special>
 torch.overrides
-torch.nativert <nativert>
 torch.package <package>
 profiler
 nn.init


@@ -72,4 +72,4 @@ aot_function(f, ts_compiler, ts_compiler)(torch.randn(3, requires_grad=True))
 * Min-cut [recomputation](https://dev-discuss.pytorch.org/t/min-cut-optimal-recomputation-i-e-activation-checkpointing-with-aotautograd/467) with AOT Autograd.
 
 ## Tutorials
-You can use this [tutorial](https://pytorch.org/functorch/nightly/tutorials/aot_autograd_optimizations.html) to play with AOT Autograd.
+You can use this [tutorial](https://pytorch.org/functorch/nightly/notebooks/aot_autograd_optimizations.html) to play with AOT Autograd.


@@ -50,7 +50,7 @@ extensions = [
     "myst_nb",
 ]
 
-# sys.path.insert(0, os.path.abspath('./tutorials'))
+# sys.path.insert(0, os.path.abspath('./notebooks'))
 
 # build the templated autosummary files
 # autosummary_generate = True

@@ -131,7 +131,7 @@ language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ["tutorials/colab**", "tutorials/_src/**"]
+exclude_patterns = ["notebooks/colab**", "notebooks/_src/**"]
 
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = "sphinx"


@@ -55,7 +55,7 @@ Check out our `whirlwind tour <whirlwind_tour>`_ or some of our tutorials mentio
    :caption: functorch: Getting Started
 
    install
-   tutorials/whirlwind_tour.ipynb
+   notebooks/whirlwind_tour.ipynb
    ux_limitations
 
 .. toctree::

@@ -70,9 +70,9 @@ Check out our `whirlwind tour <whirlwind_tour>`_ or some of our tutorials mentio
    :maxdepth: 1
    :caption: functorch Tutorials
 
-   tutorials/jacobians_hessians.ipynb
-   tutorials/ensembling.ipynb
-   tutorials/per_sample_grads.ipynb
-   tutorials/neural_tangent_kernels.ipynb
-   tutorials/aot_autograd_optimizations.ipynb
-   tutorials/minifier.ipynb
+   notebooks/jacobians_hessians.ipynb
+   notebooks/ensembling.ipynb
+   notebooks/per_sample_grads.ipynb
+   notebooks/neural_tangent_kernels.ipynb
+   notebooks/aot_autograd_optimizations.ipynb
+   notebooks/minifier.ipynb

Some files were not shown because too many files have changed in this diff.