Update on "[Inductor XPU GEMM] Step 6/N: Refactor CUDACodeCache."

This PR is part of #160175. It extracts the CUDA-independent functionality from `CUDACodeCache` into a new `CUTLASSCodeCache` base class, which `CUDACodeCache` inherits and extends with CUDA-specific logic. This design lets the XPU backend reuse `CUTLASSCodeCache` as well. In addition, the CUDA compilation logic has been moved into `torch/_inductor/codegen/cuda/compile_utils.py`, making `codecache.py` cleaner.
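For context, below is a minimal sketch of the class split described above. Only the `CUTLASSCodeCache` and `CUDACodeCache` names come from this PR; the method names, signatures, and bodies are illustrative assumptions, not the actual `torch._inductor` API.

```python
# Sketch only: assumed method names, not the real torch._inductor interfaces.
import hashlib
import tempfile
from pathlib import Path


class CUTLASSCodeCache:
    """CUDA-independent CUTLASS source caching, reusable by CUDA and XPU."""

    _cache: dict[str, Path] = {}

    @classmethod
    def write(cls, source_code: str, ext: str) -> Path:
        # Content-addressed write: identical sources map to the same file.
        key = hashlib.sha256(source_code.encode()).hexdigest()
        if key not in cls._cache:
            path = Path(tempfile.gettempdir()) / f"{key}.{ext}"
            path.write_text(source_code)
            cls._cache[key] = path
        return cls._cache[key]

    @classmethod
    def compile_command(cls, src: Path, dst: Path) -> list[str]:
        # Device-specific; each backend overrides this hook.
        raise NotImplementedError


class CUDACodeCache(CUTLASSCodeCache):
    """Extends the shared cache with CUDA-specific compilation logic."""

    @classmethod
    def compile_command(cls, src: Path, dst: Path) -> list[str]:
        # Illustrative command; in the PR, the real CUDA compile helpers
        # live in torch/_inductor/codegen/cuda/compile_utils.py.
        return ["nvcc", "-shared", "-o", str(dst), str(src)]
```

Under this layout, an XPU backend would subclass `CUTLASSCodeCache` in the same way, overriding only the device-specific compilation step while reusing the shared caching logic.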



cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben

[ghstack-poisoned]
xinan.lin committed on 2025-09-08 01:06:15 +00:00
292 changed files with 8956 additions and 5944 deletions

View File

@ -1 +1 @@
d0e80f39c562c70986fc548fa6e5852ad86e16e7
1b0418a9a454b2b93ab8d71f40e59d2297157fae

View File

@ -147,7 +147,7 @@ function install_128 {
}
function install_130 {
CUDNN_VERSION=9.12.0.46
CUDNN_VERSION=9.13.0.50
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux

View File

@ -74,6 +74,14 @@ RUN bash ./install_cuda.sh 13.0
RUN bash ./install_magma.sh 13.0
RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda
# Install libibverbs for libtorch and copy to CUDA directory
RUN apt-get update -y && \
apt-get install -y libibverbs-dev librdmacm-dev && \
cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \
cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \
cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \
cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/
FROM cpu as rocm
ARG ROCM_VERSION
ARG PYTORCH_ROCM_ARCH

View File

@ -1 +1 @@
3.4.0
3.5.0

View File

@ -76,7 +76,6 @@ def sample_vllm_test_library():
),
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"pytest -v -s entrypoints/llm/test_generate_multiple_loras.py",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},

View File

@ -124,6 +124,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
fi
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling with cudnn and cublas."
DEPS_LIST+=(
"/usr/local/cuda/lib64/libcudnn_adv.so.9"
"/usr/local/cuda/lib64/libcudnn_cnn.so.9"
@ -133,16 +134,11 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
"/usr/local/cuda/lib64/libcudnn.so.9"
"/usr/local/cuda/lib64/libcublas.so.12"
"/usr/local/cuda/lib64/libcublasLt.so.12"
"/usr/local/cuda/lib64/libcusparseLt.so.0"
"/usr/local/cuda/lib64/libcudart.so.12"
"/usr/local/cuda/lib64/libnvrtc.so.12"
"/usr/local/cuda/lib64/libnvrtc-builtins.so"
"/usr/local/cuda/lib64/libcufile.so.0"
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
"/usr/local/cuda/lib64/libnvshmem_host.so.3"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
)
DEPS_SONAME+=(
@ -154,22 +150,56 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
"libcudnn_engines_precompiled.so.9"
"libcudnn_heuristic.so.9"
"libcudnn.so.9"
"libcublas.so.12"
"libcublasLt.so.12"
"libcusparseLt.so.0"
"libcudart.so.12"
"libnvrtc.so.12"
"libnvrtc-builtins.so"
"libnvshmem_host.so.3"
"libcufile.so.0"
"libcufile_rdma.so.1"
"libcupti.so.12"
"libnvperf_host.so"
)
# Add libnvToolsExt only if CUDA version is not 12.9
if [[ $CUDA_VERSION != 12.9* ]]; then
DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
DEPS_SONAME+=("libnvToolsExt.so.1")
if [[ $CUDA_VERSION == 13* ]]; then
DEPS_LIST+=(
"/usr/local/cuda/lib64/libcublas.so.13"
"/usr/local/cuda/lib64/libcublasLt.so.13"
"/usr/local/cuda/lib64/libcudart.so.13"
"/usr/local/cuda/lib64/libnvrtc.so.13"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13"
"/usr/local/cuda/lib64/libibverbs.so.1"
"/usr/local/cuda/lib64/librdmacm.so.1"
"/usr/local/cuda/lib64/libmlx5.so.1"
"/usr/local/cuda/lib64/libnl-3.so.200"
"/usr/local/cuda/lib64/libnl-route-3.so.200")
DEPS_SONAME+=(
"libcublas.so.13"
"libcublasLt.so.13"
"libcudart.so.13"
"libnvrtc.so.13"
"libcupti.so.13"
"libibverbs.so.1"
"librdmacm.so.1"
"libmlx5.so.1"
"libnl-3.so.200"
"libnl-route-3.so.200")
export USE_CUPTI_SO=1
export ATEN_STATIC_CUDA=0
export USE_CUDA_STATIC_LINK=0
export USE_CUFILE=0
else
DEPS_LIST+=(
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
"/usr/local/cuda/lib64/libcublas.so.12"
"/usr/local/cuda/lib64/libcublasLt.so.12"
"/usr/local/cuda/lib64/libcudart.so.12"
"/usr/local/cuda/lib64/libnvrtc.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
DEPS_SONAME+=(
"libnvToolsExt.so.1"
"libcublas.so.12"
"libcublasLt.so.12"
"libcudart.so.12"
"libnvrtc.so.12"
"libcupti.so.12")
fi
else
echo "Using nvidia libs from pypi."

View File

@ -199,7 +199,7 @@ torchbench_setup_macos() {
git checkout "$(cat ../.github/ci_commit_pins/vision.txt)"
git submodule update --init --recursive
python setup.py clean
python setup.py develop
python -m pip install -e . -v --no-build-isolation
popd
pushd torchaudio
@ -208,7 +208,7 @@ torchbench_setup_macos() {
git submodule update --init --recursive
python setup.py clean
#TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp
USE_OPENMP=0 python setup.py develop
USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation
popd
checkout_install_torchbench

View File

@ -124,19 +124,15 @@ popd
export TH_BINARY_BUILD=1
export INSTALL_TEST=0 # dont install test binaries into site-packages
export MACOSX_DEPLOYMENT_TARGET=10.15
export MACOSX_DEPLOYMENT_TARGET=11.0
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
SETUPTOOLS_PINNED_VERSION="==70.1.0"
PYYAML_PINNED_VERSION="==5.3"
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
@ -145,8 +141,6 @@ case $desired_python in
;;
3.14)
echo "Using 3.14t deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="==2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
@ -154,8 +148,6 @@ case $desired_python in
;;
3.13t)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
@ -164,37 +156,23 @@ case $desired_python in
;;
3.13)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="==2.1.0"
;;
3.12)
echo "Using 3.12 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="==2.0.2"
;;
3.11)
echo "Using 3.11 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=5.3"
NUMPY_PINNED_VERSION="==2.0.2"
;;
3.10)
echo "Using 3.10 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=5.3"
NUMPY_PINNED_VERSION="==2.0.2"
;;
3.9)
echo "Using 3.9 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=5.3"
NUMPY_PINNED_VERSION="==2.0.2"
;;
*)
echo "Using default deps"
NUMPY_PINNED_VERSION="==1.11.3"
echo "Unsupported version $desired_python"
exit 1
;;
esac
@ -204,8 +182,6 @@ conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_p
source activate "$tmp_env_name"
PINNED_PACKAGES=(
"setuptools${SETUPTOOLS_PINNED_VERSION}"
"pyyaml${PYYAML_PINNED_VERSION}"
"numpy${NUMPY_PINNED_VERSION}"
)
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
@ -224,7 +200,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
python setup.py bdist_wheel -d "$whl_tmp_dir"
python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
echo "Finished setup.py bdist_wheel at $(date)"

View File

@ -12,7 +12,9 @@ self-hosted-runner:
- linux.9xlarge.ephemeral
- am2.linux.9xlarge.ephemeral
- linux.12xlarge
- linux.12xlarge.memory
- linux.24xlarge
- linux.24xlarge.memory
- linux.24xlarge.ephemeral
- linux.24xlarge.amd
- linux.arm64.2xlarge

View File

@ -4,6 +4,11 @@ name: Build External packages
description: build external packages for PyTorch
inputs:
cuda-version:
description: CUDA version to use
type: string
required: true
default: '12.8.1'
cuda-arch-list:
description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
type: string
@ -44,11 +49,12 @@ runs:
env:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
CUDA_VERSION: ${{ inputs.cuda-version }}
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir }}
TORCH_WHEELS_PATH: ${{ inputs.torch-wheel-dir }}
shell: bash
run: |
set -euo pipefail
@ -69,7 +75,6 @@ runs:
export OUTPUT_DIR
echo "Building external package: $target in directory $OUTPUT_DIR"
python3 -m cli.run build external "$target"
done
END_TIME=$(date +%s)

View File

@ -1 +1 @@
0757bbb660855272f7dd8d31cc84e7c631522805
2e300559e4e123928a22187b8f59a5b56f57ddc8

View File

@ -1 +1 @@
b5ee1e3261d9edf94d76ba8b437ebdef7ac599ea
4172235ab78b09989fb56edaf734dbee283dda3e

View File

@ -12,54 +12,46 @@ ARG BUILD_BASE_IMAGE=torch-nightly-base
# by default, it uses devel-ubuntu22.04 official image.
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
#################### TORCH NIGHTLY BASE IMAGE ####################
#################### TORCH NIGHTLY BASE IMAGE ####################
# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci
From nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG GET_PIP_URL
# Install Python and other dependencies if it does not existed
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
echo "Installing Python ${PYTHON_VERSION}..." && \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
apt-get update -y && \
apt-get install -y ccache software-properties-common git curl sudo && \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done && \
apt-get update -y && \
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
else \
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
fi \
&& python3 --version && python3 -m pip --version
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
if [ "$current_gcc_version" -lt 10 ]; then \
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
apt-get update && \
apt-get install -y gcc-10 g++-10 && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
else \
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
fi && \
gcc --version && g++ --version
if command -v apt-get >/dev/null; then \
if [ "$current_gcc_version" -lt 10 ]; then \
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
apt-get update \
&& apt-get install -y gcc-10 g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \
&& update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
else \
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
fi \
fi \
&& gcc --version && g++ --version
# install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
@ -79,6 +71,21 @@ ENV UV_LINK_MODE=copy
FROM ${BUILD_BASE_IMAGE} AS base
USER root
ARG CUDA_VERSION
ARG PYTHON_VERSION
# TODO (huydhn): Only work with PyTorch manylinux builder
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
# Install some system dependencies and double check python version
RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
@ -118,17 +125,15 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
echo "[INFO] Installing torch wheels to build vllm"; \
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
uv pip install --system "${torch_whl}[opt-einsum]"; \
uv pip install --system "${vision_whl}"; \
uv pip install --system "${audio_whl}"; \
vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \
uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \
elif [ -n "$PINNED_TORCH_VERSION" ]; then \
echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \
uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
else \
echo "[INFO] Installing torch nightly with latest one to build vllm"; \
uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi
# Install numba 0.61.2 for cuda environment
@ -137,12 +142,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install common dependencies from vllm common.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
uv pip install --system -r requirements/common.txt
# Must put before installing xformers, so it can install the correct version of xfomrers.
ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a'
ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list}
ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a'
ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list}
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
@ -153,8 +157,8 @@ RUN pip freeze | grep -E 'ninja'
# Build xformers with cuda and torch nightly/wheel
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31
ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497
# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2
ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
@ -188,11 +192,6 @@ RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
FROM base AS build
ARG TARGETPLATFORM
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
COPY . .
RUN python3 use_existing_torch.py
@ -251,9 +250,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
fi
RUN echo "[DEBUG] Listing current directory:" && \
RUN echo "[INFO] Listing current directory:" && \
ls -al && \
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
#################### WHEEL BUILD IMAGE ####################
@ -263,42 +262,40 @@ RUN echo "[DEBUG] Listing current directory:" && \
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
FROM ${FINAL_BASE_IMAGE} AS vllm-base
USER root
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG GET_PIP_URL
# TODO (huydhn): Only work with PyTorch manylinux builder
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
# prepare for environment starts
WORKDIR /workspace
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies if it does not existed
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
echo "Installing Python ${PYTHON_VERSION}..." && \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
apt-get update -y && \
apt-get install -y ccache software-properties-common git curl sudo && \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done && \
apt-get update -y && \
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
else \
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
fi \
&& python3 --version && python3 -m pip --version
# Install Python and other dependencies
RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Get the torch versions, and whls used in previous stagtes for consistency
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
RUN echo "[DEBUG] Listing current directory before torch install step:" && \
RUN echo "[INFO] Listing current directory before torch install step:" && \
ls -al && \
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
@ -307,7 +304,6 @@ RUN echo "[DEBUG] Listing current directory before torch install step:" && \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
@ -327,15 +323,13 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \
echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
uv pip install --system "${torch_whl}[opt-einsum]"; \
uv pip install --system "${vision_whl}"; \
uv pip install --system "${audio_whl}"; \
uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \
else \
echo "[INFO] Installing torch versions from torch_build_versions.txt"; \
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi
# Install the vllm wheel from previous stage
@ -346,9 +340,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a'
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
@ -416,11 +409,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Workaround for #17068
# pinned commit for v2.2.4
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm"
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'

View File

@ -16,18 +16,16 @@ from typing import Optional
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
CUDA_ARCHES = ["12.6", "12.8", "13.0"]
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",
"12.9": "12.9.1",
"13.0": "13.0.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
"12.6": "9",
"12.8": "9",
"12.9": "9",
"13.0": "9",
}
@ -40,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
CUDA_AARCH64_ARCHES = ["13.0-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -78,28 +76,11 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -240,8 +221,6 @@ def generate_libtorch_matrix(
if os == "linux":
arches += CUDA_ARCHES
arches += ROCM_ARCHES
if "13.0" in arches:
arches.remove("13.0")
elif os == "windows":
arches += CUDA_ARCHES
if libtorch_variants is None:
@ -343,7 +322,7 @@ def generate_wheels_matrix(
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["13.0", "12.9", "12.8", "12.6"]
arch_version in ["13.0", "12.8", "12.6"]
and os == "linux"
or arch_version in CUDA_AARCH64_ARCHES
):
@ -407,6 +386,5 @@ def generate_wheels_matrix(
validate_nccl_dep_consistency("13.0")
validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")

View File

@ -22,7 +22,7 @@ LABEL_CIFLOW_BINARIES = "ciflow/binaries"
LABEL_CIFLOW_PERIODIC = "ciflow/periodic"
LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch"
LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel"
LABEL_CIFLOW_ROCM = "ciflow/rocm-mi300"
LABEL_CIFLOW_ROCM = "ciflow/rocm"
@dataclass
@ -139,6 +139,8 @@ ROCM_SMOKE_WORKFLOWS = [
),
ciflow_config=CIFlowConfig(
labels={
LABEL_CIFLOW_BINARIES,
LABEL_CIFLOW_BINARIES_WHEEL,
LABEL_CIFLOW_ROCM,
},
isolated_workflow=True,

View File

@ -171,7 +171,7 @@ jobs:
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
{%- else %}
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
steps:

View File

@ -68,11 +68,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
- name: Populate binary env
run: |

View File

@ -33,7 +33,7 @@
{%- if is_windows %}
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
{%- endif %}
{%- else %}

.github/workflows/build-vllm-wheel.yml (new file, 248 lines)
View File

@ -0,0 +1,248 @@
name: Build vLLM wheels
on:
push:
branches:
- main
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
workflow_dispatch:
pull_request:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
build-wheel:
if: github.repository_owner == 'pytorch'
strategy:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Get latest PyTorch nightly
shell: bash
run: |
set -eux
# Keep PyTorch nightly wheel here so that we can install it later during
# vLLM build process
mkdir -p "${RUNNER_TEMP}/artifacts/"
container_name=$(docker run \
--tty \
--detach \
-e PLATFORM \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${MANYLINUX_IMAGE}"
)
# Determine python executable for given version (copied from build-triton-wheel)
case $PY_VERS in
3.10)
PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
;;
3.11)
PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python
;;
3.12)
PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python
;;
3.13)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python
;;
3.13t)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
;;
3.14)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
;;
3.14t)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1
;;
esac
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
# I wonder if there is a command to both download and install the wheels
# in one go
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
# Save this for later
echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV"
echo "container_name=${container_name}" >> "$GITHUB_ENV"
- name: Build vLLM wheel
uses: ./.github/actions/build-external-packages
with:
build-targets: vllm
docker-image: ${{ env.MANYLINUX_IMAGE }}
cuda-arch-list: '8.0;8.9;9.0;10.0;12.0'
torch-wheel-dir: ${{ runner.temp }}/artifacts
output-dir: ${{ runner.temp }}/artifacts/externals
- name: Prepare vLLM wheel
shell: bash
run: |
set -eux
# Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh
docker exec -t "${container_name}" bash -c "
set -eux
nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4)
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
pushd \$package
auditwheel repair --plat \$PLATFORM *.whl \
--exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*)
repair_wheel=\$(basename \${repair_wheel})
popd
cp \${package}/wheelhouse/\${repair_wheel} .
version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
if [[ \$package == vllm ]]; then
new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly}
else
major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3)
new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly}
fi
mv -- \$repair_wheel \$new_wheel
rm -rf \$package
done
popd
"
docker exec -t "${container_name}" chown -R 1000:1000 /artifacts
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
device: [ 'cu128', 'cu129' ]
env:
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials(PyTorch account) for main
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
aws-region: us-east-1
- name: Configure AWS credentials(PyTorch account) for RC builds
if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels
aws-region: us-east-1
- name: Download Build Artifacts
uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
# Download all available artifacts
path: ${{ runner.temp }}/artifacts-all
- name: Select Wheel Artifacts
shell: bash
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |
set -ex
if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
fi
- name: Upload binaries
env:
PACKAGE_TYPE: wheel
UPLOAD_SUBFOLDER: ${{ env.BUILD_DEVICE }}
PKG_DIR: ${{ runner.temp }}/artifacts
shell: bash
run: |
set -ex
bash .circleci/scripts/binary_upload.sh

View File

@ -112,52 +112,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -178,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -269,52 +223,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -335,7 +243,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -426,52 +334,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -492,7 +354,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -583,52 +445,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -649,7 +465,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -740,52 +556,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -806,7 +576,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -897,52 +667,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -963,7 +687,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1054,52 +778,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1120,7 +798,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
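The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values in the hunks above are pipe-separated PEP 508 requirement strings, each carrying an environment marker that limits the extra dependency to Linux x86_64 installs. A minimal sketch of how one of these strings can be parsed and its markers checked with the packaging library; the " | " split delimiter and the sample environment dict are illustrative assumptions, not taken from the workflow tooling itself:

    from packaging.requirements import Requirement

    raw = (
        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64'"
        " | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64'"
    )

    # Assumption: entries are joined with " | ", as seen in the workflow values.
    for entry in raw.split(" | "):
        req = Requirement(entry)
        # req.marker is None when the entry has no ";" clause; evaluate() merges the
        # given dict over the detected environment, so this simulates an x86_64 host.
        applies = req.marker is None or req.marker.evaluate(
            {"platform_system": "Linux", "platform_machine": "x86_64"}
        )
        print(req.name, req.specifier, "install" if applies else "skip")

Run against a non-x86_64 environment dict, the same markers evaluate to False and the entries are skipped, which is how these pinned NVIDIA wheels stay scoped to the intended platform.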

View File

@ -248,7 +248,7 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-release-build:
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -257,22 +257,22 @@ jobs:
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-cuda12_9-shared-with-deps-release
build_name: libtorch-cuda13_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -280,38 +280,38 @@ jobs:
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
build_name: libtorch-cuda13_0-shared-with-deps-release
build_environment: linux-binary-libtorch
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
@ -342,7 +342,7 @@ jobs:
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -456,7 +456,7 @@ jobs:
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

View File

@ -241,72 +241,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -325,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -398,7 +332,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -509,7 +443,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -899,72 +833,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -983,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1056,7 +924,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1167,7 +1035,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1557,72 +1425,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1641,7 +1443,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -1714,7 +1516,7 @@ jobs:
needs:
- manywheel-py3_12-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1825,7 +1627,7 @@ jobs:
needs:
- manywheel-py3_12-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2215,72 +2017,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2299,7 +2035,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@ -2372,7 +2108,7 @@ jobs:
needs:
- manywheel-py3_13-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2483,7 +2219,7 @@ jobs:
needs:
- manywheel-py3_13-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2873,72 +2609,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2957,7 +2627,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@ -3030,7 +2700,7 @@ jobs:
needs:
- manywheel-py3_13t-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3141,7 +2811,7 @@ jobs:
needs:
- manywheel-py3_13t-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3531,72 +3201,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -3615,7 +3219,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@ -3688,7 +3292,7 @@ jobs:
needs:
- manywheel-py3_14-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3799,7 +3403,7 @@ jobs:
needs:
- manywheel-py3_14-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -4189,72 +3793,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -4273,7 +3811,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing
@ -4346,7 +3884,7 @@ jobs:
needs:
- manywheel-py3_14t-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -4457,7 +3995,7 @@ jobs:
needs:
- manywheel-py3_14t-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

View File

@ -10,7 +10,9 @@ on:
branches:
- main
tags:
- 'ciflow/rocm-mi300/*'
- 'ciflow/binaries/*'
- 'ciflow/binaries_wheel/*'
- 'ciflow/rocm/*'
workflow_dispatch:
permissions:
@ -67,7 +69,7 @@ jobs:
needs:
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

View File

@ -46,7 +46,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -67,11 +67,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:

View File

@ -63,11 +63,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -208,11 +203,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -353,11 +343,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -498,11 +483,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -643,11 +623,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -788,11 +763,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -933,11 +903,6 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:

View File

@ -64,7 +64,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Populate binary env
shell: cmd
@ -141,7 +141,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Populate binary env
shell: cmd
@ -201,7 +201,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cpu-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -64,7 +64,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Populate binary env
shell: cmd
@ -141,7 +141,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Populate binary env
shell: cmd
@ -201,7 +201,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cpu-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -166,7 +166,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash

View File

@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -173,7 +173,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -283,7 +283,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cpu-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -306,7 +306,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -422,7 +422,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -533,7 +533,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_6-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -556,7 +556,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -672,7 +672,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -783,261 +783,11 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_8-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_9-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_9-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_9-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
@ -1056,7 +806,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -1172,7 +922,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -1283,7 +1033,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -166,7 +166,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash

View File

@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -173,7 +173,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -283,7 +283,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cpu-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -306,7 +306,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -422,7 +422,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -533,7 +533,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_6-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -556,7 +556,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -672,7 +672,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -783,261 +783,11 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_8-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_9-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_9-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_9-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
@ -1056,7 +806,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -1172,7 +922,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -1283,7 +1033,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

File diff suppressed because it is too large

CLAUDE.md (new file, 15 additions)
View File

@ -0,0 +1,15 @@
# Testing
Use our test class and test runner:
```
from torch.testing._internal.common_utils import run_tests, TestCase
class TestFeature(TestCase):
    ...
if __name__ == "__main__":
    run_tests()
```
To test Tensor equality, use assertEqual.
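
For illustration, a minimal sketch of a test following this pattern might look like the snippet below (the `TestFeature` class and `test_add` method are hypothetical examples, not part of this change):
```python
# Illustrative sketch only; class and test names are hypothetical.
import torch
from torch.testing._internal.common_utils import run_tests, TestCase


class TestFeature(TestCase):
    def test_add(self):
        x = torch.ones(2, 3)
        # TestCase.assertEqual compares tensors element-wise.
        self.assertEqual(x + x, torch.full((2, 3), 2.0))


if __name__ == "__main__":
    run_tests()
```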

View File

@ -880,10 +880,21 @@ cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
ON
"USE_ROCM"
OFF)
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a")
message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100")
set(USE_FBGEMM_GENAI ON)
endif()
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(

View File

@ -88,13 +88,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
* When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use
the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
This way you do not need to repeatedly install after modifying Python files (`.py`).
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).
One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
One way to avoid running `python -m pip install -e . -v --no-build-isolation` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following:
```bash
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
@ -116,7 +116,7 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
Next run `python setup.py clean`. After that, you can install in editable mode again.
* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
* If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps:
1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
your CMake works and can compile this simple Hello World program without errors.
2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
@ -129,10 +129,10 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
git clean -xdf
python setup.py clean
git submodule update --init --recursive
python -m pip install -r requirements.txt
python -m pip install --group dev
python -m pip install --no-build-isolation -v -e .
```
4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
4. The main step within `python -m pip install -e . -v --no-build-isolation` is running `make` from the `build` directory. If you want to
experiment with some environment variables, you can pass them into the command:
```bash
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
@ -259,6 +259,7 @@ dependencies as well as the nightly binaries into the repo directory.
support for PyTorch.
* [tools](tools) - Code generation scripts for the PyTorch library.
See [README](tools/README.md) of this directory for more details.
* [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml
* [test](test) - Python unit tests for PyTorch Python frontend.
* [test_torch.py](test/test_torch.py) - Basic tests for PyTorch
functionality.
@ -294,7 +295,7 @@ The following packages should be installed with `pip`:
- `pytest` - recommended to run tests more selectively
Running
```
pip install -r requirements.txt
pip install --group dev
```
will install these dependencies for you.
@ -645,9 +646,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
the program execution timeline. The `--native` command-line option tells
`py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
for C++ code it may be necessary to compile PyTorch in debug mode by prepending
your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`.
Depending on your operating system it may also be necessary to run `py-spy` with
root privileges.
your `python -m pip install -e . -v --no-build-isolation` call to compile
PyTorch with `DEBUG=1`. Depending on your operating system it may also be
necessary to run `py-spy` with root privileges.
`py-spy` can also work in an `htop`-like "live profiling" mode and can be
tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
@ -655,10 +656,10 @@ details.
## Managing multiple build trees
One downside to using `python -m pip install -e .` is that your development
version of PyTorch will be installed globally on your account (e.g., if
you run `import torch` anywhere else, the development version will be
used).
One downside to using `python -m pip install -e . -v --no-build-isolation` is
that your development version of PyTorch will be installed globally on your
account (e.g., if you run `import torch` anywhere else, the development version
will be used).
If you want to manage multiple builds of PyTorch, you can make use of
[venv environments](https://docs.python.org/3/library/venv.html) to maintain
@ -719,7 +720,7 @@ options.
### Code completion and IDE support
When using `python -m pip install -e .`, PyTorch will generate
When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate
a `compile_commands.json` file that can be used by many editors
to provide command completion and error highlighting for PyTorch's
C++ code. You need to `pip install ninja` to generate accurate

View File

@ -243,7 +243,7 @@ git submodule update --init --recursive
```bash
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
pip install -r requirements.txt
pip install --group dev
```
**On Linux**
@ -394,7 +394,7 @@ On macOS
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build
ccmake build # or cmake-gui build
```

View File

@ -252,47 +252,80 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()
# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_CUDA)
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX})
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp
"${FBGEMM_GENAI_SRCS}/common/*.cpp"
)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Combine all source files into a single list
list(APPEND fbgemm_genai_all_sources
${fbgemm_genai_native_cuda_cu}
${fbgemm_genai_native_cuda_cpp}
)
# Now, create the library and provide the sources at the same time
add_library(fbgemm_genai OBJECT ${fbgemm_genai_all_sources})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
set(fbgemm_genai_mx8mx8bf16_grouped
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${fbgemm_genai_mx8mx8bf16_grouped}
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
else()
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
endif()
endif()
endif()
@ -635,12 +668,26 @@ if(USE_CUDA AND NOT USE_ROCM)
add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
# Add FBGEMM_GENAI include directories for torch_ops.h
if(USE_FBGEMM_GENAI)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
endif()
if($ENV{ATEN_STATIC_CUDA})
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}
CUDA::cusparse_static
CUDA::cufft_static_nocallback
)
if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}
CUDA::cusparse_static
CUDA::cufft_static_nocallback)
else()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}
CUDA::cusparse_static
CUDA::cufft_static)
endif()
if(NOT BUILD_LAZY_CUDA_LINALG)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static

View File

@ -308,17 +308,44 @@ void fillVersion<DLManagedTensorVersioned>(
// constructed out of ATen tensor
template <class T>
T* toDLPackImpl(const Tensor& src) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
auto view = src;
// Detect whether there is need to normalize the strides
// Background: gh-83069
//
// However, normalizing strides can come at a high cost
// (it can slow toDLPack conversion down by 3x), so we
// only normalize when needed.
//
// The following code detects whether src follows a
// contiguous pattern. If it does (the common case),
// we do not need to normalize the strides.
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect whether we break the contiguous pattern at a
// size-1 dimension, i.e. there is an opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
}
auto view = src.as_strided(shape, strides, src.storage_offset());
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
atDLMTensor->handle = view;
atDLMTensor->tensor.manager_ctx = atDLMTensor;
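
The detection loop above can be read as the following Python sketch (a reimplementation for illustration only; `needs_stride_normalization` is not a real API):
```python
import torch

def needs_stride_normalization(t: torch.Tensor) -> bool:
    # mirror of the C++ loop: only normalize when a size-1 dimension
    # carries a stride that breaks the contiguous pattern
    expected = 1
    for i in reversed(range(t.dim())):
        if t.stride(i) != expected and t.size(i) == 1:
            return True
        expected *= t.size(i)
    return False

x = torch.randn(1, 2, 3)              # contiguous, strides (6, 3, 1)
print(needs_stride_normalization(x))  # False: fast path, strides exported as-is
```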

View File

@ -0,0 +1,17 @@
#include <ATen/DTensorState.h>
namespace at {
namespace {
thread_local bool kDTensorAllowImplicitReplication = false;
}
bool get_dtensor_allow_implicit_replication() {
return kDTensorAllowImplicitReplication;
}
void set_dtensor_allow_implicit_replication(bool enabled) {
kDTensorAllowImplicitReplication = enabled;
}
} // namespace at

View File

@ -0,0 +1,34 @@
#pragma once
#include <c10/macros/Macros.h>
namespace at {
TORCH_API bool get_dtensor_allow_implicit_replication();
TORCH_API void set_dtensor_allow_implicit_replication(bool enabled);
struct DTensorAllowImplicitReplication {
DTensorAllowImplicitReplication()
: prev_dtensor_allow_implicit_replication_(
get_dtensor_allow_implicit_replication()) {
set_dtensor_allow_implicit_replication(true);
}
DTensorAllowImplicitReplication(const DTensorAllowImplicitReplication&) =
delete;
DTensorAllowImplicitReplication& operator=(
const DTensorAllowImplicitReplication&) = delete;
DTensorAllowImplicitReplication(DTensorAllowImplicitReplication&&) = delete;
DTensorAllowImplicitReplication& operator=(
DTensorAllowImplicitReplication&&) = delete;
~DTensorAllowImplicitReplication() {
set_dtensor_allow_implicit_replication(
prev_dtensor_allow_implicit_replication_);
}
private:
bool prev_dtensor_allow_implicit_replication_;
};
} // namespace at

View File

@ -8,6 +8,7 @@
#include <ATen/record_function.h>
#include <ATen/SavedTensorHooks.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/DTensorState.h>
namespace at {
@ -19,6 +20,7 @@ ThreadLocalState::ThreadLocalState()
torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()),
python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()),
saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()),
dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()),
saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) {
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER)
for(size_t i=0; i<autocast_dtypes_.size(); i++) {
@ -52,6 +54,8 @@ void ThreadLocalState::setThreadLocalState(
c10::impl::PythonDispatcherTLS::set_state(state.python_dispatcher_state_);
at::set_dtensor_allow_implicit_replication(state.dtensor_allow_implicit_replication_);
c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_);
c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);

View File

@ -75,6 +75,8 @@ class TORCH_API ThreadLocalState {
bool functionalization_reapply_views_state_;
bool dtensor_allow_implicit_replication_;
// TLS for arbitrary python objects that is registered via hooks
at::impl::ThreadLocalPythonObjects saved_objects_;

View File

@ -1937,11 +1937,11 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
// hipblaslt supported row-wise scaling before cublas, and did so in its own
// way (via the SCALE_POINTERs), but then migrated to match how cublas does
// it (via the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
@ -1956,8 +1956,12 @@ void scaled_gemm(
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM)
// hipblaslt supported row-wise scaling before cublas, and did so in its own
// way (via the SCALE_POINTERs), but then migrated to match how cublas does
// it (via the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
// rowwise isn't supported using older cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
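
The descriptor handling above services the user-facing row-wise scaled FP8 matmul. A rough sketch of that call from Python, assuming an SM89+/MI300-class GPU and that per-row / per-column float32 scales of shape (M, 1) and (1, N) are accepted:
```python
import torch

x = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn)
w = torch.randn(64, 32, device="cuda").to(torch.float8_e4m3fn).t()  # mat2 must be column-major
scale_x = torch.full((16, 1), 0.1, device="cuda")   # one scale per row of x
scale_w = torch.full((1, 64), 0.1, device="cuda")   # one scale per column of w
out = torch._scaled_mm(x, w, scale_a=scale_x, scale_b=scale_w, out_dtype=torch.bfloat16)
```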

View File

@ -117,6 +117,8 @@ namespace at::cuda {
_(nvrtcGetPTXSize) \
_(nvrtcGetPTX) \
_(cuModuleLoadData) \
_(cuModuleLoad) \
_(cuGetErrorString) \
_(cuModuleGetFunction) \
_(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \
_(nvrtcGetErrorString) \

View File

@ -7,6 +7,7 @@
#include <ATen/functorch/BatchRulesHelper.h>
#include <ATen/functorch/PlumbingHelper.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/DTensorState.h>
#include <utility>
@ -44,8 +45,13 @@ static std::tuple<Tensor, std::optional<int64_t>> embedding_batch_rule(
const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
auto indices_ = moveBatchDimToFront(indices, indices_bdim);
const auto range = getStepTensor(indices, batch_size, num_embeddings);
indices_ = indices_ + range;
{
// getStepTensor returns a regular Tensor. If indices_ is a DTensor
// we want to allow this mixed DTensor-Tensor operation.
at::DTensorAllowImplicitReplication guard;
const auto range = getStepTensor(indices, batch_size, num_embeddings);
indices_ = indices_ + range;
}
auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse);
return std::make_tuple(std::move(result), 0);
}
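
The rule being patched here backs per-sample embedding tables under `vmap`; a plain-tensor sketch of that path (the new guard only matters when `indices` is a DTensor):
```python
import torch
import torch.nn.functional as F
from torch.func import vmap

weight = torch.randn(4, 10, 3)        # one embedding table per sample
idx = torch.randint(0, 10, (4, 5))    # one index batch per sample
out = vmap(F.embedding)(idx, weight)  # exercises embedding_batch_rule
print(out.shape)                      # torch.Size([4, 5, 3])
```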

View File

@ -9,6 +9,7 @@
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Linear.h>
#include <ATen/native/Resize.h>
#include <ATen/native/GroupedMMUtils.h>
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
@ -332,4 +333,23 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b,
return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
}
// TODO(vasiliy, future PR): figure out why we need to declare this function, when
// other functions that live in ATen/native/*.cpp without declarations
// or headers work just fine.
Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype);
Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype) {
_grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
return out;
}
} // namespace at::native
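
With the composite kernel above registered, `torch._grouped_mm` (a private op) gains a CPU path. A minimal sketch of the 2d x 3d form, with shapes chosen to satisfy the stride/alignment checks in GroupedMMUtils.h:
```python
import torch

A = torch.randn(6, 8)                              # (M, K): rows are split into groups
B = torch.randn(3, 8, 4)                           # (G, K, N): one weight per group
offs = torch.tensor([2, 4, 6], dtype=torch.int32)  # cumulative row offsets, one per group

out = torch._grouped_mm(A, B, offs=offs)           # per-group mm fallback on CPU
print(out.shape)                                   # torch.Size([6, 4])
```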

View File

@ -0,0 +1,167 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/CPUFunctions.h>
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/bmm.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/mm.h>
#endif
namespace at::native {
inline bool check_valid_strides_and_return_transposed(const Tensor& mat) {
IntArrayRef tensor_strides = mat.strides();
IntArrayRef tensor_sizes = mat.sizes();
int end_dim = mat.dim() - 1;
int alignment = 16 / mat.element_size();
TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n");
if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max<int64_t>(1, tensor_sizes[end_dim - 1]))) {
TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes");
return true;
} else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max<int64_t>(1, tensor_sizes[end_dim]))) {
TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes");
return false;
} else {
TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes");
}
}
inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a,
const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
c10::ScalarType out_dtype
) {
c10::SmallVector<int64_t, 3> out_size;
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (a_is_2d) {
if (b_is_2d) {
out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)};
} else {
TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match");
out_size = {mat_a.size(0), mat_b.size(-1)};
}
} else {
if (b_is_2d) {
// this case is not actually encountered for MoE gemms
TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match");
out_size = {mat_a.size(1), mat_b.size(1)};
} else { // regular bmm
TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match");
out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)};
}
}
#ifndef USE_ROCM
// For TMA transfers, strides of output tensor have to be either
// 1, or aligned to 16 bytes.
const auto last_dim = out_size.size() - 1;
const auto alignment = 16 / c10::elementSize(out_dtype);
const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment;
std::vector<int64_t> out_stride;
if (a_is_2d != b_is_2d) {
out_stride = {size_padded, 1};
} else {
out_stride = {out_size[1] * size_padded, size_padded, 1};
}
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype));
#else
return at::empty(out_size, mat_a.options().dtype(out_dtype));
#endif
}
inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype) {
TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type());
TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type());
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);
check_valid_strides_and_return_transposed(mat_b);
TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d");
if (offs.has_value()) {
TORCH_CHECK(offs->dim() == 1, "offs has to be 1D");
TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32");
}
TORCH_CHECK(!bias.has_value(), "Bias not supported yet");
}
inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b,
std::optional<c10::ScalarType> out_dtype) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
// TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs
TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype");
return out_dtype_;
}
inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype,
Tensor out) {
LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal";
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (a_is_2d && !b_is_2d) {
// 2d x 3d with offsets
int group_start_idx = 0;
auto offs_cpu = offs.value().cpu();
for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) {
int group_end_idx = offs_cpu[group_idx].item<int>();
auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx);
auto out_slice = out.slice(0, group_start_idx, group_end_idx);
at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]);
group_start_idx = group_end_idx;
}
} else if (!a_is_2d && b_is_2d) {
// 3d x 2d with offsets
int group_start_idx = 0;
auto offs_cpu = offs.value().cpu();
for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) {
int group_end_idx = offs_cpu[group_idx].item<int>();
auto mat_b_slice = mat_b.slice(1, group_start_idx, group_end_idx);
auto out_slice = out.slice(1, group_start_idx, group_end_idx);
at::mm_out(out_slice, mat_a[group_idx], mat_b_slice);
group_start_idx = group_end_idx;
}
} else if (a_is_2d && b_is_2d) {
// 2d x 2d with offsets
int group_start_idx = 0;
auto offs_cpu = offs.value().cpu();
for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) {
int group_end_idx = offs_cpu[group_idx].item<int>();
auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx);
auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx);
auto out_slice = out[group_idx];
at::mm_out(out_slice, mat_a_slice, mat_b_slice);
group_start_idx = group_end_idx;
}
} else {
// 3d x 3d without offsets - regular bmm
at::bmm_out(out, mat_a, mat_b);
}
}
} // namespace at::native
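
The 2d x 2d branch of the fallback is the least obvious case: the offsets split the shared K dimension and the output gains a leading group dimension. A self-contained Python sketch of that semantics, mirroring the loop above:
```python
import torch

A = torch.randn(4, 6)                           # (M, total_K)
B = torch.randn(6, 5)                           # (total_K, N)
offs = torch.tensor([2, 6], dtype=torch.int32)  # K split into groups [0, 2) and [2, 6)

out = torch.empty(offs.numel(), 4, 5)           # (G, M, N), as in create_grouped_gemm_output_tensor
start = 0
for g in range(offs.numel()):
    end = int(offs[g])
    out[g] = A[:, start:end] @ B[start:end, :]  # same slicing as the 2d x 2d fallback branch
    start = end
```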

View File

@ -47,10 +47,14 @@ TORCH_META_FUNC(nll_loss_forward)
TORCH_CHECK(
target.dim() <= 1,
"0D or 1D target tensor expected, multi-target not supported");
auto no_batch_dim = self.dim() == 1 && target.dim() == 0;
if (self.dim() == 1 && target.dim() == 1) {
TORCH_CHECK_VALUE(
target.size(0) == 1,
"For 1D input, 1D target must have size 1, but got target size: ",
target.size(0));
}
TORCH_CHECK(
no_batch_dim || (self.size(0) == target.size(0)),
self.dim() == 1 || (self.size(0) == target.size(0)),
"size mismatch (got input: ",
self.sizes(),
", target: ",

View File

@ -1640,6 +1640,9 @@ Tensor zeros_symint(
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory) {
for (const auto& dim_size : size) {
TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative.");
}
Layout layout_ = layout.value_or(Layout::Strided);
if (at::sparse_csr::is_sparse_compressed(layout_)) {
return zeros_sparse_compressed_symint(
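
The loop above rejects negative sizes up front with an explicit message; a sketch:
```python
import torch

torch.zeros(2, 3)   # fine
torch.zeros(2, -1)  # raises: "zeros: Dimension size must be non-negative."
```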

View File

@ -16,6 +16,7 @@
#include <ATen/cuda/tunable/TunableGemm.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>
#include <ATen/native/GroupedMMUtils.h>
#include <ATen/native/cuda/RowwiseScaledMM.h>
#include <ATen/native/cuda/ScaledGroupMM.h>
#include <ATen/native/cuda/GroupMM.h>
@ -1079,6 +1080,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals
#endif
}
static bool _grouped_mm_allowed_device() {
#ifdef USE_ROCM
return false;
#else
auto dprops = at::cuda::getCurrentDeviceProperties();
// CUDA capability 8.0 and greater
return dprops->major >= 8;
#endif
}
#ifdef USE_ROCM
static bool _scaled_mm_is_fnuz() {
return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
@ -1540,71 +1551,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
}
namespace {
at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a,
const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
std::optional<c10::ScalarType> out_dtype
) {
c10::SmallVector<int64_t, 3> out_size;
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (a_is_2d) {
if (b_is_2d) {
out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)};
} else {
TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match");
out_size = {mat_a.size(0), mat_b.size(-1)};
}
} else {
if (b_is_2d) {
// this case is not actually encountered for MoE gemms
TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match");
out_size = {mat_a.size(1), mat_b.size(1)};
} else { // regular bmm
TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match");
out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)};
}
}
const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");
#ifndef USE_ROCM
// For TMA transfers, strides of output tensor have to be either
// 1, or aligned to 16 bytes.
const auto last_dim = out_size.size() - 1;
const auto alignment = 16 / c10::elementSize(out_dtype_);
const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment;
std::vector<int64_t> out_stride;
if (a_is_2d != b_is_2d) {
out_stride = {size_padded, 1};
} else {
out_stride = {out_size[1] * size_padded, size_padded, 1};
}
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
#else
return at::empty(out_size, mat_a.options().dtype(out_dtype_));
#endif
}
bool check_valid_strides_and_return_transposed(const Tensor& mat) {
IntArrayRef tensor_strides = mat.strides();
IntArrayRef tensor_sizes = mat.sizes();
int end_dim = mat.dim() - 1;
int alignment = 16 / mat.element_size();
TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n");
if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max<int64_t>(1, tensor_sizes[end_dim - 1]))) {
TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes");
return true;
} else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max<int64_t>(1, tensor_sizes[end_dim]))) {
TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes");
return false;
} else {
TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes");
}
}
void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
// Checks scales for 2d or 3d target tensors (`mat`).
if (mat.dim() == 2) {
TORCH_CHECK(
scale.dim() == 1,
@ -1638,9 +1586,66 @@ namespace {
"scale must have the same first dimension as mat for arg ",
arg_idx);
}
}
}
void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
// Checks scales for 2d or 3d target tensors (`mat`).
if (mat.dim() == 2) {
// For MXFP8, 2d tensors have variable size groups represented as subtensors,
// that are converted to blocked padded format individually,
// so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
TORCH_CHECK(
scale.dim() == mat.dim(),
"for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
// * weight is transposed prior to the call, scale stays non-transposed.
bool LHS = arg_idx == 0;
int scale_dim_to_check = 0;
int mat_dim_to_check = LHS ? 0 : 1;
TORCH_CHECK(
scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
"for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
"must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
} else {
// For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
// so we can check the exact expected scale sizes here without a d2h sync.
auto round_up = [](auto x, auto y) {
return ((x + y - 1) / y) * y;
};
// TODO: this is for 3d tensor in 2d-3d case specifically.
// We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
int64_t G = mat.size(0);
int64_t K = mat.size(1);
int64_t N = mat.size(2);
int64_t blocked_scale_K = round_up(K/32, 4);
int64_t blocked_scale_N = round_up(N, 128);
// fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
TORCH_CHECK(
scale.dim() == mat.dim() - 1,
"for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
);
TORCH_CHECK(
scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
"for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
);
}
}
void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
bool using_fp8_rowwise = scale.scalar_type() == kFloat;
bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
if (using_fp8_rowwise) {
_check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
} else if (using_mxfp8) {
_check_scales_mxfp8(mat, scale, dim, arg_idx);
} else {
TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
}
}
}
Tensor
@ -1665,8 +1670,8 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+");
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");
TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed");
@ -1699,16 +1704,47 @@ bool use_fast_accum) {
TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32");
}
// Both Per-Tensor and Row-wise scaling expect fp32 tensors
// FP8 per-tensor and per-row scaling expect fp32 scales.
// MXFP8 expects float8_e8m0fnu scales.
TORCH_CHECK(
scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat,
"Both scale_a and scale_b must be float (fp32) tensors.");
(scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) ||
(scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu),
"For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors.");
const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? offs->size(0) : 1;
check_scale(mat_a, scale_a, 0 ,0, scale_multiplier);
check_scale(mat_b, scale_b, 1, 1, scale_multiplier);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);
const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM)
// MXFP8 grouped GEMM dispatching
bool is_mx8mx8bf16 = (
mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn &&
scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu
);
TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm");
if (is_mx8mx8bf16) {
bool b_is_3d = mat_b.dim() == 3;
bool is_2d_2d = a_is_2d && b_is_2d;
bool is_2d_3d = a_is_2d && b_is_3d;
TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases");
TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets");
fbgemm_gpu::mx8mx8bf16_grouped_mm(
mat_a,
mat_b,
scale_a,
scale_b,
offs.value(),
out);
return out;
}
#endif
#ifndef USE_ROCM
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
@ -1741,6 +1777,7 @@ bool use_fast_accum) {
#else
TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM")
#endif
#endif
}
@ -1750,33 +1787,21 @@ const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype) {
#ifndef USE_ROCM
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true);
TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0, 10.0");
TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type());
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
_grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
bool a_b_and_out_are_bf16 = (
mat_a.dtype() == at::kBFloat16 &&
mat_b.dtype() == at::kBFloat16 &&
out_dtype.value_or(at::kBFloat16) == at::kBFloat16
);
bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16;
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
if (use_fast_path) {
// fast path, no d2h sync needed
at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
} else {
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
}
// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);
check_valid_strides_and_return_transposed(mat_b);
TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d");
if (offs.has_value()) {
TORCH_CHECK(offs->dim() == 1, "offs has to be 1D");
TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32");
}
TORCH_CHECK(!bias.has_value(), "Bias not supported yet");
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);
at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
return out;
#else
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
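
On CUDA, the bf16 fast path is kept for SM90/SM100 and everything else now routes through the shared fallback, so non-bf16 inputs work as well. A sketch of the 3d x 3d (plain batched) case on a CUDA device:
```python
import torch

A = torch.randn(4, 16, 32, device="cuda")     # float32: takes the fallback path
B = torch.randn(4, 32, 8, device="cuda")
out = torch._grouped_mm(A, B)                 # 3d x 3d, no offsets: batched matmul
torch.testing.assert_close(out, torch.bmm(A, B))
```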

View File

@ -1412,7 +1412,7 @@
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: cat_sparse
SparseCPU, SparseCUDA: cat_sparse
QuantizedCPU: cat_quantized_cpu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
tags: core
@ -7158,6 +7158,7 @@
- func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: _grouped_mm
CUDA: _grouped_mm_cuda
# NOTE [ Sparse: autograd and API ]

View File

@ -76,13 +76,14 @@ bool priority_order_init_ = false;
// TODO(eqy): more benchmarking to determine whether this should include sm86/89
// Needs to be kept in-sync with test_fused_chocie in test_transformers.py
bool check_prefer_cudnn_attention() {
static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") == true;
static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") != false;
if (!prefer_cudnn) {
return false;
}
#if (defined(CUDNN_VERSION) && (CUDNN_VERSION > 90000))
#if (defined(CUDNN_VERSION) && (CUDNN_VERSION >= 90900))
auto dprops = at::cuda::getCurrentDeviceProperties();
return dprops->major >= 9 && !dprops->minor;
auto major = dprops->major;
return (major == 9 || major == 10) && !dprops->minor;
#else
return false;
#endif
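
With this change the cuDNN backend is preferred by default on SM90/SM100 with cuDNN >= 9.9, unless `TORCH_CUDNN_SDPA_PREFERRED` is explicitly set to a false value. A sketch of pinning the backend explicitly from Python (shapes are placeholders):
```python
import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

q = k = v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):  # force the backend regardless of default ordering
    out = F.scaled_dot_product_attention(q, k, v)
```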

View File

@ -948,6 +948,7 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),
@ -2033,6 +2034,7 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),

View File

@ -512,6 +512,7 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/TCPStore.cpp",
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
"torch/csrc/distributed/c10d/Types.cpp",
"torch/csrc/distributed/c10d/Utils.cpp",
"torch/csrc/distributed/c10d/Work.cpp",
"torch/csrc/distributed/c10d/comm.cpp",
@ -635,6 +636,12 @@ libtorch_nativert_sources = [
"torch/nativert/graph/passes/pass_manager/GraphPasses.cpp",
"torch/nativert/graph/passes/pass_manager/PassManager.cpp",
"torch/nativert/kernels/KernelHandlerRegistry.cpp",
"torch/nativert/kernels/TritonKernel.cpp",
"torch/nativert/executor/triton/CpuTritonKernelManager.cpp",
]
libtorch_nativert_cuda_sources = [
"torch/nativert/executor/triton/CudaTritonKernelManager.cpp",
]
torch_mobile_tracer_sources = [
@ -770,7 +777,7 @@ libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + lib
libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [
"torch/csrc/cuda/nccl.cpp",
]
] + libtorch_nativert_cuda_sources
torch_cpp_srcs = [
"torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA
@ -1087,6 +1094,7 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/DeviceAccelerator.cpp",
"aten/src/ATen/Context.cpp",
"aten/src/ATen/DLConvertor.cpp",
"aten/src/ATen/DTensorState.cpp",
"aten/src/ATen/EmptyTensor.cpp",
"aten/src/ATen/ExpandUtils.cpp",
"aten/src/ATen/CachedTensorUtils.cpp",

View File

@ -33,7 +33,8 @@ bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
}
// Return a SymBool with underlying symbolic expression that represents
// contiguity. Guaranteed not to add guards.
// contiguity. Guaranteed not to throw a DDE; may return a symbolic expression
// or symbolic True.
inline static c10::SymBool _compute_contiguous_sym(
ArrayRef<c10::SymInt> sizes,
ArrayRef<c10::SymInt> strides,
@ -76,6 +77,8 @@ inline static c10::SymBool _compute_contiguous_sym(
return true;
};
// We try to minimize creating large symbolic expressions when not needed to
// avoid symbolic evaluation perf issues.
if (is_contiguous_or_false()) {
return c10::SymBool(true);
}
@ -94,6 +97,9 @@ inline static c10::SymBool _compute_contiguous_sym(
return is_contiguous_cond.sym_or(is_empty);
}
// When T is SymInt this function may throw a data dependent error.
// _compute_channels_last_contiguous_2d_sym does not. Only use this function
// when inputs are hinted.
template <typename T>
bool _compute_channels_last_contiguous_2d(
ArrayRef<T> sizes,
@ -105,8 +111,8 @@ bool _compute_channels_last_contiguous_2d(
T expected = 1;
for (auto& d : {1, 3, 2, 0}) {
const auto& size_d = sizes[d];
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) {
if (size_d != 1) {
if (strides[d] != expected) {
return false;
}
expected *= size_d;
@ -123,6 +129,65 @@ bool _compute_channels_last_contiguous_2d(
}
}
// Return a SymBool with underlying symbolic expression that represents
// contiguity. Guaranteed not to throw a DDE; may return a symbolic expression
// or symbolic True.
inline static c10::SymBool _compute_channels_last_contiguous_2d_sym(
ArrayRef<c10::SymInt> sizes,
ArrayRef<c10::SymInt> strides) {
switch (sizes.size()) {
case 4: {
// When this function returns True, the result is always true. When it returns
// False, the result could be False or data dependent.
auto guard_or_false = [&]() {
c10::SymInt expected = 1;
for (auto& d : {1, 3, 2, 0}) {
const auto& size_d = sizes[d];
// Not taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
continue;
}
// Taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) {
return false;
}
expected *= size_d;
}
return true;
};
// We try to minimize creating large symbolic expressions when not needed
// to avoid symbolic evaluation perf issues.
if (guard_or_false()) {
return c10::SymBool(true);
}
// Result is either false, or data dependent.
c10::SymInt expected_stride = 1;
c10::SymBool cond = true;
for (auto& d : {1, 3, 2, 0}) {
const auto& size_d = sizes[d];
cond = cond.sym_and(
size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
expected_stride *= size_d;
}
return cond;
}
// NOLINTNEXTLINE(bugprone-branch-clone)
case 3:
// TODO dim == 3 case will be enabled once it is fully tested
return c10::SymBool(false);
default:
return c10::SymBool(false);
}
}
// When T is SymInt this function may throw a data dependent error.
// _compute_channels_last_contiguous_3d_sym does not. Only use this function
// when inputs are hinted.
template <typename T>
bool _compute_channels_last_contiguous_3d(
ArrayRef<T> sizes,
@ -134,8 +199,8 @@ bool _compute_channels_last_contiguous_3d(
T expected = 1;
for (auto& d : {1, 4, 3, 2, 0}) {
const auto& size_d = sizes[d];
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) {
if (size_d != 1) {
if (strides[d] != expected) {
return false;
}
expected *= size_d;
@ -152,6 +217,59 @@ bool _compute_channels_last_contiguous_3d(
}
}
inline static c10::SymBool _compute_channels_last_contiguous_3d_sym(
ArrayRef<c10::SymInt> sizes,
ArrayRef<c10::SymInt> strides) {
switch (sizes.size()) {
case 5: {
// When this function returns True, the result is always true. When it returns
// False, the result could be False or data dependent.
auto guard_or_false = [&]() {
c10::SymInt expected = 1;
for (auto& d : {1, 4, 3, 2, 0}) {
const auto& size_d = sizes[d];
// Not taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
continue;
}
// Taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) {
return false;
}
expected *= size_d;
}
return true;
};
// We try to minimize creating large symbolic expressions when not needed
// to avoid symbolic evaluation perf issues.
if (guard_or_false()) {
return c10::SymBool(true);
}
// Result is either false, or data dependent.
c10::SymInt expected_stride = 1;
c10::SymBool cond = true;
for (auto& d : {1, 4, 3, 2, 0}) {
const auto& size_d = sizes[d];
cond = cond.sym_and(
size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
expected_stride *= size_d;
}
return cond;
}
// NOLINTNEXTLINE(bugprone-branch-clone)
case 4:
// TODO dim == 4 case will be enabled once it is fully tested
return c10::SymBool(false);
default:
return c10::SymBool(false);
}
}
template <typename T>
bool _compute_non_overlapping_and_dense(
ArrayRef<T> sizes,

View File

@ -71,6 +71,27 @@ normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) {
return std::tuple<SymNode, std::vector<SymNode>, std::vector<SymNode>>(
std::move(base), std::move(size_nodes), std::move(stride_nodes));
}
namespace {
bool all_hinted(
const c10::SymIntArrayRef& sizes,
const c10::SymIntArrayRef& strides) {
auto all_hinted = true;
for (const auto& s : sizes) {
if (!s.has_hint()) {
return false;
}
}
if (all_hinted) {
for (const auto& s : strides) {
if (!s.has_hint()) {
return false;
}
}
}
return all_hinted;
}
} // namespace
// Special treatment because of numel
SymBool SymbolicShapeMeta::compute_contiguous() const {
@ -88,24 +109,7 @@ SymBool SymbolicShapeMeta::compute_contiguous() const {
return maybe_as_bool.value();
}
auto all_hinted = true;
for (const auto& s : sizes) {
if (!s.has_hint()) {
all_hinted = false;
break;
}
}
if (all_hinted) {
for (const auto& s : strides) {
if (!s.has_hint()) {
all_hinted = false;
break;
}
}
}
if (all_hinted) {
if (all_hinted(sizes, strides)) {
// We avoid going through the slow path if everything is hinted,
// because evaluating a large SymPy expression can be expensive.
// TODO exclude backed_size_oblivious from this path.
@ -115,6 +119,56 @@ SymBool SymbolicShapeMeta::compute_contiguous() const {
return result;
}
SymBool SymbolicShapeMeta::compute_channels_last_contiguous_2d() const {
if (!strides_valid_) {
return false;
}
c10::SymIntArrayRef sizes(sizes_);
c10::SymIntArrayRef strides(strides_);
auto result = _compute_channels_last_contiguous_2d_sym(sizes, strides);
// If the result is already determined without guarding, just return it.
auto maybe_as_bool = result.maybe_as_bool();
if (maybe_as_bool.has_value()) {
return maybe_as_bool.value();
}
if (all_hinted(sizes, strides)) {
// We avoid going through the slow path if everything is hinted,
// because evaluating a large SymPy expression can be expensive.
// TODO exclude backed_size_oblivious from this path.
return _compute_channels_last_contiguous_2d<SymInt>(sizes_, strides_);
}
return result;
}
SymBool SymbolicShapeMeta::compute_channels_last_contiguous_3d() const {
if (!strides_valid_) {
return false;
}
c10::SymIntArrayRef sizes(sizes_);
c10::SymIntArrayRef strides(strides_);
auto result = _compute_channels_last_contiguous_3d_sym(sizes, strides);
// If the result is already determined without guarding, just return it.
auto maybe_as_bool = result.maybe_as_bool();
if (maybe_as_bool.has_value()) {
return maybe_as_bool.value();
}
if (all_hinted(sizes, strides)) {
// We avoid going through the slow path if everything is hinted,
// because evaluating a large SymPy expression can be expensive.
// TODO exclude backed_size_oblivious from this path.
return _compute_channels_last_contiguous_3d<SymInt>(sizes_, strides_);
}
return result;
}
// The rest of them
#define DEFINE_EAGER_SYMBOOL_COMPUTE(name, fallback) \
SymBool SymbolicShapeMeta::name() const { \
@ -143,8 +197,6 @@ SymBool SymbolicShapeMeta::compute_contiguous() const {
}
// clang-format off
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d)
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d)
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d)
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d)
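
These helpers feed the property exposed through `is_contiguous(memory_format=...)`; a small hinted (non-symbolic) sketch of what they compute:
```python
import torch

x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last)
print(x.is_contiguous(memory_format=torch.channels_last))  # True
print(x.is_contiguous())                                   # False: default (NCHW) contiguity
```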

View File

@ -25,6 +25,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
#endif
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false),
m_graph_capture_record_stream_reuse(false),
m_pinned_use_background_threads(false) {
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}
@ -373,6 +374,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
} else if (config_item_view == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(config, i);
used_native_specific_option = true;
} else if (config_item_view == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(config, i);
used_native_specific_option = true;
} else {
TORCH_CHECK(
false, "Unrecognized CachingAllocator option: ", config_item_view);
@ -406,6 +410,23 @@ size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
return i;
}
size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for graph_capture_record_stream_reuse");
m_graph_capture_record_stream_reuse = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting graph_capture_record_stream_reuse value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i) {

View File

@ -53,6 +53,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_release_lock_on_cudamalloc;
}
static bool graph_capture_record_stream_reuse() {
return instance().m_graph_capture_record_stream_reuse;
}
/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
@ -142,6 +146,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i);
size_t parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
size_t i);
std::atomic<size_t> m_max_split_size;
std::atomic<size_t> m_max_non_split_rounding_size;
@ -153,6 +160,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_graph_capture_record_stream_reuse;
std::atomic<bool> m_pinned_use_background_threads;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
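
The new option is opt-in through the allocator config string; a minimal sketch (the setting must be in place before the CUDA caching allocator initializes, and the parser shown earlier expects the literal `True`/`False`):
```python
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "graph_capture_record_stream_reuse:True"

import torch   # the allocator reads the config on first CUDA use
torch.cuda.init()
```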

View File

@ -1167,8 +1167,13 @@ class DeviceCachingAllocator {
// tracks which pools we can use as a last resort before ooming
ska::flat_hash_set<MempoolId_t, MempoolIdHash> use_on_oom_pools;
// See free() for this thing's purpose
std::vector<Block*> needs_events_deferred_until_no_capture;
// Map of blocks whose freeing is deferred until after CUDA graph capture.
// - Key: Block* to be freed.
// - Value: List of "empty nodes" inserted as free markers during capture.
// If the vector is empty, the block must always be deferred until capture
// ends.
ska::flat_hash_map<Block*, std::vector<cudaGraphNode_t>> deferred_blocks;
// outstanding cuda events
ska::flat_hash_map<
cuda::CUDAStream,
@ -1329,6 +1334,11 @@ class DeviceCachingAllocator {
// capture. Cross-stream memory use is uncommon, so the deferral's
// effect on memory use during capture should be small.
process_events(context);
} else {
if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
// We check if there is some block that is safe to reuse on this stream
free_safe_blocks_in_capture(context, stream);
}
}
size_t size = round_size(orig_size);
auto& pool = get_pool(size, stream);
@ -1619,6 +1629,248 @@ class DeviceCachingAllocator {
return block;
}
// Insert "free marker" (empty nodes) into the CUDA graph for all streams that
// have used the block, including the allocation stream. These nodes mark the
// last use of the block in the capture graph. Returns a vector of the
// inserted nodes, or an empty vector if any stream is not capturing.
std::vector<cudaGraphNode_t> insert_free_marker(Block* block) {
std::vector<cudaGraphNode_t> empty_nodes;
auto try_add_empty_node = [&](cudaStream_t stream) -> bool {
cudaStreamCaptureStatus status{};
cudaGraph_t graph{};
const cudaGraphNode_t* deps = nullptr;
size_t num_deps = 0;
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
stream, &status, nullptr, &graph, &deps, nullptr, &num_deps));
#else
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
stream, &status, nullptr, &graph, &deps, &num_deps));
#endif
TORCH_INTERNAL_ASSERT(
status != cudaStreamCaptureStatusInvalidated,
"Invalid stream capture status");
if (status == cudaStreamCaptureStatusNone) {
return false;
}
cudaGraphNode_t node{};
C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps));
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies));
#else
C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
stream, &node, 1, cudaStreamSetCaptureDependencies));
#endif
empty_nodes.push_back(node);
return true;
};
// If any stream is not currently capturing, return an empty node vector.
// An empty vector indicates that the block should be deferred for freeing
// until after capture.
// Attempt to add an empty node for the allocation stream.
if (!try_add_empty_node(block->stream)) {
return {};
}
// Attempt to add empty nodes for all streams that have used the block.
for (const auto& s : block->stream_uses) {
if (!try_add_empty_node(s.stream())) {
return {};
}
}
return empty_nodes;
}
// Returns the current set of "terminal" nodes in the CUDA graph for a given
// stream. These represent the current endpoints of the stream, and may
// include additional nodes if the graph branches. Any new work captured will
// be attached after one or more of these terminals.
std::vector<cudaGraphNode_t> get_terminals(cudaStream_t stream) {
std::vector<cudaGraphNode_t> result;
cudaStreamCaptureStatus status{};
cudaGraph_t graph{};
const cudaGraphNode_t* dependencies = nullptr;
size_t num_dependencies = 0;
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
stream,
&status,
nullptr,
&graph,
&dependencies,
nullptr,
&num_dependencies));
#else
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
stream, &status, nullptr, &graph, &dependencies, &num_dependencies));
#endif
TORCH_INTERNAL_ASSERT(
status == cudaStreamCaptureStatusActive,
"Invalid stream capture status");
for (size_t i = 0; i < num_dependencies; i++) {
auto node = dependencies[i];
if (node != nullptr) {
result.push_back(node);
}
}
return result;
}
// Returns the set of "reusable" free markers (empty nodes) in the current
// CUDA graph capture. A free marker is considered reusable if it is a
// predecessor of every terminal node.
// This ensures that all future captured work will occur after the free
// marker, making it safe to reuse.
ska::flat_hash_set<cudaGraphNode_t> get_reusable_empty_nodes(
cudaStream_t stream) {
auto terminals = get_terminals(stream);
if (terminals.empty()) {
// No terminal nodes found; nothing to free.
return {};
}
auto get_dependencies = [](cudaGraphNode_t node,
cudaGraphNode_t* pDependencies,
size_t* pNumDependencies) -> void {
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
C10_CUDA_CHECK(cudaGraphNodeGetDependencies(
node, pDependencies, nullptr, pNumDependencies));
#else
C10_CUDA_CHECK(
cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies));
#endif
};
// Helper to retrieve all parent nodes (dependencies) of a given node.
auto get_parents =
[&](cudaGraphNode_t node) -> std::vector<cudaGraphNode_t> {
size_t count = 0;
get_dependencies(node, nullptr, &count);
std::vector<cudaGraphNode_t> out(count);
if (count) {
get_dependencies(node, out.data(), &count);
out.resize(count);
}
return out;
};
// Helper to determine if a node is an empty node (used as a free marker).
auto is_empty_node = [](cudaGraphNode_t n) -> bool {
cudaGraphNodeType type{};
C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type));
return type == cudaGraphNodeTypeEmpty;
};
// For each terminal node, perform a reverse DFS to count, for each empty
// node, how many terminals it can reach (i.e., for how many terminals it is
// a predecessor). An empty node is reusable if it is a predecessor of all
// terminal nodes.
ska::flat_hash_map<cudaGraphNode_t, size_t> num_terminals_reachable;
for (auto terminal : terminals) {
ska::flat_hash_set<cudaGraphNode_t> visited;
ska::flat_hash_set<cudaGraphNode_t> empty_nodes;
std::function<void(cudaGraphNode_t)> reverse_dfs =
[&](cudaGraphNode_t node) {
if (!visited.insert(node).second)
return;
if (is_empty_node(node)) {
num_terminals_reachable[node]++;
empty_nodes.insert(node);
}
auto parents = get_parents(node);
for (auto p : parents) {
reverse_dfs(p);
}
};
reverse_dfs(terminal);
}
ska::flat_hash_set<cudaGraphNode_t> reusable_empty_nodes;
for (auto [node, count] : num_terminals_reachable) {
if (count == terminals.size()) {
reusable_empty_nodes.insert(node);
}
}
return reusable_empty_nodes;
}
// A block is considered reusable during CUDA graph capture if every free
// marker (empty node) associated with the block is a predecessor of every
// terminal node.
//
// This ensures that any new operation added to the graph will be attached
// after all terminal nodes, which themselves are after all free markers. As a
// result, all future work is guaranteed to occur after the block's last use
// on every stream, so the block's previous lifetime ends before any new
// lifetime begins. This check relies solely on the DAG topology and does not
// require event queries, making it safe to use during capture.
//
// This function iterates over all deferred blocks, determines if their empty
// nodes are reusable according to the above criteria, and frees the block if
// so.
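// Illustrative sketch of the criterion on an assumed toy DAG (not taken
// from a real capture): suppose capture has produced
//
//   A -> E -> T1
//         \-> T2
//
// where E is the free marker for some block and {T1, T2} are the current
// terminal nodes. E is a predecessor of both terminals, so every node
// captured from here on is ordered after E and the block can be reused.
// If T2 did not depend on E, work appended after T2 could overlap the
// block's previous lifetime, so the block would remain deferred.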
void free_safe_blocks_in_capture(
const std::shared_ptr<GatheredContext>& context,
cudaStream_t stream) {
auto reusable_empty_nodes = get_reusable_empty_nodes(stream);
// If there are no reusable empty nodes (e.g., not currently capturing),
// there is nothing to do.
if (reusable_empty_nodes.empty()) {
return;
}
std::vector<Block*> blocks_to_erase;
for (auto& [block, inserted_empty_nodes] : deferred_blocks) {
// Skip this block if it has no empty nodes, as we defer its freeing until
// after graph capture. Also skip if the block was not allocated on the
// current stream; such blocks will be freed when
// free_safe_blocks_in_capture is attempted on that stream.
if (inserted_empty_nodes.empty() || block->stream != stream) {
continue;
}
bool is_reusable = true;
for (const auto& node : inserted_empty_nodes) {
if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) {
is_reusable = false;
break;
}
}
if (is_reusable) {
// Clear stream uses since the graph ensures proper synchronization.
// No need to insert events.
block->stream_uses.clear();
free_block(block, context);
blocks_to_erase.push_back(block);
}
}
// Remove blocks that were freed from the deferred_blocks map.
for (auto* block : blocks_to_erase) {
deferred_blocks.erase(block);
}
}
void free(Block* block) {
std::shared_ptr<GatheredContext> context =
maybeGatherContext(RecordContext::ALL);
@ -1654,14 +1906,22 @@ class DeviceCachingAllocator {
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1);
// If the block has been used on more than one stream, handle accordingly.
if (!block->stream_uses.empty()) {
if (C10_UNLIKELY(!captures_underway.empty())) {
// It's forbidden to cudaEventQuery an event recorded during CUDA graph
// capture. We conservatively defer recording end-of-life events until
// the next call to process_events() (which won't happen until no
// captures are underway)
needs_events_deferred_until_no_capture.push_back(block);
if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
// insert_free_marker returns a vector of free markers,
// or an empty vector if any associated stream is not currently
// capturing. The empty vector means that we will defer the free until
// capture is finished.
deferred_blocks.emplace(block, insert_free_marker(block));
} else {
// If graph_capture_record_stream_reuse is not enabled, always defer
// the free until capture is finished.
deferred_blocks.emplace(block, std::vector<cudaGraphNode_t>{});
}
} else {
// If not in a capture, insert events for the block.
insert_events(block);
}
} else {
@ -3287,8 +3547,8 @@ class DeviceCachingAllocator {
void insert_events_deferred_until_no_capture(
const std::shared_ptr<GatheredContext>& context) {
if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) {
for (auto* block : needs_events_deferred_until_no_capture) {
if (C10_UNLIKELY(!deferred_blocks.empty())) {
for (auto& [block, inserted_empty_nodes] : deferred_blocks) {
TORCH_INTERNAL_ASSERT(!block->stream_uses.empty());
// only streams recorded before cudagraph will be used to insert events
// since we know all streams recorded during cudagraph must have
@ -3300,7 +3560,7 @@ class DeviceCachingAllocator {
free_block(block, context);
}
}
needs_events_deferred_until_no_capture.clear();
deferred_blocks.clear();
}
}
@ -3731,6 +3991,8 @@ class NativeCachingAllocator : public CUDAAllocator {
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.graph_capture_record_stream_reuse =
CUDAAllocatorConfig::graph_capture_record_stream_reuse();
md.roundup_power2_divisions =
CUDAAllocatorConfig::roundup_power2_divisions();

View File

@ -163,6 +163,7 @@ struct AllocatorConfigInfo {
bool expandable_segments;
bool release_lock_on_malloc;
bool pinned_use_host_register;
bool graph_capture_record_stream_reuse;
std::string last_allocator_settings;
std::vector<size_t> roundup_power2_divisions;
};

View File

@ -46,7 +46,7 @@ def define_targets(rules):
"util/typeid_test.cpp",
],
),
copts = ["-Wno-deprecated-declarations"],
copts = ["-Wno-deprecated-declarations", "-Wno-ctad-maybe-unsupported"],
deps = [
":Macros",
":complex_math_test_common",

View File

@ -1638,6 +1638,10 @@ if(USE_CUDA)
# order of the libraries in the linker call matters here when statically
# linking; libculibos and cublas must be last.
target_link_libraries(torch_cuda PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
if(USE_FBGEMM_GENAI)
# Link fbgemm_genai to torch_cuda (only for (1) CUDA build for SM100).
target_link_libraries(torch_cuda PRIVATE fbgemm_genai)
endif()
endif()
# ---[ XPU library.
@ -1759,9 +1763,10 @@ if(USE_ROCM)
target_link_libraries(torch_hip PRIVATE ${Caffe2_HIP_DEPENDENCY_LIBS})
if(USE_FBGEMM_GENAI)
target_link_libraries(torch_hip PRIVATE fbgemm_genai)
if(USE_ROCM)
target_link_libraries(torch_hip PRIVATE fbgemm_genai)
endif()
endif()
# Since PyTorch files contain HIP headers, this is also needed to capture the includes.
# ROCM_INCLUDE_DIRS is defined in LoadHIP.cmake
target_include_directories(torch_hip PRIVATE ${Caffe2_HIP_INCLUDE} ${ROCM_INCLUDE_DIRS})

View File

@ -1666,9 +1666,9 @@ if(USE_KINETO)
set(CMAKE_REQUIRED_LINK_OPTIONS "")
if(NOT EXCEPTIONS_WORK)
message(FATAL_ERROR
"Detected that statically linking against CUPTI causes exceptions to stop working. "
"See https://github.com/pytorch/pytorch/issues/57744 for more details. "
"Perhaps try: USE_CUPTI_SO=1 CMAKE_FRESH=1 python setup.py develop")
"Detected that statically linking against CUPTI causes exceptions to stop working. "
"See https://github.com/pytorch/pytorch/issues/57744 for more details. "
"Perhaps try: USE_CUPTI_SO=1 CMAKE_FRESH=1 python -m pip install -e . -v --no-build-isolation")
endif()
endif()

View File

@ -46,8 +46,8 @@ IF(NOT MKLDNN_FOUND)
endif()
endif()
ExternalProject_Add(xpu_mkldnn_proj
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN
GIT_TAG v3.8.1
GIT_REPOSITORY https://github.com/uxlfoundation/oneDNN
GIT_TAG v3.9.1
PREFIX ${XPU_MKLDNN_DIR_PREFIX}
BUILD_IN_SOURCE 0
CMAKE_ARGS -DCMAKE_C_COMPILER=icx

View File

@ -282,9 +282,15 @@ endif()
# cufft
add_library(caffe2::cufft INTERFACE IMPORTED)
if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::cufft_static_nocallback)
if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::cufft_static_nocallback)
else()
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::cufft_static)
endif()
else()
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES

View File

@ -0,0 +1,242 @@
---
file_format: mystnb
kernelspec:
name: python3
mystnb:
execution_timeout: 30
execution_show_tb: True
merge_streams: True
---
```{code-cell}
:tags: [remove-cell]
import torch
import header_code
torch._logging.set_logs(graph_breaks=True)
```
# Toggling `error_on_graph_break`
**Summary:**
- When `fullgraph=False`, we can use `torch._dynamo.error_on_graph_break()` for more flexibility in
dealing with graph breaks.
So far, we have introduced two ways of dealing with graph breaks in `torch.compile`:
1. `fullgraph=True` errors on the first graph break and additionally guarantees that only one graph is traced from the code.
2. `fullgraph=False` continues tracing even when encountering graph breaks.
What if we want to disallow graph breaks for most of the code, but there are a few problematic functions where the graph breaks are hard to remove,
and we are okay with having those graph breaks? We can use `torch._dynamo.error_on_graph_break()` to achieve this.
`torch.compile` has an `error_on_graph_break` setting (initially set to `False`).
If a graph break or compiler error occurs in code while `error_on_graph_break` is set to `False`, then `torch.compile` will attempt to continue compilation after the graph break/error.
If `error_on_graph_break` is set to `True`, then `torch.compile` will abort compilation and propagate the error to user code.
A significant difference between `error_on_graph_break=True` and `fullgraph=True` is that the former **does not guarantee that a single graph will be captured**.
`error_on_graph_break` **can be arbitrarily toggled during compile time** by using the `torch._dynamo.error_on_graph_break()` context manager/decorator.
In comparison, once `fullgraph` is set to `True`, it cannot be set back to `False`.
Finally, `error_on_graph_break` has lower precedence than `fullgraph`: it only takes effect when `fullgraph=False`.
## `error_on_graph_break(False)` example
```{code-cell}
@torch._dynamo.error_on_graph_break(False)
def code_with_a_difficult_graph_break(x):
x = x + 1
torch._dynamo.graph_break()
return x + 2
def inner(x):
return code_with_a_difficult_graph_break(x)
# NOTE: fullgraph=False
@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
return inner(x)
# No error, but there is a graph break
fn(torch.randn(3))
```
Using `error_on_graph_break(False)` under `error_on_graph_break(True)` is helpful when we want to minimize graph breaks (i.e. follow the `fullgraph=True` programming model)
but some sections of code contain non-performance-critical graph breaks that are difficult to work around.
`error_on_graph_break()` can be used as a context manager as well:
```{code-cell}
# NOTE: fullgraph=False
@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
x = x + 1
with torch._dynamo.error_on_graph_break(False):
torch._dynamo.graph_break() # no error
return x + 2
# No error, but there is a graph break
fn(torch.randn(3))
```
You can use monkey patching to toggle `error_on_graph_break` for code where you cannot edit the source (e.g. framework code):
```{code-cell}
class ThirdPartyModule(torch.nn.Module):
def forward(self, x):
x = x + 1
torch._dynamo.graph_break()
return x + 2
tp_mod = ThirdPartyModule()
tp_mod.forward = torch._dynamo.error_on_graph_break(False)(tp_mod.forward)
@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
return tp_mod.forward(x)
# No error, but there is a graph break
fn(torch.randn(3))
```
## `error_on_graph_break(True)` example
```{code-cell}
@torch._dynamo.error_on_graph_break(True)
def inner2(x):
x = x + 1
torch._dynamo.graph_break() # error
return x + 2
def inner(x):
return inner2(x)
# fullgraph=False, error_on_graph_break=False
@torch.compile
def fn(x):
x = x + 4
torch._dynamo.graph_break() # no error
return inner(x)
try:
fn(torch.randn(3))
except Exception as e:
print(e)
```
Using `error_on_graph_break(True)` under `error_on_graph_break(False)` is helpful when we want to use `torch.compile` flexibly (i.e. follow the `fullgraph=False` programming model)
but some sections of the code are performance-critical and we want to ensure that those sections do not contain graph breaks.
## `error_on_graph_break` nesting behavior
`torch._dynamo.error_on_graph_break()` affects the `error_on_graph_break` setting of nested calls as well:
```{code-cell}
def inner(x):
x = x + 1
torch._dynamo.graph_break()
return x + 2
def inner2(x):
with torch._dynamo.error_on_graph_break(False):
return inner(x)
@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
return inner2(x)
# no error
fn(torch.randn(3))
```
`torch._dynamo.error_on_graph_break()` can be used under another `torch._dynamo.error_on_graph_break()` region:
```{code-cell}
def inner(x):
x = x + 1
with torch._dynamo.error_on_graph_break(False):
torch._dynamo.graph_break()
return x + 2
def inner2(x):
with torch._dynamo.error_on_graph_break(True):
return inner(x)
@torch.compile
def fn(x):
return inner2(x)
# no error
fn(torch.randn(3))
```
## Interaction with `fullgraph`
`fullgraph=True` takes higher precedence than `error_on_graph_break`:
```{code-cell}
@torch._dynamo.error_on_graph_break(False)
def inner(x):
x = x + 1
torch._dynamo.graph_break()
return x + 2
@torch.compile(fullgraph=True)
def fn(x):
return inner(x)
try:
fn(torch.randn(3))
except Exception as e:
print(e)
```
`fullgraph=True` cannot be toggled back to `fullgraph=False`:
```{code-cell}
@torch.compile(fullgraph=False)
def inner(x):
x = x + 1
torch._dynamo.graph_break()
return x + 2
@torch.compile(fullgraph=True)
def fn(x):
return inner(x)
try:
fn(torch.randn(3))
except Exception as e:
print(e)
```
```{code-cell}
@torch.compile(fullgraph=True)
def inner(x):
x = x + 1
torch._dynamo.graph_break()
return x + 2
@torch.compile(fullgraph=False)
def fn(x):
return inner(x)
try:
fn(torch.randn(3))
except Exception as e:
print(e)
```
## Summary of `fullgraph=True/False` vs `error_on_graph_break`
Here is a table summarizing the differences between `fullgraph=True/False` and `error_on_graph_break`:
| | `error_on_graph_break=True` | `error_on_graph_break=False` (default) |
| --- | --- | --- |
| `fullgraph=True` | Graph breaks result in errors. Only the first graph break will be reported. **One graph guarantee.**<br><br>`fullgraph` cannot be toggled to `False`. `error_on_graph_break` has no effect.<br><br>User code must be fully compatible with `torch.compile`. Guarantees no performance hits from graph breaks (because there are no graph breaks).<br><br>Ideal for code sensitive to graph breaks: framework/library code or cases where getting maximum performance is required. Prevents downstream user code from inadvertently allowing graph breaks. | Same as `fullgraph=True` and `error_on_graph_break=True` as `error_on_graph_break` has no effect when `fullgraph=True`. |
| `fullgraph=False` (default) | Graph breaks result in errors. Only the first graph break will be reported. **No one graph guarantee.**<br><br>`error_on_graph_break` can be toggled to `False`.<br><br>User code must be fully compatible with `torch.compile`. Guarantees no performance hits from graph breaks (because there are no graph breaks).<br><br>Ideal for user code sensitive to graph breaks. `error_on_graph_break` can be toggled to `False` to deal with sections with graph breaks that are difficult to work around. | Will continue to compile after encountering graph breaks. All graph breaks will be reported.<br><br>`error_on_graph_break` can be toggled to `True`.<br><br>Doesn't require many user code changes to work. Performance may be negatively impacted due to graph breaks.<br><br>Ideal for out-of-the-box use cases, on “non-weird” code, or where squeezing maximal performance is not necessary. |

View File

@ -19,6 +19,7 @@ The strategy for using `torch.compile(fullgraph=False)` is as follows:
```{toctree}
programming_model.where_to_apply_compile
programming_model.compiler_disable
programming_model.error_on_graph_break
programming_model.nested_graph_breaks
programming_model.skipped_functions
```

View File

@ -645,6 +645,7 @@ export/programming_model
export/ir_spec
export/pt2_archive
export/draft_export
export/joint_with_descriptors
cond
generated/exportdb/index
torch.compiler_aot_inductor

View File

@ -0,0 +1,111 @@
# Joint with descriptors
Joint with descriptors is an experimental API for exporting a traced joint
graph that supports all of torch.compile's features in full generality and,
after processing, can be converted back into a differentiable callable that
can be executed as normal. For example, it is used to implement autoparallel,
a system that takes a model and reshards inputs and parameters to make it
a distributed SPMD program.
```{eval-rst}
.. currentmodule:: torch._functorch.aot_autograd
.. autofunction:: aot_export_joint_with_descriptors
.. autofunction:: aot_compile_joint_with_descriptors
```
## Descriptors
```{eval-rst}
.. currentmodule:: torch._functorch._aot_autograd.descriptors
.. autoclass:: AOTInput
:members:
.. autoclass:: AOTOutput
:members:
.. autoclass:: BackwardTokenAOTInput
:members:
.. autoclass:: BackwardTokenAOTOutput
:members:
.. autoclass:: BufferAOTInput
:members:
.. autoclass:: DummyAOTInput
:members:
.. autoclass:: DummyAOTOutput
:members:
.. autoclass:: GradAOTOutput
:members:
.. autoclass:: InputMutationAOTOutput
:members:
.. autoclass:: IntermediateBaseAOTOutput
:members:
.. autoclass:: ParamAOTInput
:members:
.. autoclass:: PhiloxBackwardBaseOffsetAOTInput
:members:
.. autoclass:: PhiloxBackwardSeedAOTInput
:members:
.. autoclass:: PhiloxForwardBaseOffsetAOTInput
:members:
.. autoclass:: PhiloxForwardSeedAOTInput
:members:
.. autoclass:: PhiloxUpdatedBackwardOffsetAOTOutput
:members:
.. autoclass:: PhiloxUpdatedForwardOffsetAOTOutput
:members:
.. autoclass:: PlainAOTInput
:members:
.. autoclass:: PlainAOTOutput
:members:
.. autoclass:: SavedForBackwardsAOTOutput
:members:
.. autoclass:: SubclassGetAttrAOTInput
:members:
.. autoclass:: SubclassGetAttrAOTOutput
:members:
.. autoclass:: SubclassSizeAOTInput
:members:
.. autoclass:: SubclassSizeAOTOutput
:members:
.. autoclass:: SubclassStrideAOTInput
:members:
.. autoclass:: SubclassStrideAOTOutput
:members:
.. autoclass:: SyntheticBaseAOTInput
:members:
.. autoclass:: ViewBaseAOTInput
:members:
```
## FX utilities
```{eval-rst}
.. automodule:: torch._functorch._aot_autograd.fx_utils
:members:
```

View File

@ -608,6 +608,14 @@ Available options:
for processing events. This avoids any slow path associated with querying/processing of
events in the fast allocation path. This feature is disabled by default.
* ``graph_capture_record_stream_reuse`` (experimental, default: `False`)
If set to `True`, the CUDA caching allocator will attempt to reclaim device memory during
CUDA Graph capture by using the graph topology (instead of CUDA events) to determine
when a freed block is safe to reuse. This can reduce peak memory during long captures that free
and reallocate buffers across multiple streams, especially when the capture DAG frequently
reaches joined frontiers. Note: Enabling this option can significantly increase the time spent
capturing the graph.
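For example, a minimal sketch of enabling the option through ``PYTORCH_CUDA_ALLOC_CONF`` (this assumes a CUDA device is available; the captured arithmetic is illustrative only):
```python
import os

# Allocator options are read from PYTORCH_CUDA_ALLOC_CONF; set it before
# torch initializes CUDA. The value must be the literal "True" or "False".
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "graph_capture_record_stream_reuse:True"

import torch

assert torch.cuda.is_available()

g = torch.cuda.CUDAGraph()
static_in = torch.randn(1024, device="cuda")

# Capture a small graph that frees and reallocates a temporary. With the
# option enabled, the allocator may hand the freed block back out during
# capture based on the capture DAG instead of CUDA events.
with torch.cuda.graph(g):
    tmp = static_in * 2
    del tmp  # freed while capturing
    out = static_in + 3

g.replay()
```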
.. note::
Some stats reported by the

View File

@ -16,6 +16,39 @@ requires = [
]
build-backend = "setuptools.build_meta"
[dependency-groups]
dev = [
# This list should be kept in sync with the requirements-build.txt
# in PyTorch root until the project fully migrates to pyproject.toml
# after which this can be removed as it is already specified in the
# [build-system] section
"setuptools>=70.1.0,<80.0", # setuptools develop deprecated on 80.0
"cmake>=3.27",
"ninja",
"numpy",
"packaging",
"pyyaml",
"requests",
"six", # dependency chain: NNPACK -> PeachPy -> six
"typing-extensions>=4.10.0",
# This list should be kept in sync with the requirements.txt in
# PyTorch root until the project fully migrates to pyproject.toml
"build[uv]",
"expecttest>=0.3.0",
"filelock",
"fsspec>=0.8.5",
"hypothesis",
"jinja2",
"lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'",
"networkx>=2.5.1",
"optree>=0.13.0",
"psutil",
"sympy>=1.13.3",
"typing-extensions>=4.13.2",
"wheel",
]
[project]
name = "torch"
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"

View File

@ -58,8 +58,8 @@
# USE_FBGEMM=0
# disables the FBGEMM build
#
# USE_FBGEMM_GENAI=1
# enables the FBGEMM GenAI kernels to build
# USE_FBGEMM_GENAI=0
# disables the FBGEMM GenAI build
#
# USE_KINETO=0
# disables usage of libkineto library for profiling

View File

@ -40,8 +40,16 @@ set(NATIVERT_TEST_SRCS
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
${TORCH_ROOT}/torch/nativert/kernels/KernelHandlerRegistry.cpp
${TORCH_ROOT}/torch/nativert/kernels/TritonKernel.cpp
${TORCH_ROOT}/torch/nativert/executor/triton/CpuTritonKernelManager.cpp
${TORCH_ROOT}/torch/nativert/executor/DelegateExecutor.cpp
)
if(USE_CUDA)
list(APPEND NATIVERT_TEST_SRCS ${TORCH_ROOT}/torch/nativert/executor/triton/CudaTritonKernelManager.cpp)
endif()
add_executable(test_nativert
${TORCH_ROOT}/test/cpp/common/main.cpp
${NATIVERT_TEST_SRCS}

View File

@ -0,0 +1,14 @@
#include <gtest/gtest.h>
#include <torch/nativert/kernels/TritonKernel.h>
using namespace ::testing;
using namespace torch::nativert;
TEST(TritonKernelManagerRegistrationTests, TestRegister) {
#ifndef USE_CUDA
EXPECT_TRUE(create_cuda_triton_kernel_manager == nullptr);
#else
EXPECT_FALSE(create_cuda_triton_kernel_manager == nullptr);
#endif // USE_CUDA
}

View File

@ -12,7 +12,7 @@ from torch.distributed.fsdp import fully_shard
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import FSDPTest, get_devtype, MLPStack
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
@ -123,6 +123,7 @@ class TestClipGradNormWorldSize4(_TestClipGradNormBase):
return min(torch.get_device_module(device_type).device_count(), 4)
@skip_if_lt_x_gpu(4)
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1661
def test_clip_grad_norm_2d(self):
for norm_type in (2, 1, 3, float("inf")):
dp_size = 2

View File

@ -5,6 +5,7 @@ import functools
import itertools
import os
import tempfile
import unittest
from typing import Callable, Optional, Union
from unittest.mock import MagicMock
@ -54,7 +55,7 @@ from torch.testing._internal.common_fsdp import (
patch_reshard,
patch_unshard,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
@ -414,6 +415,7 @@ class TestFullyShardCommunication(FSDPTest):
)
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1571
def test_set_reduce_scatter_divide_factor(self):
self.run_subtests(
{"divide_factor": [self.world_size * 2, self.world_size]},
@ -1454,6 +1456,9 @@ class TestFullyShardForceSumReduction(FSDPTest):
# Test reduce-scatter only on plain FSDP on 2 GPUs
@skip_if_lt_x_gpu(2)
@unittest.skipIf(
TEST_XPU, "Related environment variable is not supported with XCCL"
)
def test_fully_shard_force_sum_reduce_scatter(self):
torch.manual_seed(42)
model_args = ModelArgs()
@ -1506,6 +1511,9 @@ class TestFullyShardForceSumReduction(FSDPTest):
# Test both reduce-scatter and all-reduce on HSDP (DDP+FSDP) on 4 GPUs
@skip_if_lt_x_gpu(4)
@unittest.skipIf(
TEST_XPU, "Related environment variable is not supported with XCCL"
)
def test_fully_shard_force_sum_both_reductions(self):
mesh = init_device_mesh(
device_type.type, (2, self.world_size // 2), mesh_dim_names=("ddp", "fsdp")

View File

@ -133,7 +133,7 @@ class TestFullyShardCompile(FSDPTest):
device_type.type,
self.rank % torch.get_device_module(device_type).device_count(),
)
if not sm_is_or_higher_than(device, 8, 0):
if device_type.type == "cuda" and not sm_is_or_higher_than(device, 8, 0):
self.skipTest("bf16 requires sm >= 8.0")
def test_dynamo_trace_use_training_state(self):

View File

@ -24,7 +24,7 @@ from torch.testing._internal.common_fsdp import (
patch_register_post_backward_hook_backward,
reduce_scatter_with_assert,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
device_type = torch.device(get_devtype())
@ -36,6 +36,7 @@ class TestFullyShardFrozen(FSDPTest):
return min(4, torch.get_device_module(device_type).device_count())
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_train_mixed_requires_grad_per_group(self):
"""
Tests training parity with DDP when mixing frozen and non-frozen

View File

@ -8,7 +8,12 @@ import torch
from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, OffloadPolicy
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import FSDPTest, get_devtype
from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
from torch.testing._internal.common_utils import (
run_tests,
TEST_CUDA,
TEST_HPU,
TEST_XPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
@ -236,14 +241,15 @@ class TestFullyShardMemory(FSDPTest):
def _get_peak_active_memory_mb(self) -> int:
mem_stats = torch.get_device_module(device_type).memory_stats()
if TEST_CUDA:
if TEST_CUDA or TEST_XPU:
return round(mem_stats["active_bytes.all.peak"] / 1e6)
if TEST_HPU:
return round(mem_stats["MaxInUse"] / 1e6)
def _get_curr_active_memory_mb(self) -> int:
mem_stats = torch.get_device_module(device_type).memory_stats()
if TEST_CUDA:
if TEST_CUDA or TEST_XPU:
return round(mem_stats["active_bytes.all.current"] / 1e6)
if TEST_HPU:
return round(mem_stats["InUse"] / 1e6)

View File

@ -28,7 +28,13 @@ from torch.testing._internal.common_fsdp import (
patch_reduce_scatter,
reduce_scatter_with_assert,
)
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TEST_HPU
from torch.testing._internal.common_utils import (
run_tests,
skipIfRocmVersionLessThan,
TEST_HPU,
TEST_XPU,
xfailIf,
)
device_type = torch.device(get_devtype())
@ -86,9 +92,10 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
use_shard_placement_fn_vals.append(True)
return use_shard_placement_fn_vals
@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skipIfRocmVersionLessThan((7, 0))
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_compute_dtype(self):
use_shard_placement_fn_vals = (
self._get_use_shard_placement_fn_vals_for_bf16_reduce()
@ -166,9 +173,10 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
self.assertEqual(fsdp_loss, ref_loss)
check_sharded_parity(self, ref_model, model)
@skipIfRocm # regressed in ROCm 6.4, but ROCm 6.5 fixes it
@skipIfRocmVersionLessThan((7, 0))
@skip_if_lt_x_gpu(2)
@requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_reduce_dtype(self):
self.run_subtests(
{
@ -291,6 +299,7 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
check_sharded_parity(self, ref_model, model)
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_grad_acc_with_reduce_dtype(self):
"""
Tests that gradient accumulation without reduce-scatter when using
@ -610,7 +619,7 @@ class TestFullyShardMixedPrecisionCasts(FSDPTestMultiThread):
torch.bfloat16, torch.bfloat16, torch.bfloat16, True
)
model = Model()
inp = Input(torch.randn(2, 10).cuda())
inp = Input(torch.randn(2, 10).to(device_type))
fully_shard(model, mp_policy=mp_policy)
loss = model(inp).sum()

View File

@ -42,7 +42,9 @@ from torch.testing._internal.common_utils import (
get_cycles_per_ms,
run_tests,
TEST_HPU,
TEST_XPU,
wrapSwapTensorsTest,
xfailIf,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
@ -324,7 +326,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep kernel not supported for HPU")
@unittest.skipIf(TEST_HPU or TEST_XPU, "Sleep kernel not supported for HPU/XPU")
@compiled_fsdp_test(compile_compute_on_module=Transformer)
def test_train_parity_multi_group(self):
"""
@ -347,7 +349,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
@unittest.skipIf(TEST_HPU or TEST_XPU, "sleep kernel not supported on HPU/XPU")
def test_train_parity_multi_group_cpu_offload_eager(self):
"""
Tests train parity against DDP when using multiple parameter groups for
@ -371,7 +373,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
@unittest.skipIf(TEST_HPU or TEST_XPU, "sleep kernel not supported on HPU/XPU")
@compiled_fsdp_test(compile_compute_on_module=Transformer)
def test_train_parity_multi_group_unshard_async_op(self):
"""
@ -495,6 +497,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_XPU, "Sleep is not supported on XPU")
def test_non_root_forward_backward(self):
"""
Tests running forward/backward through the root and then through a
@ -625,7 +628,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
@unittest.skipIf(TEST_HPU or TEST_XPU, "Sleep is not supported on HPU/XPU")
def test_post_optim_event(self):
torch.manual_seed(42)
model_args = ModelArgs(dropout_p=0.0)
@ -678,6 +681,7 @@ class TestFullyShard1DTrainingCompose(FSDPTest):
@skip_if_lt_x_gpu(2)
@compiled_fsdp_test(compile_compute_on_module=Transformer)
@xfailIf(TEST_XPU) # https://github.com/intel/torch-xpu-ops/issues/1661
def test_train_parity_with_activation_checkpointing(self):
"""
Tests train parity against DDP when composing with activation
@ -930,6 +934,7 @@ class TestFullyShardGradientAccumulation(FSDPTest):
return min(4, torch.get_device_module(device_type).device_count())
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_gradient_accumulation(self):
"""
Tests gradient accumulation with/without gradient reduction and
@ -1111,6 +1116,7 @@ class TestFullyShardGradientAccumulation(FSDPTest):
_optim.zero_grad(set_to_none=(iter_idx % 2))
@skip_if_lt_x_gpu(2)
@xfailIf(TEST_XPU) # https://github.com/pytorch/pytorch/issues/156782
def test_1f1b_microbatching(self):
self.run_subtests(
{

View File

@ -7,7 +7,6 @@ import torch.nn as nn
from torch.distributed._tools.mem_tracker import MemTracker
from torch.testing._internal.common_utils import (
run_tests,
skipIfRocm,
skipIfTorchDynamo,
TEST_CUDA,
TEST_XPU,
@ -34,7 +33,6 @@ class TestMemTracker(TestCase):
@unittest.skipIf(
not TEST_CUDA and not TEST_XPU, "Neither CUDA nor XPU is available"
)
@skipIfRocm()
def test_accelerator_tracker_equivalence(
self,
):

View File

@ -1,7 +1,7 @@
# Owner(s): ["oncall: distributed checkpointing"]
import tempfile
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch
import torch
from torch.distributed.checkpoint.metadata import MetadataIndex
@ -23,38 +23,70 @@ class TestQuantizedHfStorage(TestCase):
self.temp_dir.cleanup()
def test_dequantization(self):
"""Test that quantized tensors are properly dequantized during read operations."""
"""Test quantized tensors with weights and scales in both same and different files."""
reader = QuantizedHuggingFaceStorageReader(self.path, thread_count=1)
# Test data
quantized_tensor = torch.ones(4, 4, dtype=torch.float32)
scale_inv = torch.tensor([[2.0]], dtype=torch.float32)
# Test data for two different weights
quantized_tensor1 = torch.ones(4, 4, dtype=torch.float32)
quantized_tensor2 = (
torch.ones(4, 4, dtype=torch.float32) * 3.0
) # Different values
scale_inv1 = torch.tensor([[2.0]], dtype=torch.float32)
scale_inv2 = torch.tensor([[0.5]], dtype=torch.float32) # Different scale
# Mock the safetensors file for reading data
mock_file = MagicMock()
# Define weight and scale tensor names
weight1_fqn = "model.layers.0.self_attn.q_proj.weight" # Scale in same file
scale1_fqn = "model.layers.0.self_attn.q_proj.weight_scale_inv"
weight2_fqn = (
"model.layers.0.self_attn.k_proj.weight" # Scale in different file
)
scale2_fqn = "model.layers.0.self_attn.k_proj.weight_scale_inv"
# Mock get_slice to return a tensor that can be sliced
def mock_get_slice(tensor_name):
mock_tensor = MagicMock()
mock_tensor.__getitem__ = lambda self, slices: quantized_tensor
return mock_tensor
mock_file.get_slice = mock_get_slice
mock_file.get_tensor.return_value = scale_inv
file1_name = "model-00001-of-00002.safetensors"
file2_name = "model-00002-of-00002.safetensors"
# Setup weight-scale mapping and file locations
reader._weight_scale_mapping = {
"model.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
weight1_fqn: scale1_fqn,
weight2_fqn: scale2_fqn,
}
reader._weight_map = {
weight1_fqn: file1_name, # Weight in file 1
scale1_fqn: file1_name, # Scale also in file 1 (same file scenario)
weight2_fqn: file1_name, # Weight in file 1
scale2_fqn: file2_name, # Scale in file 2 (different file scenario)
}
# Create a read request for quantized tensor
read_item = ReadItem(
# Mock the main safetensors file (file1)
mock_file1 = MagicMock()
# Mock get_slice to return different tensors based on tensor name
def mock_get_slice(tensor_name):
mock_tensor = MagicMock()
if tensor_name == weight1_fqn:
mock_tensor.__getitem__ = lambda _, __: quantized_tensor1
elif tensor_name == weight2_fqn:
mock_tensor.__getitem__ = lambda _, __: quantized_tensor2
return mock_tensor
mock_file1.get_slice = mock_get_slice
# Mock get_tensor for same-file scale (scale1)
mock_file1.get_tensor.return_value = scale_inv1
# Mock the cross-file safetensors file (file2) for scale2
mock_file2 = MagicMock()
mock_file2.get_tensor.return_value = scale_inv2
# Test 1: Same-file scenario (weight1 + scale1 both in file1)
read_item1 = ReadItem(
type=LoadItemType.TENSOR,
storage_index=MetadataIndex(
fqn="model.layers.0.self_attn.kv_b_proj.weight",
fqn=weight1_fqn,
offset=torch.Size([0, 0]),
),
dest_index=MetadataIndex(
fqn="model.layers.0.self_attn.kv_b_proj.weight",
fqn=weight1_fqn,
offset=torch.Size([0, 0]),
),
storage_offsets=[0, 0],
@ -62,22 +94,73 @@ class TestQuantizedHfStorage(TestCase):
lengths=[4, 4],
)
# Mock planner
target_tensor = torch.zeros(4, 4, dtype=torch.float32)
mock_planner = MagicMock()
mock_planner.resolve_tensor.return_value = target_tensor
target_tensor1 = torch.zeros(4, 4, dtype=torch.float32)
mock_planner1 = MagicMock()
mock_planner1.resolve_tensor.return_value = target_tensor1
# Test the _process_read_request method
reader._process_read_request(mock_file, read_item, mock_planner)
# Process first weight (same file scenario)
reader._process_read_request(mock_file1, read_item1, mock_planner1)
# Verify the tensor was dequantized (ones * 2.0 = twos)
expected_result = torch.ones(4, 4, dtype=torch.float32) * 2.0
mock_planner.commit_tensor.assert_called_once()
# Verify first tensor was dequantized (ones * 2.0 = twos)
expected_result1 = torch.ones(4, 4, dtype=torch.float32) * 2.0
mock_planner1.commit_tensor.assert_called_once()
# Check that target_tensor was updated correctly
args, _ = mock_planner.commit_tensor.call_args
committed_tensor = args[1] # second argument is the tensor
torch.testing.assert_close(committed_tensor, expected_result)
# Check that target_tensor1 was updated correctly
args1, _ = mock_planner1.commit_tensor.call_args
committed_tensor1 = args1[1]
torch.testing.assert_close(committed_tensor1, expected_result1)
# Test 2: Cross-file scenario (weight2 in file1, scale2 in file2)
read_item2 = ReadItem(
type=LoadItemType.TENSOR,
storage_index=MetadataIndex(
fqn=weight2_fqn,
offset=torch.Size([0, 0]),
),
dest_index=MetadataIndex(
fqn=weight2_fqn,
offset=torch.Size([0, 0]),
),
storage_offsets=[0, 0],
dest_offsets=[0, 0],
lengths=[4, 4],
)
target_tensor2 = torch.zeros(4, 4, dtype=torch.float32)
mock_planner2 = MagicMock()
mock_planner2.resolve_tensor.return_value = target_tensor2
# Mock the entire safetensors module since it may not be available in test environment
mock_safetensors = MagicMock()
mock_safe_open = MagicMock()
mock_safetensors.safe_open = mock_safe_open
# Set up the mock to return a context manager that yields mock_file2
mock_safe_open.return_value.__enter__.return_value = mock_file2
mock_safe_open.return_value.__exit__.return_value = False
# Mock the module import and safe_open function
with patch.dict("sys.modules", {"safetensors": mock_safetensors}):
# Process second weight (cross-file scenario)
reader._process_read_request(mock_file1, read_item2, mock_planner2)
# Verify safe_open was called with the correct file path
expected_path = f"{self.path}/{file2_name}"
mock_safe_open.assert_called_once()
call_args = mock_safe_open.call_args[0]
self.assertEqual(str(call_args[0]), expected_path)
# Verify the scale tensor was loaded from the correct file
mock_file2.get_tensor.assert_called_once_with(scale2_fqn)
# Verify second tensor was dequantized (3.0 * 0.5 = 1.5)
expected_result2 = torch.ones(4, 4, dtype=torch.float32) * 3.0 * 0.5 # 1.5
mock_planner2.commit_tensor.assert_called_once()
# Check that target_tensor2 was updated correctly
args2, _ = mock_planner2.commit_tensor.call_args
committed_tensor2 = args2[1]
torch.testing.assert_close(committed_tensor2, expected_result2)
if __name__ == "__main__":

View File

@ -116,7 +116,6 @@ class DistributedUtilTest(TestCase):
timeout=1,
)
@skipIfRocm
def test_create_store_timeout_on_worker(self):
with self.assertRaises(DistNetworkError):
# use any available port (port 0) since timeout is expected

View File

@ -44,8 +44,11 @@ class TestFlattenParams(FSDPTest):
return 1
def _get_default_config(self):
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
)
return {
"device": torch.device("cuda"),
"device": torch.device(device_type),
"sharding_strategy": HandleShardingStrategy.FULL_SHARD,
"offload_params": False,
"mp_param_dtype": None,

View File

@ -31,6 +31,8 @@ if TEST_WITH_DEV_DBG_ASAN:
)
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
class Model(nn.Module):
def __init__(
@ -47,7 +49,6 @@ class Model(nn.Module):
nn.AdaptiveAvgPool2d(output_size=(1, 1)),
nn.Flatten(),
)
self.device = torch.cuda.current_device()
self.head = nn.Linear(64, 10)
if with_fsdp and freeze_after_wrap_fsdp:
self.fsdp_wrap(fsdp_kwargs)
@ -145,7 +146,7 @@ class TestFreezingWeights(FSDPTest):
forward_prefetch,
):
torch.manual_seed(0)
batch = torch.randn(size=(2, 3, 224, 224)).cuda()
batch = torch.randn(size=(2, 3, 224, 224)).to(device_type)
fsdp_kwargs = {
"device_id": self.rank,
@ -164,7 +165,7 @@ class TestFreezingWeights(FSDPTest):
disable_autograd,
fsdp_kwargs,
)
model = model.cuda()
model = model.to(device_type)
# freezing the trunk using requires_grad.
if freezing_method == FreezingMethod.RequiresGrad:
@ -178,7 +179,7 @@ class TestFreezingWeights(FSDPTest):
else:
model = DistributedDataParallel(model, **ddp_kwargs)
target = torch.tensor([0, 1], dtype=torch.long).cuda()
target = torch.tensor([0, 1], dtype=torch.long).to(device_type)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

View File

@ -49,6 +49,8 @@ if TEST_WITH_DEV_DBG_ASAN:
)
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
@contextlib.contextmanager
def patch_allreduce(new_allreduce):
@ -97,7 +99,7 @@ class ShardingStrategyMode(Enum):
class TestFSDPHybridShard(FSDPTest):
@property
def world_size(self):
return max(torch.cuda.device_count(), 2)
return max(torch.accelerator.device_count(), 2)
@property
def process_group(self):
@ -105,7 +107,7 @@ class TestFSDPHybridShard(FSDPTest):
@skip_if_lt_x_gpu(2)
def test_raises_manual_wrap_hybrid_shard_when_none_policy(self):
model = MyModel().cuda()
model = MyModel().to(device_type)
err_ctx = self.assertRaisesRegex(
ValueError,
"requires explicit specification of process group or device_mesh.",
@ -119,8 +121,8 @@ class TestFSDPHybridShard(FSDPTest):
@skip_if_lt_x_gpu(4)
def test_hsdp_save_load_state_dict(self):
model = MyModel().cuda()
num_node_devices = torch.cuda.device_count()
model = MyModel().to(device_type)
num_node_devices = torch.accelerator.device_count()
shard_rank_lists = (
list(range(0, num_node_devices // 2)),
list(range(num_node_devices // 2, num_node_devices)),
@ -161,7 +163,7 @@ class TestFSDPHybridShard(FSDPTest):
msd = model.state_dict()
osd = FSDP.optim_state_dict(model, optim)
load_model = fsdp_ctor(MyModel().cuda())
load_model = fsdp_ctor(MyModel().to(device_type))
load_optim = torch.optim.AdamW(load_model.parameters())
with FSDP.state_dict_type(load_model, StateDictType.SHARDED_STATE_DICT):
load_model.load_state_dict(msd)
@ -170,8 +172,8 @@ class TestFSDPHybridShard(FSDPTest):
@skip_if_lt_x_gpu(4)
def test_hsdp_sync_module_state(self):
model = MyModel().cuda()
num_node_devices = torch.cuda.device_count()
model = MyModel().to(device_type)
num_node_devices = torch.accelerator.device_count()
shard_rank_lists = (
list(range(0, num_node_devices // 2)),
list(range(num_node_devices // 2, num_node_devices)),
@ -214,7 +216,7 @@ class TestFSDPHybridShard(FSDPTest):
@skip_if_lt_x_gpu(2)
def test_invalid_pg_specification_raises(self):
pol = ModuleWrapPolicy({nn.Linear})
model = MyModel().cuda()
model = MyModel().to(device_type)
with self.assertRaisesRegex(
ValueError, "Expected process_group to be passed in"
):
@ -260,7 +262,7 @@ class TestFSDPHybridShard(FSDPTest):
use_device_mesh: bool,
):
if use_device_mesh:
device_mesh = init_device_mesh("cuda", (1, self.world_size))
device_mesh = init_device_mesh(device_type, (1, self.world_size))
else:
device_mesh = None
hsdp_model = self._init_hsdp_model(
@ -316,7 +318,7 @@ class TestFSDPHybridShard(FSDPTest):
patch_allreduce(patched_allreduce),
patch_reduce_scatter(patched_reduce_scatter),
):
inp = hsdp_model.get_input(device=torch.cuda.current_device())
inp = hsdp_model.get_input(device=torch.accelerator.current_device_index())
out = hsdp_model(inp[0], inp[1])
loss = hsdp_model.get_loss(inp, out)
loss.backward()
@ -365,7 +367,7 @@ class TestFSDPHybridShard(FSDPTest):
hsdp_optim = torch.optim.Adam(hsdp_model.parameters(), lr=1e-2)
torch.manual_seed(global_pg.rank() + 1)
for _ in range(5):
inp = fsdp_model.module.get_input(torch.device("cuda"))
inp = fsdp_model.module.get_input(torch.device(device_type))
losses: list[torch.Tensor] = []
for model, optim in ((fsdp_model, fsdp_optim), (hsdp_model, hsdp_optim)):
optim.zero_grad()
@ -381,7 +383,7 @@ class TestFSDPHybridShard(FSDPTest):
)
hsdp_kwargs = {
"auto_wrap_policy": auto_wrap_policy,
"device_id": torch.cuda.current_device(),
"device_id": torch.accelerator.current_device_index(),
"use_orig_params": use_orig_params,
}
fsdp_model = TransformerWithSharedParams.init(
@ -408,7 +410,7 @@ class TestFSDPHybridShard(FSDPTest):
{TransformerEncoderLayer, TransformerDecoderLayer},
)
hsdp_kwargs = {
"device_id": torch.cuda.current_device(),
"device_id": torch.accelerator.current_device_index(),
"auto_wrap_policy": auto_wrap_policy,
"sharding_strategy": hsdp_sharding_strategy,
"use_orig_params": use_orig_params,
@ -435,7 +437,7 @@ class TestFSDPHybridShard(FSDPTest):
# Use `FULL_SHARD` for the embedding and output projection
hsdp_model = FSDP(
model,
device_id=torch.cuda.current_device(),
device_id=torch.accelerator.current_device_index(),
sharding_strategy=ShardingStrategy.FULL_SHARD,
use_orig_params=use_orig_params,
)

View File

@ -36,6 +36,8 @@ if TEST_WITH_DEV_DBG_ASAN:
)
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
class Model(torch.nn.Module):
def __init__(self) -> None:
@ -94,9 +96,9 @@ class ModelWithIgnoredModules(Model):
class TestFSDPIgnoredModules(FSDPTest):
@property
def world_size(self):
return min(torch.cuda.device_count(), 2)
return min(torch.accelerator.device_count(), 2)
def _train_model(self, model, optim, num_iters, device=torch.device("cuda")):
def _train_model(self, model, optim, num_iters, device=torch.device(device_type)):
for _ in range(num_iters):
module = model.module if isinstance(model, FSDP) else model
inp = module.get_input(device)
@ -198,7 +200,7 @@ class TestFSDPIgnoredModules(FSDPTest):
# Initialize an FSDP-wrapped nested model that first wraps the nested
# sequential's second linear layer (`layer1[1]`) and then wraps the
# overall model while ignoring the nested sequential (`layer1`)
model = Model().cuda()
model = Model().to(device_type)
fsdp_fn = functools.partial(FSDP, use_orig_params=use_orig_params)
model.layer1[1] = fsdp_fn(model.layer1[1])
if ignore_modules:
@ -246,7 +248,7 @@ class TestFSDPIgnoredModules(FSDPTest):
)
def _test_ignored_states_auto_wrap(self, policy, ignore_bias: bool):
model = Model().cuda()
model = Model().to(device_type)
ignored_states = [model.layer1[1].weight]
if ignore_bias:
ignored_states.append(model.layer1[1].bias)
@ -285,7 +287,7 @@ class TestFSDPIgnoredModules(FSDPTest):
def test_ignored_modules_invalid(self):
"""Tests that passing an FSDP module as an ignored module or the
top-level module itself errors."""
model = Model().cuda()
model = Model().to(device_type)
wrap_cls = FSDP
model.layer1 = wrap_cls(model.layer1)
# Passing an FSDP module as an ignored module should error
@ -302,7 +304,7 @@ class TestFSDPIgnoredModules(FSDPTest):
):
# FSDP does not allow to wrap the same model twice, so create
# a new local model here.
new_model = Model().cuda()
new_model = Model().to(device_type)
wrap_cls(new_model, ignored_modules=[new_model])
@skip_if_lt_x_gpu(2)
@ -334,7 +336,7 @@ class TestFSDPIgnoredModules(FSDPTest):
# we wrap `layer3` with FSDP, where `layer3` is registered as a module
# after `layer1`, which has the variable number of ignored modules
wrap_cls = FSDP
model = ModelWithIgnoredModules(num_ignored=self.rank + 1).cuda()
model = ModelWithIgnoredModules(num_ignored=self.rank + 1).to(device_type)
layer1_ignored_modules = [
m for m in model.layer1.modules() if isinstance(m, IgnoredModule)
]
@ -370,7 +372,7 @@ class TestFSDPIgnoredModules(FSDPTest):
@skip_if_lt_x_gpu(2)
@parametrize("ignore_modules", [True, False])
def test_ignored_modules_not_under_wrapped_root(self, ignore_modules: bool):
model = Model().cuda()
model = Model().to(device_type)
ignored_modules = list(model.layer1.children())[1:]
ignore_kwargs = (
@ -409,7 +411,7 @@ class TestFSDPIgnoredModules(FSDPTest):
)
def _test_ignored_states_check(self, ignore_modules: bool):
model = Model().cuda()
model = Model().to(device_type)
ignored_modules = list(model.layer1.children())[1:]
ignored_params = {p for m in ignored_modules for p in m.parameters()}
ignored_states = ignored_params.union(set(ignored_modules))

View File

@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
TEST_CUDA,
TEST_HPU,
TEST_WITH_DEV_DBG_ASAN,
)
@ -31,11 +32,14 @@ if TEST_WITH_DEV_DBG_ASAN:
)
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def get_cur_mem(rank, result, prefix):
"""Collect memory allocated values in a result dict in MB"""
torch._C._cuda_clearCublasWorkspaces()
result[prefix] = round(torch.cuda.memory_allocated() / 1024 / 1024)
if TEST_CUDA:
torch._C._cuda_clearCublasWorkspaces()
result[prefix] = round(torch.accelerator.memory_allocated() / 1024 / 1024)
class Model(nn.Module):
@ -110,14 +114,14 @@ class TestFSDPMemory(FSDPTest):
def _dist_train(self, with_checkpoint, expected, model_hidden_dim, iterations):
gpu_id = self.rank
batch = torch.randn(size=(2, 3, 224, 224)).cuda()
batch = torch.randn(size=(2, 3, 224, 224)).to(device_type)
model = create_model(
with_fsdp=True,
with_checkpoint=with_checkpoint,
model_hidden_dim=model_hidden_dim,
)
model = model.cuda()
model = model.to(device_type)
model = FSDP(model)
# We enable momentum so that after the first iteration, the optimizer state is added
@ -133,7 +137,7 @@ class TestFSDPMemory(FSDPTest):
get_cur_mem(gpu_id, results, f"iter {iteration}: after fwd")
out = sum(o.sum() for o in out[0])
fake_loss = criterion(out, torch.tensor(0.0).cuda())
fake_loss = criterion(out, torch.tensor(0.0).to(device_type))
get_cur_mem(gpu_id, results, f"iter {iteration}: after loss")
fake_loss.backward()
@ -167,8 +171,8 @@ class TestFSDPMemory(FSDPTest):
model = create_model(
with_fsdp=False, with_checkpoint=False, model_hidden_dim=model_hidden_dim
).cuda()
model_size_mb = round(torch.cuda.memory_allocated() / 1024 / 1024)
).to(device_type)
model_size_mb = round(torch.accelerator.memory_allocated() / 1024 / 1024)
del model
sharded_model_size_mb = int(model_size_mb / self.world_size)

View File

@ -43,6 +43,8 @@ if TEST_WITH_DEV_DBG_ASAN:
)
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def _reset_params_if_meta(is_meta: bool, model: nn.Module):
# For torchdistX init, we don't need to call reset_params, as
@ -117,7 +119,7 @@ def _init_with_reset_params(module: nn.Module):
)
)
if has_meta_states:
device = torch.device("cuda", torch.cuda.current_device())
device = torch.device(device_type, torch.accelerator.current_device_index())
module.to_empty(device=device, recurse=False)
module.reset_parameters()
@ -164,13 +166,13 @@ class TestFSDPWithMetaDevice(FSDPTest):
# Test to make sure it is the same model parameters as regular FSDP
# approach.
regular = MyModel(device="cuda")
regular = MyModel(device=device_type)
_reset_params_if_meta(is_meta, regular)
fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap)
regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3)
self._compare_fsdp(fsdp_meta, fsdp_regular)
inp = torch.randn(10, 2, device="cuda")
inp = torch.randn(10, 2, device=device_type)
fsdp_meta(inp).sum().backward()
fsdp_regular(inp).sum().backward()
meta_opt.step()
@ -182,7 +184,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
model = meta_module_fn()
fsdp_meta = FSDP(model, param_init_fn=init_fn)
meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3)
regular = MyModel(device="cuda")
regular = MyModel(device=device_type)
_reset_params_if_meta(is_meta, regular)
fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap)
regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3)
@ -217,7 +219,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
)
def test_simple_model_with_torchdistX_default_init(self):
def meta_module_fn():
return deferred_init.deferred_init(MyModel, device="cuda")
return deferred_init.deferred_init(MyModel, device=device_type)
self._test_simple_model_with_meta_device(meta_module_fn)
@ -228,7 +230,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
)
def test_simple_model_with_torchdistX_init_fn(self):
def meta_module_fn():
return deferred_init.deferred_init(MyModel, device="cuda")
return deferred_init.deferred_init(MyModel, device=device_type)
self._test_simple_model_with_meta_device(
meta_module_fn, init_fn=_init_with_torchdistX
@ -248,7 +250,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
param_init_fn=init_fn,
)
meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3)
module_regular = NestedModel(device="cuda")
module_regular = NestedModel(device=device_type)
_reset_params_if_meta(is_meta, module_regular)
fsdp_regular = FSDP(
module_regular,
@ -269,7 +271,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
# Init and reset parameters before wrapping so that reset_params
# matches up with meta device's initialization.
module_regular = NestedModel(device="cuda")
module_regular = NestedModel(device=device_type)
_reset_params_if_meta(is_meta, module_regular)
with enable_wrap(wrapper_cls=FSDP):
module_regular.lin1 = wrap(module_regular.lin1)
@ -279,7 +281,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
# Compare it before training
self._compare_fsdp(fsdp_meta, fsdp_regular)
inp = torch.randn(10, 2, device="cuda")
inp = torch.randn(10, 2, device=device_type)
fsdp_meta(inp).sum().backward()
fsdp_regular(inp).sum().backward()
meta_opt.step()
@ -317,7 +319,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
@parametrize("auto_wrap", [True, False])
def test_nested_model_with_torchdistX_default_init(self, auto_wrap):
def meta_module_fn():
return deferred_init.deferred_init(NestedModel, device="cuda")
return deferred_init.deferred_init(NestedModel, device=device_type)
self._test_nested_model_with_meta_device(
auto_wrap=auto_wrap, meta_module_fn=meta_module_fn
@ -331,7 +333,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
@parametrize("auto_wrap", [True, False])
def test_nested_model_with_torchdistX_init_fn(self, auto_wrap):
def meta_module_fn():
return deferred_init.deferred_init(NestedModel, device="cuda")
return deferred_init.deferred_init(NestedModel, device=device_type)
self._test_nested_model_with_meta_device(
auto_wrap=auto_wrap,
@ -351,7 +353,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
)
def test_bad_arg_torchdistx(self):
def meta_module_fn():
return deferred_init.deferred_init(NestedModel, "cuda")
return deferred_init.deferred_init(NestedModel, device_type)
self._test_bad_arg(meta_module_fn)
@ -401,7 +403,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
# TODO: `module.to_empty()` is not generally correct for meta
# device initialization.
# https://github.com/pytorch/pytorch/issues/90465
module.to_empty(device=torch.device("cuda"))
module.to_empty(device=torch.device(device_type))
module.apply(model._module_init_fn)
model = Model()
@ -414,7 +416,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
param_dtype=torch.float32, reduce_dtype=torch.float16
),
param_init_fn=_param_init_fn,
device_id=torch.cuda.current_device(),
device_id=torch.accelerator.current_device_index(),
)
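A minimal sketch (not from the diff) of the device-selection idiom these meta-device tests adopt, assuming torch.accelerator is available; on a CPU-only build it falls back to a plain CPU device.

import torch

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
if device_type != "cpu":
    # replaces torch.device("cuda", torch.cuda.current_device())
    device = torch.device(device_type, torch.accelerator.current_device_index())
else:
    device = torch.device("cpu")
inp = torch.randn(10, 2, device=device)  # unchanged across cuda, xpu, and cpu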

View File

@ -38,7 +38,6 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
skipIfRocm,
TEST_WITH_DEV_DBG_ASAN,
)
@ -514,7 +513,6 @@ class TestFSDPOptimState(FSDPTest):
continue
self.assertEqual(full_osd_value, ref_osd_pg[name])
@skipIfRocm
@skip_if_lt_x_gpu(2)
@parametrize("state_dict_type", STATE_DICT_TYPES)
@parametrize("use_multiple_param_groups", [False, True])

View File

@ -678,6 +678,9 @@ class RingFlexAttentionTest(DTensorTestBase):
@skip_if_lt_x_gpu(2)
@with_comms
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
def test_ring_flex_attention(self) -> None:
self.run_subtests(
{"qkv_size": [128 * self.world_size, 2048]},
@ -694,6 +697,9 @@ class RingFlexAttentionTest(DTensorTestBase):
# TODO: merge with the above test
@skip_if_lt_x_gpu(2)
@with_comms
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
def test_ring_flex_attention_document_mask(self) -> None:
random.seed(10)

View File

@ -848,6 +848,30 @@ class DTensorMeshTest(DTensorTestBase):
self.assertEqual(local_shard.shape, (4, 3))
self.assertEqual(local_shard, torch.ones(4, 3) + torch.ones(3))
@with_comms
def test_vmap_embedding(self):
mesh = self.build_device_mesh()
batch_size, seq_len = 2, 6
output_dim = 32
indices = torch.zeros(*(batch_size, seq_len), dtype=torch.int64)
indices[0, 1] = 1
indices[1, 3] = 1
indices[1, 5] = 1
indices = DTensor.from_local(indices, mesh, [Shard(0)])
emb = torch.randn(
*(batch_size, 8, output_dim),
dtype=torch.float32,
)
emb = DTensor.from_local(emb, mesh, [Shard(0)])
result = torch.vmap(F.embedding)(indices, emb)
expected = [F.embedding(indices[i], emb[i]) for i in range(batch_size)]
expected = torch.stack(expected)
local_result = result.to_local()
local_expected = expected.to_local()
self.assertEqual(local_result, local_expected)
@with_comms
def test_auto_implicit_replication(self):
mesh = self.build_device_mesh()

View File

@ -131,7 +131,7 @@ class DTensorConstructorTest(DTensorTestBase):
@with_comms
def test_zeros_full_mesh(self):
# construct a cuda device 1d mesh
# construct a gpu device 1d mesh
mesh = self.build_device_mesh()
placements = [Shard(0)]
size = [32, 3]
@ -157,7 +157,7 @@ class DTensorConstructorTest(DTensorTestBase):
self.assertEqual(local_tensor.size(), torch.Size([7, 3]))
self.assertEqual(torch.zeros(7, 3), local_tensor)
# construct a cuda device mesh with 2d: shard, replicate
# construct a gpu device mesh with 2d: shard, replicate
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size).reshape(2, 2))
placements = [Shard(0), Replicate()]
size = [32, 4]
@ -168,7 +168,7 @@ class DTensorConstructorTest(DTensorTestBase):
self.assertEqual(local_tensor.size(), torch.Size([16, 4]))
self.assertEqual(local_tensor, torch.zeros([16, 4]))
# construct a cuda device mesh with 2d: shard, shard
# construct a gpu device mesh with 2d: shard, shard
placements = [Shard(0), Shard(1)]
size = [32, 4]
dist_tensor = zeros(size, device_mesh=mesh, placements=placements)
@ -197,7 +197,7 @@ class DTensorConstructorTest(DTensorTestBase):
@with_comms
def test_zeros_submesh(self):
# default world_size is 4
# construct a cuda device 1d mesh, with no sub pg initialized
# construct a gpu device 1d mesh, with no sub pg initialized
sub_mesh_list = [0, 3]
mesh = DeviceMesh(self.device_type, sub_mesh_list)
placements = [Shard(0)]
@ -213,7 +213,7 @@ class DTensorConstructorTest(DTensorTestBase):
self.assertEqual(local_tensor.size(), torch.Size([0]))
self.assertEqual(local_tensor, torch.zeros(0))
# construct a cuda device 1d mesh: unevenly, with subpg initialized
# construct a gpu device 1d mesh: unevenly, with subpg initialized
sub_mesh_list = [0, 1, 3]
mesh = DeviceMesh(self.device_type, sub_mesh_list)
placements = [Shard(0)]
@ -233,7 +233,7 @@ class DTensorConstructorTest(DTensorTestBase):
self.assertEqual(local_tensor.size(), torch.Size([0]))
self.assertEqual(local_tensor, torch.tensor([]))
# construct a cuda device 2d mesh, with no subpg initialized
# construct a gpu device 2d mesh, with no subpg initialized
sub_mesh_list = [[0], [3]]
mesh = DeviceMesh(self.device_type, sub_mesh_list)
placements = [Shard(0), Shard(1)]

View File

@ -24,7 +24,7 @@ from torch.distributed.tensor.parallel import (
RowwiseParallel,
SequenceParallel,
)
from torch.testing._internal.common_utils import run_tests, skipIfRocm
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
skip_unless_torch_gpu,
@ -695,7 +695,6 @@ class DistMathOpsTest(DTensorTestBase):
self.assertEqual(grad1_norm.device_mesh, mesh_y)
@with_comms
@skipIfRocm
def test_foreach_add_different_mesh(self):
mesh_shape = (2, self.world_size // 2)
mesh_2d = init_device_mesh(

View File

@ -44,7 +44,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
shard_spec = [Shard(0)]
input_size = (8, 4)
# NOTE: currently random initialization on cuda device has different
# NOTE: currently random initialization on gpu device has different
# behavior from other devices. Unify the test once the behavior is unified.
if not is_rng_supported_mesh(device_mesh):
input_tensor = torch.randn(*input_size, device=self.device_type)
@ -97,7 +97,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
def test_init_with_user_generator(self):
device_mesh = self.build_device_mesh()
torch.manual_seed(42)
rng = torch.Generator(device="cuda").manual_seed(42)
rng = torch.Generator(device=self.device_type).manual_seed(42)
t1 = torch.distributed.tensor.empty(
(8, 3), device_mesh=device_mesh, placements=[Shard(0)]
)
@ -126,7 +126,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
# The DTensor random ops will use the same generator as the default one on the device.
# Note: this behavior changed, and now the guideline is to set the same RNG seed on all SPMD ranks.
torch.cuda.manual_seed(0)
torch.get_device_module(self.device_type).manual_seed(0)
device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
size = [1024, 2048]
meta_dtensor = distribute_tensor(
@ -592,8 +592,8 @@ class DistTensorRandomOpsTest3D(DTensorTestBase):
def world_size(self):
return 8
@with_comms
@skip_if_lt_x_gpu(8)
@with_comms
def test_hsdp_tp_model_meta_init(self):
# initialize the 3-d device mesh
global_mesh = init_device_mesh(

View File

@ -43,6 +43,7 @@ from torch.testing._internal.common_utils import (
retry_on_connect_failures,
run_tests,
TEST_WITH_DEV_DBG_ASAN,
TEST_XPU,
TestCase,
)
from torch.utils.checkpoint import checkpoint
@ -63,6 +64,8 @@ else:
torch.backends.cuda.matmul.allow_tf32 = False
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def gpus_for_rank(world_size):
"""Multigpu tests are designed to simulate the multi nodes with multi
@ -70,8 +73,9 @@ def gpus_for_rank(world_size):
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
device_count = torch.accelerator.device_count()
visible_devices = list(range(device_count))
gpus_per_process = device_count // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
@ -293,6 +297,23 @@ class ConvNet(nn.Module):
return self.conv3(x)
# A model involving FFTs, used to test DDP with complex tensors
class FFTModel(nn.Module):
def __init__(self, hin, win, n_features):
super().__init__()
self.hin = hin
self.win = win
self.weight = nn.Parameter(
torch.ones((n_features, n_features, hin, win // 2 + 1), dtype=torch.cfloat)
)
def forward(self, x):
xc = torch.fft.rfft2(x, s=(self.hin, self.win), dim=(-2, -1), norm="ortho")
xcw = torch.einsum("nchw,cohw->nohw", xc, self.weight)
x = torch.fft.irfft2(xcw, dim=(-2, -1), norm="ortho")
return x
class Task(nn.Module):
def __init__(self) -> None:
super().__init__()
@ -384,7 +405,7 @@ class CommonDistributedDataParallelTest:
gradient_as_bucket_view=gradient_as_bucket_view,
)
input = torch.randn(global_batch_size, 2).cuda(devices[0])
input = torch.randn(global_batch_size, 2).to(devices[0])
target = torch.randn(global_batch_size, 4)
return model, ddp_model, input, target
@ -418,10 +439,10 @@ class CommonDistributedDataParallelTest:
allow_none_grads=False,
):
# to reproduce the same training results
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
torch.manual_seed(31415)
model = copy.deepcopy(input_model).cuda()
ddp_model = copy.deepcopy(input_model).cuda()
model = copy.deepcopy(input_model).to(device_type)
ddp_model = copy.deepcopy(input_model).to(device_type)
ddp_model = nn.parallel.DistributedDataParallel(
ddp_model,
bucket_cap_mb=1,
@ -537,8 +558,8 @@ class CommonDistributedDataParallelTest:
def _prepare_dummy_data(self):
ddp_bs = 16
bs = ddp_bs * self.world_size
input = torch.rand((bs, 20), device="cuda", requires_grad=True)
target = torch.randn((bs, 20), device="cuda")
input = torch.rand((bs, 20), device=device_type, requires_grad=True)
target = torch.randn((bs, 20), device=device_type)
offset = self.rank * ddp_bs
ddp_input = input[offset : offset + ddp_bs]
ddp_target = target[offset : offset + ddp_bs]
@ -698,7 +719,7 @@ class CommonDistributedDataParallelTest:
Test that checkpointing with weight sharing works.
"""
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
for use_bucket_view, static_graph in product((False, True), (False, True)):
torch.manual_seed(31415)
l1 = nn.Linear(20, 20)
@ -721,7 +742,7 @@ class CommonDistributedDataParallelTest:
same layer twice and having weights shared across layers.
"""
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
for use_bucket_view in (True, False):
self._test_ddp_checkpointing(
self.CheckpointTwiceModuleWeightSharing(),
@ -1145,7 +1166,7 @@ class AbstractCommTest:
# Verify sequence numbers are appropriately incremented
for i in range(10):
t = torch.ones(1, device=torch.cuda.current_device())
t = torch.ones(1, device=device_type)
dist.all_reduce(t, group=process_group)
if not c10d._rank_not_in_group(process_group):
seq_num = self._verify_sequence_number_across_pg(
@ -1176,7 +1197,7 @@ class AbstractCommTest:
self.assertEqual(rank_to_seq_num[0] + 1, rank_to_seq_num[1])
def _test_sequence_num_incremented_default_group(self, backend_name):
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend_name,
@ -1190,7 +1211,7 @@ class AbstractCommTest:
)
def _test_sequence_num_incremented_subgroup(self, backend_name):
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend_name,
@ -1245,8 +1266,8 @@ class AbstractCommTest:
in_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size)))
group = dist.new_group(in_group_ranks)
x = torch.zeros(2, 2).cuda(self.rank)
xs = [torch.zeros(2, 2).cuda(self.rank) for _ in range(len(in_group_ranks))]
x = torch.zeros(2, 2).to(self.rank)
xs = [torch.zeros(2, 2).to(self.rank) for _ in range(len(in_group_ranks))]
if self.rank not in in_group_ranks:
msg = ".*{}.*does not belong to.*"
with self.assertWarnsOnceRegex(UserWarning, msg.format("all_gather")):
@ -1375,7 +1396,7 @@ class AbstractCommTest:
rank=self.rank,
store=store,
)
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# test alltoall_base
tensor = torch.tensor([1, 0, 0, 1], dtype=torch.bool, device=device)
zeros = torch.tensor([0, 0, 0, 0], dtype=torch.bool, device=device)
@ -1557,8 +1578,8 @@ class CommTest(AbstractCommTest, MultiProcessTestCase):
class DummyWork(dist._Work):
def wait(self, timeout=5.0):
if torch.cuda.is_available():
torch.cuda.current_stream().synchronize()
if torch.accelerator.is_available():
torch.accelerator.current_stream().synchronize()
return True
@ -1773,6 +1794,18 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
("cpu:gloo,cuda:nccl", "cpu:gloo,cuda:nccl"),
]
if TEST_XPU:
# Override backend_config_strings_and_expected_values for Intel GPU.
backend_config_strings_and_expected_values[4:10] = [
(dist.Backend.DUMMY, "cpu:dummy,cuda:dummy,xpu:dummy"),
("DUMMY", "cpu:dummy,cuda:dummy,xpu:dummy"),
("dummy", "cpu:dummy,cuda:dummy,xpu:dummy"),
("cpu:dummy,xpu:dummy", "cpu:dummy,xpu:dummy"),
("cpu:dummy,xpu:xccl", "cpu:dummy,xpu:xccl"),
("cpu:gloo,xpu:dummy", "cpu:gloo,xpu:dummy"),
("cpu:gloo,xpu:xccl", "cpu:gloo,xpu:xccl"),
]
for config_str, expected_value in backend_config_strings_and_expected_values:
with self.subTest(config_str):
# ensures these configs strings are valid and no ValueError is raised
@ -1783,6 +1816,8 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
invalid_backend_config_strings = [
"cpu:gloo,cuda:nccl,", # trailing comma
"cpu:gloo,cuda:nccl,cpu:dummy", # duplicate device
"cpu:gloo,xpu:xccl,", # trailing comma
"cpu:gloo,xpu:xccl,cpu:dummy", # duplicate device
]
for config_str in invalid_backend_config_strings:
with self.subTest(config_str):
@ -1797,7 +1832,7 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "6789"
dist.init_process_group(
"cpu:dummy,cuda:dummy", rank=self.rank, world_size=self.world_size
"cpu:dummy,cuda:dummy,xpu:dummy", rank=self.rank, world_size=self.world_size
)
# test all_gather
@ -2036,7 +2071,7 @@ dist.init_process_group(rank=0, world_size=1, store=dist.HashStore())
# correctly dispatched
# TODO: this will be updated in the future to not be backend specific
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# ensure supported devices (cpu, cuda) succeeds during dispatch call
tensor = torch.zeros(2, 2, device=torch.device(device))
# multi tensor collectives
@ -2102,7 +2137,7 @@ dist.init_process_group(rank=0, world_size=1, store=dist.HashStore())
rank=self.rank,
store=store,
)
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# test alltoall_base
input_tensor = torch.ones(2, 2, device=torch.device(device))
output_tensor = torch.zeros(2, 2, device=torch.device(device))
@ -2234,8 +2269,9 @@ class LocalRankTest(MultiProcessTestCase):
if __name__ == "__main__":
assert not torch.cuda._initialized, (
"test_distributed must not have initialized CUDA context on main process"
)
if device_type != "cpu":
assert not torch.get_device_module()._initialized, (
"test_distributed must not have initialized {device_type} context on main process"
)
run_tests()
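As a worked illustration (not part of the diff) of the even split gpus_for_rank describes above: with device_count visible accelerators and world_size ranks, each rank receives a contiguous block of device_count // world_size device indices. The concrete numbers below are hypothetical, and the contiguous slicing mirrors the existing helper.

# hypothetical: 8 visible accelerators split across 4 ranks
device_count, world_size = 8, 4
gpus_per_process = device_count // world_size  # 2 devices per rank
visible_devices = list(range(device_count))
gpus_for_rank = [
    visible_devices[rank * gpus_per_process : (rank + 1) * gpus_per_process]
    for rank in range(world_size)
]
assert gpus_for_rank == [[0, 1], [2, 3], [4, 5], [6, 7]]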

View File

@ -24,7 +24,7 @@ from torch.distributed._functional_collectives import (
from torch.testing._internal.common_cuda import SM90OrLater
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import ( # type: ignore[attr-defined]
@ -59,7 +59,7 @@ if not dist.is_available():
sys.exit(0)
@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
class TestWithNCCL(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
@ -75,13 +75,15 @@ class TestWithNCCL(MultiProcessTestCase):
@property
def device(self) -> torch.device:
return torch.device(f"cuda:{self.rank}")
return torch.device(self.rank)
def _init_process_group(self) -> None:
torch.cuda.set_device(self.device)
torch.accelerator.set_device_index(self.device.index)
store = dist.FileStore(self.file_name, self.world_size)
backend = dist.get_default_backend_for_device(self.device.type)
dist.init_process_group(
backend="nccl",
backend=backend,
world_size=self.world_size,
rank=self.rank,
store=store,
@ -273,7 +275,7 @@ class TestWithNCCL(MultiProcessTestCase):
)
# check memory leak
for i in range(1, 10):
mem_usage[i] = torch.cuda.max_memory_allocated()
mem_usage[i] = torch.accelerator.max_memory_allocated()
compiled(arg)
assert mem_usage[9] == mem_usage[8]
@ -370,14 +372,16 @@ class TestWithNCCL(MultiProcessTestCase):
@skip_if_lt_x_gpu(2)
def test_all_to_all_single(self) -> None:
self._init_process_group()
torch.cuda.set_device(self.device)
torch.accelerator.set_device_index(self.rank)
torch.manual_seed(42)
send_sz_matrix = torch.randint(0, 20, (self.world_size, self.world_size))
input_split_sizes = send_sz_matrix[self.rank].tolist()
output_split_sizes = send_sz_matrix[:, self.rank].tolist()
input = torch.full((sum(input_split_sizes),), float(self.rank)).cuda()
input = torch.full((sum(input_split_sizes),), float(self.rank)).to(
self.device.type
)
output = torch.ops._c10d_functional.all_to_all_single(
input,
@ -388,7 +392,7 @@ class TestWithNCCL(MultiProcessTestCase):
output = torch.ops._c10d_functional.wait_tensor(output)
expect = torch.cat(
[
torch.full((sz,), float(rank)).cuda()
torch.full((sz,), float(rank)).to(self.device.type)
for rank, sz in enumerate(output_split_sizes)
]
)
@ -464,7 +468,7 @@ class TestWithNCCL(MultiProcessTestCase):
@fresh_cache()
def test_threading(self):
self._init_process_group()
device = torch.device(f"cuda:{self.rank}")
device = self.device
def func(arg: torch.Tensor) -> torch.Tensor:
buf0 = arg + 42
@ -546,9 +550,9 @@ class TestWithNCCL(MultiProcessTestCase):
return in_grad, w_grad
m, n, k = 128, 256, 64
in_ = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
w = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
out_grad = torch.randn((m, n), device="cuda", dtype=torch.bfloat16)
in_ = torch.randn((m, k), device=self.device.type, dtype=torch.bfloat16)
w = torch.randn((n, k), device=self.device.type, dtype=torch.bfloat16)
out_grad = torch.randn((m, n), device=self.device.type, dtype=torch.bfloat16)
eager_in_grad, eager_w_grad = fp8_rowwise_backward(in_, w, out_grad)
compile_in_grad, compile_w_grad = torch.compile(fp8_rowwise_backward)(
@ -777,7 +781,8 @@ class CompileTest(TestCase):
self.rank = 0
self.world_size = 2
torch.cuda.set_device("cuda:0")
torch.accelerator.set_device_index(0)
self.device = torch.accelerator.current_accelerator()
store = FakeStore()
dist.init_process_group(
@ -803,7 +808,7 @@ class CompileTest(TestCase):
ar1 = funcol.wait_tensor(ar1)
return ar0, ar1
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -836,7 +841,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -851,7 +856,7 @@ class CompileTest(TestCase):
ar1 = [funcol.wait_tensor(out) for out in ar1]
return ar0, ar1
args = [torch.rand(4, 4, device="cuda") for _ in range(2)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(2)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
buf0, buf1, buf2, buf3 = find_buffer_assignments(code)
@ -881,7 +886,7 @@ class CompileTest(TestCase):
# Test aoti
out = AOTIRunnerUtil.run(func, (args,)) # noqa: F841
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -892,7 +897,7 @@ class CompileTest(TestCase):
ar0 = funcol.wait_tensor(ar0)
return ar0
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -917,7 +922,7 @@ class CompileTest(TestCase):
# Expect allocation
return ar0
arg = torch.rand(4, 4, device="cuda").T
arg = torch.rand(4, 4, device=self.device.type).T
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -948,7 +953,7 @@ class CompileTest(TestCase):
buf2 = torch.mm(arg, buf1)
return buf1, buf2
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
buf0, buf1 = find_buffer_assignments(code)
@ -978,7 +983,7 @@ class CompileTest(TestCase):
ag0 = funcol.wait_tensor(ag0)
return ag0
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -995,7 +1000,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1005,7 +1010,7 @@ class CompileTest(TestCase):
ag0 = [funcol.wait_tensor(out) for out in ag0]
return ag0
args = [torch.rand(4, 4, device="cuda") for _ in range(4)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(4)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
(
@ -1029,7 +1034,7 @@ class CompileTest(TestCase):
# Test aoti
out = AOTIRunnerUtil.run(func, (args,)) # noqa: F841
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "This is a GPU test!")
@fresh_cache()
@ -1039,7 +1044,7 @@ class CompileTest(TestCase):
return funcol.wait_tensor(t)
# Test aoti
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -1051,7 +1056,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1061,7 +1066,7 @@ class CompileTest(TestCase):
rs0 = funcol.wait_tensor(rs0)
return rs0
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -1077,7 +1082,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1089,7 +1094,7 @@ class CompileTest(TestCase):
rs0 = [funcol.wait_tensor(out) for out in rs0]
return rs0
args = [torch.rand(4, 4, device="cuda") for _ in range(4)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(4)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
(
@ -1113,7 +1118,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (args,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1142,7 +1147,9 @@ class CompileTest(TestCase):
input_split_sizes = send_sz_matrix[self.rank]
output_split_sizes = send_sz_matrix[:, self.rank].contiguous()
input = torch.full((input_split_sizes.sum().item(),), float(self.rank)).cuda()
input = torch.full((input_split_sizes.sum().item(),), float(self.rank)).to(
self.device.type
)
with torch._dynamo.config.patch(
dynamic_shapes=True,
@ -1176,7 +1183,7 @@ class CompileTest(TestCase):
br1 = funcol.wait_tensor(br1)
return br0, br1
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
@ -1199,7 +1206,7 @@ class CompileTest(TestCase):
# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1214,7 +1221,7 @@ class CompileTest(TestCase):
ar1 = funcol.wait_tensor(ar1)
return ar0, ar1
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func, fullgraph=True)
code = run_and_get_triton_code(compiled, arg)

View File

@ -25,6 +25,7 @@ if not c10d.is_available() or not c10d.is_gloo_available():
import test_c10d_common
from test_c10d_common import (
FFTModel,
gpus_for_rank,
LOOPBACK,
ModuleForDdpCommHook,
@ -134,6 +135,32 @@ def simple_reduce_tests(rank, world_size):
),
)
# Extend tests for cfloat dtype
tests.extend(
(
(
c10d.ReduceOp.SUM,
torch.tensor([complex(rank + 1.0, rank + 1.0)], dtype=torch.cfloat),
torch.tensor(
[
complex(
world_size * (world_size + 1) / 2,
world_size * (world_size + 1) / 2,
)
],
dtype=torch.cfloat,
),
),
(
c10d.ReduceOp.AVG,
torch.tensor([complex(rank + 1.0, rank + 1.0)], dtype=torch.cfloat),
torch.tensor(
[complex(float((world_size + 1) / 2), float((world_size + 1) / 2))],
dtype=torch.cfloat,
),
),
)
)
return tests
@ -373,6 +400,13 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
torch.tensor([i * num + j], dtype=torch.float32), output[1]
)
# Run with 1 input tensor of cfloat dtype
x = fn(torch.tensor([complex(self.rank, self.rank)], dtype=torch.cfloat))
output = broadcast([x], i, 0)
self.assertEqual(
torch.tensor([complex(i, i)], dtype=torch.cfloat), output[0]
)
# Test overloaded convenience function
x = torch.tensor([self.rank + 1.0])
fut = pg.broadcast(x, root=0).get_future()
@ -1605,6 +1639,22 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
work.wait()
@requires_gloo()
def test_send_recv_complex(self):
store = c10d.FileStore(self.file_name, self.world_size)
pg = self._create_process_group_gloo(
store, self.rank, self.world_size, self.opts()
)
# Generate the same random tensor
torch.manual_seed(0)
send_tensor = torch.rand(10, 10, dtype=torch.cfloat)
if self.rank == 0:
pg.send([send_tensor], 1, 0).wait()
if self.rank == 1:
recv_tensor = torch.rand(10, 10, dtype=torch.cfloat)
pg.recv([recv_tensor], 0, 0).wait()
self.assertEqual(send_tensor, recv_tensor)
class DistributedDataParallelTest(
test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
@ -2270,6 +2320,24 @@ class DistributedDataParallelTest(
self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
@requires_gloo()
def test_ddp_complex_params(self):
process_group = self._get_process_group()
N, C, H, W = 1, 16, 64, 64
ddp_model = DistributedDataParallel(
FFTModel(hin=H, win=W, n_features=C),
process_group=process_group,
)
optimizer = torch.optim.Adam(ddp_model.parameters(), lr=0.001)
inp = torch.ones((N, C, H, W), dtype=torch.float32)
# train step
out = ddp_model(inp)
loss = torch.sum(out)
loss.backward()
optimizer.step()
class ReducerModule(nn.Module):
def __init__(self) -> None:

View File

@ -29,7 +29,13 @@ if not c10d.is_available() or not c10d.is_nccl_available():
import test_c10d_common
from test_c10d_common import ConvNet, DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook
from test_c10d_common import (
ConvNet,
DoubleGpuNet,
FFTModel,
gpus_for_rank,
ModuleForDdpCommHook,
)
import torch.distributed as dist
import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default
@ -2552,25 +2558,6 @@ class DistributedDataParallelTest(
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_complex_params(self):
class FFTModel(nn.Module):
def __init__(self, hin, win, n_features):
super().__init__()
self.hin = hin
self.win = win
self.weight = nn.Parameter(
torch.ones(
(n_features, n_features, hin, win // 2 + 1), dtype=torch.cfloat
)
)
def forward(self, x):
xc = torch.fft.rfft2(
x, s=(self.hin, self.win), dim=(-2, -1), norm="ortho"
)
xcw = torch.einsum("nchw,cohw->nohw", xc, self.weight)
x = torch.fft.irfft2(xcw, dim=(-2, -1), norm="ortho")
return x
process_group = self._get_process_group()
device_id = gpus_for_rank(self.world_size)[self.rank][0]
N, C, H, W = 1, 16, 64, 64

Some files were not shown because too many files have changed in this diff.