Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-02 06:24:59 +08:00)
Update on "[Inductor XPU GEMM] Step 6/N: Refactor CUDACodeCache."
This PR is part of #160175. It extracts the CUDA-independent functionality from `CUDACodeCache` into `CUTLASSCodeCache`, which `CUDACodeCache` then inherits and extends with CUDA-specific logic. This design allows `CUTLASSCodeCache` to be reused by XPU as well. In addition, the CUDA compilation logic has been moved into `torch/_inductor/codegen/cuda/compile_utils.py`, making `codecache.py` cleaner.

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov coconutruben [ghstack-poisoned]
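For orientation, here is a minimal sketch of the described split. The class names come from the PR text, but the method names, signatures, and the in-memory cache are illustrative assumptions, not the actual `torch._inductor` API:

```python
# Hedged sketch of the refactor described above; illustrative only.
import hashlib


class CUTLASSCodeCache:
    """CUDA-independent CUTLASS kernel cache, reusable by CUDA and XPU."""

    cache: dict[str, str] = {}  # source hash -> path of compiled artifact

    @classmethod
    def compile(cls, source_code: str, dst_file_ext: str) -> str:
        # Backend-agnostic caching: hash the source, compile on miss.
        key = hashlib.sha256(source_code.encode()).hexdigest()
        if key not in cls.cache:
            cls.cache[key] = cls._invoke_compiler(source_code, dst_file_ext)
        return cls.cache[key]

    @classmethod
    def _invoke_compiler(cls, source_code: str, dst_file_ext: str) -> str:
        raise NotImplementedError("device-specific subclasses implement this")


class CUDACodeCache(CUTLASSCodeCache):
    """Extends the shared cache with CUDA-specific compilation."""

    @classmethod
    def _invoke_compiler(cls, source_code: str, dst_file_ext: str) -> str:
        # In the PR, the real nvcc invocation lives in
        # torch/_inductor/codegen/cuda/compile_utils.py; this stub only
        # returns a fake artifact path so the sketch runs.
        key = hashlib.sha256(source_code.encode()).hexdigest()
        return f"/tmp/{key}.{dst_file_ext}"
```

The point of the design is that only `_invoke_compiler` is device-specific, so an XPU cache can subclass `CUTLASSCodeCache` the same way `CUDACodeCache` does.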
@@ -1 +1 @@
-d0e80f39c562c70986fc548fa6e5852ad86e16e7
+1b0418a9a454b2b93ab8d71f40e59d2297157fae
@@ -147,7 +147,7 @@ function install_128 {
}

function install_130 {
-    CUDNN_VERSION=9.12.0.46
+    CUDNN_VERSION=9.13.0.50
    echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
    # install CUDA 13.0 in the same container
    install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
@@ -74,6 +74,14 @@ RUN bash ./install_cuda.sh 13.0
RUN bash ./install_magma.sh 13.0
RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda

+# Install libibverbs for libtorch and copy to CUDA directory
+RUN apt-get update -y && \
+    apt-get install -y libibverbs-dev librdmacm-dev && \
+    cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \
+    cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \
+    cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \
+    cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/
+
FROM cpu as rocm
ARG ROCM_VERSION
ARG PYTORCH_ROCM_ARCH
@@ -1 +1 @@
-3.4.0
+3.5.0
@@ -76,7 +76,6 @@ def sample_vllm_test_library():
        ),
        "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
        "pytest -v -s entrypoints/llm/test_generate.py ",
        "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py",
-        "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
    ],
},
@@ -124,6 +124,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
fi
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
    echo "Bundling with cudnn and cublas."

    DEPS_LIST+=(
        "/usr/local/cuda/lib64/libcudnn_adv.so.9"
        "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
@@ -133,16 +134,11 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
        "/usr/local/cuda/lib64/libcudnn.so.9"
-        "/usr/local/cuda/lib64/libcublas.so.12"
-        "/usr/local/cuda/lib64/libcublasLt.so.12"
        "/usr/local/cuda/lib64/libcusparseLt.so.0"
-        "/usr/local/cuda/lib64/libcudart.so.12"
-        "/usr/local/cuda/lib64/libnvrtc.so.12"
        "/usr/local/cuda/lib64/libnvrtc-builtins.so"
        "/usr/local/cuda/lib64/libcufile.so.0"
        "/usr/local/cuda/lib64/libcufile_rdma.so.1"
        "/usr/local/cuda/lib64/libnvshmem_host.so.3"
-        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
        "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
    )
    DEPS_SONAME+=(
@@ -154,22 +150,56 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
        "libcudnn_engines_precompiled.so.9"
        "libcudnn_heuristic.so.9"
        "libcudnn.so.9"
-        "libcublas.so.12"
-        "libcublasLt.so.12"
        "libcusparseLt.so.0"
-        "libcudart.so.12"
-        "libnvrtc.so.12"
        "libnvrtc-builtins.so"
        "libnvshmem_host.so.3"
        "libcufile.so.0"
        "libcufile_rdma.so.1"
-        "libcupti.so.12"
        "libnvperf_host.so"
    )
-    # Add libnvToolsExt only if CUDA version is not 12.9
-    if [[ $CUDA_VERSION != 12.9* ]]; then
-        DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
-        DEPS_SONAME+=("libnvToolsExt.so.1")
+    if [[ $CUDA_VERSION == 13* ]]; then
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcublas.so.13"
+            "/usr/local/cuda/lib64/libcublasLt.so.13"
+            "/usr/local/cuda/lib64/libcudart.so.13"
+            "/usr/local/cuda/lib64/libnvrtc.so.13"
+            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13"
+            "/usr/local/cuda/lib64/libibverbs.so.1"
+            "/usr/local/cuda/lib64/librdmacm.so.1"
+            "/usr/local/cuda/lib64/libmlx5.so.1"
+            "/usr/local/cuda/lib64/libnl-3.so.200"
+            "/usr/local/cuda/lib64/libnl-route-3.so.200")
+        DEPS_SONAME+=(
+            "libcublas.so.13"
+            "libcublasLt.so.13"
+            "libcudart.so.13"
+            "libnvrtc.so.13"
+            "libcupti.so.13"
+            "libibverbs.so.1"
+            "librdmacm.so.1"
+            "libmlx5.so.1"
+            "libnl-3.so.200"
+            "libnl-route-3.so.200")
+        export USE_CUPTI_SO=1
+        export ATEN_STATIC_CUDA=0
+        export USE_CUDA_STATIC_LINK=0
+        export USE_CUFILE=0
+    else
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
+            "/usr/local/cuda/lib64/libcublas.so.12"
+            "/usr/local/cuda/lib64/libcublasLt.so.12"
+            "/usr/local/cuda/lib64/libcudart.so.12"
+            "/usr/local/cuda/lib64/libnvrtc.so.12"
+            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
+        DEPS_SONAME+=(
+            "libnvToolsExt.so.1"
+            "libcublas.so.12"
+            "libcublasLt.so.12"
+            "libcudart.so.12"
+            "libnvrtc.so.12"
+            "libcupti.so.12")
    fi
else
    echo "Using nvidia libs from pypi."
@@ -199,7 +199,7 @@ torchbench_setup_macos() {
    git checkout "$(cat ../.github/ci_commit_pins/vision.txt)"
    git submodule update --init --recursive
    python setup.py clean
-    python setup.py develop
+    python -m pip install -e . -v --no-build-isolation
    popd

    pushd torchaudio
@@ -208,7 +208,7 @@ torchbench_setup_macos() {
    git submodule update --init --recursive
    python setup.py clean
    #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp
-    USE_OPENMP=0 python setup.py develop
+    USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation
    popd

    checkout_install_torchbench
@@ -124,19 +124,15 @@ popd

export TH_BINARY_BUILD=1
export INSTALL_TEST=0 # dont install test binaries into site-packages
-export MACOSX_DEPLOYMENT_TARGET=10.15
+export MACOSX_DEPLOYMENT_TARGET=11.0
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}

-SETUPTOOLS_PINNED_VERSION="==70.1.0"
-PYYAML_PINNED_VERSION="==5.3"
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
RENAME_WHEEL=true
case $desired_python in
    3.14t)
        echo "Using 3.14 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
        NUMPY_PINNED_VERSION="==2.1.0"
        CONDA_ENV_CREATE_FLAGS="python-freethreading"
        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
@@ -145,8 +141,6 @@ case $desired_python in
        ;;
    3.14)
        echo "Using 3.14t deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
        NUMPY_PINNED_VERSION="==2.1.0"
        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
        desired_python="3.14.0rc1"
@@ -154,8 +148,6 @@ case $desired_python in
        ;;
    3.13t)
        echo "Using 3.13 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
        NUMPY_PINNED_VERSION="==2.1.0"
        CONDA_ENV_CREATE_FLAGS="python-freethreading"
        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
@@ -164,37 +156,23 @@ case $desired_python in
        ;;
    3.13)
        echo "Using 3.13 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
        NUMPY_PINNED_VERSION="==2.1.0"
        ;;
    3.12)
        echo "Using 3.12 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
        NUMPY_PINNED_VERSION="==2.0.2"
        ;;
    3.11)
        echo "Using 3.11 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=5.3"
        NUMPY_PINNED_VERSION="==2.0.2"
        ;;
    3.10)
        echo "Using 3.10 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=5.3"
        NUMPY_PINNED_VERSION="==2.0.2"
        ;;
    3.9)
        echo "Using 3.9 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=5.3"
        NUMPY_PINNED_VERSION="==2.0.2"
        ;;
    *)
-        echo "Using default deps"
-        NUMPY_PINNED_VERSION="==1.11.3"
+        echo "Unsupported version $desired_python"
+        exit 1
        ;;
esac
@@ -204,8 +182,6 @@ conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_p
source activate "$tmp_env_name"

PINNED_PACKAGES=(
-    "setuptools${SETUPTOOLS_PINNED_VERSION}"
-    "pyyaml${PYYAML_PINNED_VERSION}"
    "numpy${NUMPY_PINNED_VERSION}"
)
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
@@ -224,7 +200,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"

-python setup.py bdist_wheel -d "$whl_tmp_dir"
+python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}

echo "Finished setup.py bdist_wheel at $(date)"
.github/actionlint.yaml (vendored, 2 lines changed)
@@ -12,7 +12,9 @@ self-hosted-runner:
  - linux.9xlarge.ephemeral
  - am2.linux.9xlarge.ephemeral
  - linux.12xlarge
+  - linux.12xlarge.memory
  - linux.24xlarge
+  - linux.24xlarge.memory
  - linux.24xlarge.ephemeral
  - linux.24xlarge.amd
  - linux.arm64.2xlarge
@@ -4,6 +4,11 @@ name: Build External packages
description: build external packages for PyTorch

inputs:
+  cuda-version:
+    description: CUDA version to use
+    type: string
+    required: true
+    default: '12.8.1'
  cuda-arch-list:
    description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
    type: string
@@ -44,11 +49,12 @@ runs:
      env:
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_REGION: us-east-1
+        CUDA_VERSION: ${{ inputs.cuda-version }}
        TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
        BASE_IMAGE: ${{ inputs.docker-image }}
        BUILD_TARGETS: ${{ inputs.build-targets }}
-        PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
+        PARENT_OUTPUT_DIR: ${{ inputs.output-dir }}
        TORCH_WHEELS_PATH: ${{ inputs.torch-wheel-dir }}
      shell: bash
      run: |
        set -euo pipefail
@@ -69,7 +75,6 @@ runs:
        export OUTPUT_DIR
        echo "Building external package: $target in directory $OUTPUT_DIR"
        python3 -m cli.run build external "$target"
-
        done

        END_TIME=$(date +%s)
.github/ci_commit_pins/audio.txt (vendored, 2 lines changed)
@@ -1 +1 @@
-0757bbb660855272f7dd8d31cc84e7c631522805
+2e300559e4e123928a22187b8f59a5b56f57ddc8
.github/ci_commit_pins/vllm.txt (vendored, 2 lines changed)
@@ -1 +1 @@
-b5ee1e3261d9edf94d76ba8b437ebdef7ac599ea
+4172235ab78b09989fb56edaf734dbee283dda3e
.github/ci_configs/vllm/Dockerfile.tmp_vllm (vendored, 188 lines changed)
@@ -12,54 +12,46 @@ ARG BUILD_BASE_IMAGE=torch-nightly-base
# by default, it uses devel-ubuntu22.04 official image.
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

+# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
+ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
+
#################### TORCH NIGHTLY BASE IMAGE ####################

#################### TORCH NIGHTLY BASE IMAGE ####################
# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci
-From nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
-ARG CUDA_VERSION=12.8.1
-ARG PYTHON_VERSION=3.12
-ARG TARGETPLATFORM
-ENV DEBIAN_FRONTEND=noninteractive
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base

-RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
-    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
+ARG CUDA_VERSION
+ARG PYTHON_VERSION
+ARG GET_PIP_URL

-# Install Python and other dependencies if it does not existed
-RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
-    echo "Installing Python ${PYTHON_VERSION}..." && \
-    echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
-    echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
-    apt-get update -y && \
-    apt-get install -y ccache software-properties-common git curl sudo && \
-    for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done && \
-    apt-get update -y && \
-    apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
-    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
-    update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
-    ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
-    else \
-    echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
-    fi \
-    && python3 --version && python3 -m pip --version
+# Install Python and other dependencies
+RUN apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim \
+    && add-apt-repository -y ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version

-# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
-# as it was causing spam when compiling the CUTLASS kernels
+# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
-    if [ "$current_gcc_version" -lt 10 ]; then \
-        echo "GCC version is $current_gcc_version, installing gcc-10..."; \
-        apt-get update && \
-        apt-get install -y gcc-10 g++-10 && \
-        update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \
-        update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
-    else \
-        echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
-    fi && \
-    gcc --version && g++ --version
+    if command -v apt-get >/dev/null; then \
+        if [ "$current_gcc_version" -lt 10 ]; then \
+            echo "GCC version is $current_gcc_version, installing gcc-10..."; \
+            apt-get update \
+            && apt-get install -y gcc-10 g++-10 \
+            && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \
+            && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
+        else \
+            echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
+        fi \
+    fi \
+    && gcc --version && g++ --version

# install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
@@ -79,6 +71,21 @@ ENV UV_LINK_MODE=copy
FROM ${BUILD_BASE_IMAGE} AS base
USER root

+ARG CUDA_VERSION
+ARG PYTHON_VERSION
+
+# TODO (huydhn): Only work with PyTorch manylinux builder
+ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
+
+# Install some system dependencies and double check python version
+RUN if command -v apt-get >/dev/null; then \
+        apt-get update -y \
+        && apt-get install -y ccache software-properties-common git curl wget sudo vim; \
+    else \
+        dnf install -y git curl wget sudo vim; \
+    fi \
+    && python3 --version && python3 -m pip --version
+
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
@@ -118,17 +125,15 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
    if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
        echo "[INFO] Installing torch wheels to build vllm"; \
        torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
-        vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
-        audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
-        uv pip install --system "${torch_whl}[opt-einsum]"; \
-        uv pip install --system "${vision_whl}"; \
-        uv pip install --system "${audio_whl}"; \
+        vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \
+        audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \
+        uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \
    elif [ -n "$PINNED_TORCH_VERSION" ]; then \
        echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \
-        uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
+        uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
    else \
        echo "[INFO] Installing torch nightly with latest one to build vllm"; \
-        uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
+        uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
    fi

# Install numba 0.61.2 for cuda environment
@@ -137,12 +142,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \

# Install common dependencies from vllm common.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/common.txt

# Must put before installing xformers, so it can install the correct version of xfomrers.
-ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a'
-ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list}
+ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a'
+ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list}

ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
@@ -153,8 +157,8 @@ RUN pip freeze | grep -E 'ninja'

# Build xformers with cuda and torch nightly/wheel
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
-# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31
-ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497
+# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2
+ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468
ENV CCACHE_DIR=/root/.cache/ccache

RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -188,11 +192,6 @@ RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
FROM base AS build
ARG TARGETPLATFORM

-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
COPY . .

RUN python3 use_existing_torch.py
@@ -251,9 +250,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
        python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
    fi

-RUN echo "[DEBUG] Listing current directory:" && \
+RUN echo "[INFO] Listing current directory:" && \
    ls -al && \
-    echo "[DEBUG] Showing torch_build_versions.txt content:" && \
+    echo "[INFO] Showing torch_build_versions.txt content:" && \
    cat torch_build_versions.txt

#################### WHEEL BUILD IMAGE ####################
@@ -263,42 +262,40 @@ RUN echo "[DEBUG] Listing current directory:" && \
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
FROM ${FINAL_BASE_IMAGE} AS vllm-base
USER root

ARG CUDA_VERSION
ARG PYTHON_VERSION
+ARG GET_PIP_URL

+# TODO (huydhn): Only work with PyTorch manylinux builder
+ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
+
# prepare for environment starts
WORKDIR /workspace

-RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
-    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
-
-# Install Python and other dependencies if it does not existed
-RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
-    echo "Installing Python ${PYTHON_VERSION}..." && \
-    echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
-    echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
-    apt-get update -y && \
-    apt-get install -y ccache software-properties-common git curl sudo && \
-    for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done && \
-    apt-get update -y && \
-    apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
-    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
-    update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
-    ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
-    else \
-    echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
-    fi \
-    && python3 --version && python3 -m pip --version
-
+# Install Python and other dependencies
+RUN if command -v apt-get >/dev/null; then \
+        apt-get update -y \
+        && apt-get install -y ccache software-properties-common git curl wget sudo vim \
+        && add-apt-repository -y ppa:deadsnakes/ppa \
+        && apt-get update -y \
+        && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+        && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+        && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+        && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+        && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
+    else \
+        dnf install -y git curl wget sudo vim; \
+    fi \
+    && python3 --version && python3 -m pip --version

# Get the torch versions, and whls used in previous stagtes for consistency
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
-RUN echo "[DEBUG] Listing current directory before torch install step:" && \
+RUN echo "[INFO] Listing current directory before torch install step:" && \
    ls -al && \
-    echo "[DEBUG] Showing torch_build_versions.txt content:" && \
+    echo "[INFO] Showing torch_build_versions.txt content:" && \
    cat torch_build_versions.txt

# Workaround for https://github.com/openai/triton/issues/2507 and
@@ -307,7 +304,6 @@ RUN echo "[DEBUG] Listing current directory before torch install step:" && \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

-
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version > /dev/null 2>&1; then \
@@ -327,15 +323,13 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
    --mount=type=cache,target=/root/.cache/uv \
    if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
        torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
-        vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
-        audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
+        vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \
+        audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \
        echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
-        uv pip install --system "${torch_whl}[opt-einsum]"; \
-        uv pip install --system "${vision_whl}"; \
-        uv pip install --system "${audio_whl}"; \
+        uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \
    else \
        echo "[INFO] Installing torch versions from torch_build_versions.txt"; \
-        uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \
+        uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
    fi

# Install the vllm wheel from previous stage
@@ -346,9 +340,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system /wheels/xformers/*.whl --verbose

-
# Build flashinfer from source.
-ARG torch_cuda_arch_list='8.0;8.9;9.0a'
+ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738

@@ -416,11 +409,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/nightly_torch_test.txt

-# Workaround for #17068
-# pinned commit for v2.2.4
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm"
-
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
.github/scripts/generate_binary_build_matrix.py (vendored, 30 lines changed)
@@ -16,18 +16,16 @@ from typing import Optional


# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
-CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
+CUDA_ARCHES = ["12.6", "12.8", "13.0"]
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
-    "12.9": "12.9.1",
    "13.0": "13.0.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
    "12.8": "9",
-    "12.9": "9",
    "13.0": "9",
}

@@ -40,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]

CPU_S390X_ARCH = ["cpu-s390x"]

-CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
+CUDA_AARCH64_ARCHES = ["13.0-aarch64"]


PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@@ -78,28 +76,11 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
-    "12.9": (
-        "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
-    ),
    "13.0": (
        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@@ -240,8 +221,6 @@ def generate_libtorch_matrix(
    if os == "linux":
        arches += CUDA_ARCHES
        arches += ROCM_ARCHES
-        if "13.0" in arches:
-            arches.remove("13.0")
    elif os == "windows":
        arches += CUDA_ARCHES
    if libtorch_variants is None:
@@ -343,7 +322,7 @@ def generate_wheels_matrix(
    # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

    if (
-        arch_version in ["13.0", "12.9", "12.8", "12.6"]
+        arch_version in ["13.0", "12.8", "12.6"]
        and os == "linux"
        or arch_version in CUDA_AARCH64_ARCHES
    ):
@@ -407,6 +386,5 @@ def generate_wheels_matrix(


validate_nccl_dep_consistency("13.0")
-validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")
.github/scripts/generate_ci_workflows.py (vendored, 4 lines changed)
@@ -22,7 +22,7 @@ LABEL_CIFLOW_BINARIES = "ciflow/binaries"
LABEL_CIFLOW_PERIODIC = "ciflow/periodic"
LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch"
LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel"
-LABEL_CIFLOW_ROCM = "ciflow/rocm-mi300"
+LABEL_CIFLOW_ROCM = "ciflow/rocm"


@dataclass
@@ -139,6 +139,8 @@ ROCM_SMOKE_WORKFLOWS = [
        ),
        ciflow_config=CIFlowConfig(
            labels={
+                LABEL_CIFLOW_BINARIES,
+                LABEL_CIFLOW_BINARIES_WHEEL,
                LABEL_CIFLOW_ROCM,
            },
            isolated_workflow=True,
@@ -171,7 +171,7 @@ jobs:
    - name: Teardown XPU
      uses: ./.github/actions/teardown-xpu
{%- else %}
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
    timeout-minutes: !{{ common.timeout_minutes }}
    !{{ upload.binary_env(config) }}
    steps:
@@ -68,11 +68,6 @@ jobs:
        chmod +x "${RUNNER_TEMP}/conda.sh"
        /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
        echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-        if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-          echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-        elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-        fi
    !{{ common.checkout(deep_clone=False, directory="pytorch") }}
    - name: Populate binary env
      run: |
.github/templates/upload.yml.j2 (vendored, 2 lines changed)
@@ -33,7 +33,7 @@
{%- if is_windows %}
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
{%- endif %}

{%- else %}
.github/workflows/build-vllm-wheel.yml (vendored, new file, 248 lines)
@@ -0,0 +1,248 @@
name: Build vLLM wheels

on:
  push:
    branches:
      - main
    paths:
      - .github/workflows/build-vllm-wheel.yml
      - .github/ci_commit_pins/vllm.txt
  workflow_dispatch:
  pull_request:
    paths:
      - .github/workflows/build-vllm-wheel.yml
      - .github/ci_commit_pins/vllm.txt

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  build-wheel:
    if: github.repository_owner == 'pytorch'
    strategy:
      fail-fast: false
      matrix:
        python-version: [ '3.12' ]
        # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
        device: [ 'cu128', 'cu129' ]
        runner: [ 'linux.12xlarge.memory' ]
        include:
          - device: cu128
            manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
          - device: cu129
            manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
    name: "Build ${{ matrix.device }} vLLM wheel"
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 480
    env:
      PY_VERS: ${{ matrix.python-version }}
      MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
      PLATFORM: 'manylinux_2_28_x86_64'
      BUILD_DEVICE: ${{ matrix.device }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          submodules: false

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Get latest PyTorch nightly
        shell: bash
        run: |
          set -eux

          # Keep PyTorch nightly wheel here so that we can install it later during
          # vLLM build process
          mkdir -p "${RUNNER_TEMP}/artifacts/"

          container_name=$(docker run \
            --tty \
            --detach \
            -e PLATFORM \
            -v "${GITHUB_WORKSPACE}:/pytorch" \
            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
            -w /artifacts/ \
            "${MANYLINUX_IMAGE}"
          )

          # Determine python executable for given version (copied from build-triton-wheel)
          case $PY_VERS in
          3.10)
            PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
            ;;
          3.11)
            PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python
            ;;
          3.12)
            PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python
            ;;
          3.13)
            PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python
            ;;
          3.13t)
            PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
            ;;
          3.14)
            PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
            ;;
          3.14t)
            PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
            ;;
          *)
            echo "Unsupported python version ${PY_VERS}"
            exit 1
            ;;
          esac

          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
            --pre torch torchvision torchaudio \
            --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

          # I wonder if there is a command to both download and install the wheels
          # in one go
          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \
            --pre torch torchvision torchaudio \
            --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

          # Save this for later
          echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV"
          echo "container_name=${container_name}" >> "$GITHUB_ENV"

      - name: Build vLLM wheel
        uses: ./.github/actions/build-external-packages
        with:
          build-targets: vllm
          docker-image: ${{ env.MANYLINUX_IMAGE }}
          cuda-arch-list: '8.0;8.9;9.0;10.0;12.0'
          torch-wheel-dir: ${{ runner.temp }}/artifacts
          output-dir: ${{ runner.temp }}/artifacts/externals

      - name: Prepare vLLM wheel
        shell: bash
        run: |
          set -eux

          # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh
          docker exec -t "${container_name}" bash -c "
            set -eux

            nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4)

            pushd externals/vllm/wheels
            for package in xformers flashinfer-python vllm; do
              pushd \$package
              auditwheel repair --plat \$PLATFORM *.whl \
                --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
              repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*)
              repair_wheel=\$(basename \${repair_wheel})
              popd

              cp \${package}/wheelhouse/\${repair_wheel} .
              version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)

              if [[ \$package == vllm ]]; then
                new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly}
              else
                major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3)
                new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly}
              fi

              mv -- \$repair_wheel \$new_wheel
              rm -rf \$package
            done
            popd
          "

          docker exec -t "${container_name}" chown -R 1000:1000 /artifacts

      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
        with:
          name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()

  # Copied from build-triton-wheel workflow (mostly)
  upload-wheel:
    name: "Upload ${{ matrix.device }} vLLM wheel"
    needs:
      - build-wheel
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        device: [ 'cu128', 'cu129' ]
    env:
      BUILD_DEVICE: ${{ matrix.device }}
    permissions:
      id-token: write
      contents: read
    container:
      image: continuumio/miniconda3:4.12.0
    environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Configure AWS credentials(PyTorch account) for main
        if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
          aws-region: us-east-1

      - name: Configure AWS credentials(PyTorch account) for RC builds
        if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }}
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels
          aws-region: us-east-1

      - name: Download Build Artifacts
        uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
        with:
          # Download all available artifacts
          path: ${{ runner.temp }}/artifacts-all

      - name: Select Wheel Artifacts
        shell: bash
        run: |
          set -eux
          mkdir -p "${RUNNER_TEMP}/artifacts/"
          mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"

      - name: Set DRY_RUN (only for tagged pushes)
        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
        shell: bash
        run: |
          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

      - name: Set UPLOAD_CHANNEL (only for tagged pushes)
        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
        shell: bash
        run: |
          set -ex

          if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi

      - name: Upload binaries
        env:
          PACKAGE_TYPE: wheel
          UPLOAD_SUBFOLDER: ${{ env.BUILD_DEVICE }}
          PKG_DIR: ${{ runner.temp }}/artifacts
        shell: bash
        run: |
          set -ex
          bash .circleci/scripts/binary_upload.sh
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 336 lines changed)
@@ -112,52 +112,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_10-cuda-aarch64-12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.10"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_10-cuda-aarch64-12_9
-      build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_10-cuda-aarch64-12_9-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_10-cuda-aarch64-12_9-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-cuda-aarch64-12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_10-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@@ -178,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -269,52 +223,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_11-cuda-aarch64-12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.11"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_11-cuda-aarch64-12_9
-      build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_11-cuda-aarch64-12_9-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_11-cuda-aarch64-12_9-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda-aarch64-12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_11-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@@ -335,7 +243,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -426,52 +334,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_12-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_12-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -492,7 +354,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -583,52 +445,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_13-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_13-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -649,7 +465,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -740,52 +556,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_13t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_13t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -806,7 +576,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -897,52 +667,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_14-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -963,7 +687,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1054,52 +778,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9-aarch64"
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_14t-cuda-aarch64-13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1120,7 +798,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/generated-linux-binary-libtorch-nightly.yml (38 changes, generated, vendored)

@@ -248,7 +248,7 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

libtorch-cuda12_9-shared-with-deps-release-build:
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@@ -257,22 +257,22 @@ jobs:
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-cuda12_9-shared-with-deps-release
build_name: libtorch-cuda13_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@@ -280,38 +280,38 @@ jobs:
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
build_name: libtorch-cuda13_0-shared-with-deps-release
build_environment: linux-binary-libtorch
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-cuda12_9-shared-with-deps-release
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
@@ -342,7 +342,7 @@ jobs:
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@@ -456,7 +456,7 @@ jobs:
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

.github/workflows/generated-linux-binary-manywheel-nightly.yml (504 changes, generated, vendored)

@@ -241,72 +241,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_10-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -325,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@@ -398,7 +332,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@@ -509,7 +443,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@@ -899,72 +833,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_9-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -983,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@@ -1056,7 +924,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@@ -1167,7 +1035,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.gfx942.1
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@@ -1557,72 +1425,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_12-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_12-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_12-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
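The removed jobs above, like the CUDA 13.0 jobs that remain, pass all of their pip-side CUDA dependencies as a single PYTORCH_EXTRA_INSTALL_REQUIREMENTS string: individual PEP 508 requirements joined with `|`, each guarded by an environment marker so the NVIDIA wheels are only pulled in on Linux x86_64. A minimal sketch of how such a string can be expanded, assuming the consumer simply splits on `|` and lets pip evaluate the markers (the actual wheel-build tooling does its own parsing):

    # Two entries copied from the cu13 list above; the full string has fifteen.
    reqs_string="nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64'"
    IFS='|' read -ra reqs <<< "${reqs_string}"
    for req in "${reqs[@]}"; do
        # xargs trims the surrounding whitespace; pip itself skips the install
        # when the marker evaluates to false (e.g. on macOS or aarch64).
        pip install "$(echo "${req}" | xargs)"
    done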
@@ -1641,7 +1443,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda13_0-test: # Testing
@@ -1714,7 +1516,7 @@ jobs:
     needs:
       - manywheel-py3_12-rocm6_3-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -1825,7 +1627,7 @@ jobs:
     needs:
       - manywheel-py3_12-rocm6_4-build
      - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -2215,72 +2017,6 @@ jobs:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_13-cuda12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.13"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_13-cuda12_9
-      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_13-cuda12_9-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_13-cuda12_9-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.13"
-      build_name: manywheel-py3_13-cuda12_9
-      build_environment: linux-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_13-cuda12_9-upload: # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_13-cuda12_9-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.13"
-      build_name: manywheel-py3_13-cuda12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
   manywheel-py3_13-cuda13_0-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
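Every removed test job above pins `runs_on: linux.g4dn.4xlarge.nvidia.gpu` with the comment that CUDA 12.8+ builds need an sm_70+ runner (g4dn instances carry T4 GPUs, compute capability 7.5). A quick sanity check on any candidate runner, assuming a driver new enough to expose the `compute_cap` query field:

    # Prints one "name, capability" pair per GPU, e.g. "Tesla T4, 7.5";
    # anything below 7.0 cannot run the sm_70+ binaries these jobs produce.
    nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader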
@@ -2299,7 +2035,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda13_0-test: # Testing
@@ -2372,7 +2108,7 @@ jobs:
     needs:
       - manywheel-py3_13-rocm6_3-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -2483,7 +2219,7 @@ jobs:
     needs:
       - manywheel-py3_13-rocm6_4-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -2873,72 +2609,6 @@ jobs:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_13t-cuda12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.13t"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_13t-cuda12_9
-      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_13t-cuda12_9-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_13t-cuda12_9-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.13t"
-      build_name: manywheel-py3_13t-cuda12_9
-      build_environment: linux-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_13t-cuda12_9-upload: # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_13t-cuda12_9-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.13t"
-      build_name: manywheel-py3_13t-cuda12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
   manywheel-py3_13t-cuda13_0-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -2957,7 +2627,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda13_0-test: # Testing
@@ -3030,7 +2700,7 @@ jobs:
     needs:
       - manywheel-py3_13t-rocm6_3-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -3141,7 +2811,7 @@ jobs:
     needs:
       - manywheel-py3_13t-rocm6_4-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -3531,72 +3201,6 @@ jobs:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_14-cuda12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_14-cuda12_9
-      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14-cuda12_9-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_14-cuda12_9-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14"
-      build_name: manywheel-py3_14-cuda12_9
-      build_environment: linux-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14-cuda12_9-upload: # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_14-cuda12_9-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14"
-      build_name: manywheel-py3_14-cuda12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
   manywheel-py3_14-cuda13_0-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -3615,7 +3219,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14-cuda13_0-test: # Testing
@@ -3688,7 +3292,7 @@ jobs:
     needs:
       - manywheel-py3_14-rocm6_3-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -3799,7 +3403,7 @@ jobs:
     needs:
       - manywheel-py3_14-rocm6_4-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -4189,72 +3793,6 @@ jobs:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_14t-cuda12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14t"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_14t-cuda12_9
-      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14t-cuda12_9-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_14t-cuda12_9-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14t"
-      build_name: manywheel-py3_14t-cuda12_9
-      build_environment: linux-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14t-cuda12_9-upload: # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_14t-cuda12_9-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14t"
-      build_name: manywheel-py3_14t-cuda12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
   manywheel-py3_14t-cuda13_0-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -4273,7 +3811,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_14t-cuda13_0
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_14t-cuda13_0-test: # Testing
@@ -4346,7 +3884,7 @@ jobs:
     needs:
       - manywheel-py3_14t-rocm6_3-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
@@ -4457,7 +3995,7 @@ jobs:
     needs:
       - manywheel-py3_14t-rocm6_4-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@@ -10,7 +10,9 @@ on:
     branches:
       - main
     tags:
-      - 'ciflow/rocm-mi300/*'
+      - 'ciflow/binaries/*'
+      - 'ciflow/binaries_wheel/*'
+      - 'ciflow/rocm/*'
   workflow_dispatch:

 permissions:
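With the retained tag globs, this workflow fires whenever a matching `ciflow/...` tag is pushed, which is normally done by the PyTorch CI bot when a ciflow label lands on a PR. A hedged sketch of triggering it by hand (the PR number 12345 is a placeholder, and this assumes push rights on the repository):

    # Tag the commit you want built and push the tag; the glob
    # 'ciflow/binaries_wheel/*' above matches and starts the workflow.
    git tag ciflow/binaries_wheel/12345 HEAD
    git push origin ciflow/binaries_wheel/12345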
@@ -67,7 +69,7 @@ jobs:
     needs:
       - manywheel-py3_9-rocm6_4-build
       - get-label-type
-    runs-on: linux.rocm.gpu.gfx942.1
+    runs-on: linux.rocm.gpu.mi250
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: /pytorch
.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml
@@ -46,7 +46,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -67,11 +67,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
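The surviving lines of that hunk rely on GitHub Actions' file-based state passing: appending a directory to the file named by `$GITHUB_PATH` prepends it to `PATH` for all subsequent steps, and appending `KEY=value` to `$GITHUB_ENV` exports an environment variable the same way. That is also why deleting the `DEVELOPER_DIR` lines above is enough to stop pinning Xcode: nothing else re-exports it. A minimal sketch, with `MY_FLAG` as an invented example variable:

    # In one step: neither value is visible yet within this same step.
    echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
    echo "MY_FLAG=1" >> "${GITHUB_ENV}"

    # In any later step of the same job:
    command -v python   # now resolves under ${RUNNER_TEMP}/anaconda/bin first
    echo "${MY_FLAG}"   # prints 1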
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -63,11 +63,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -208,11 +203,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -353,11 +343,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -498,11 +483,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -643,11 +623,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -788,11 +763,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
@@ -933,11 +903,6 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
       - name: Checkout PyTorch
         uses: actions/checkout@v4
         with:
.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
@@ -64,7 +64,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Populate binary env
        shell: cmd
@@ -141,7 +141,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Populate binary env
         shell: cmd
@@ -201,7 +201,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       build_name: libtorch-cpu-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
@@ -64,7 +64,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Populate binary env
         shell: cmd
@@ -141,7 +141,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Populate binary env
         shell: cmd
@@ -201,7 +201,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       build_name: libtorch-cpu-shared-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generated-windows-binary-libtorch-debug-main.yml
@@ -51,7 +51,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -166,7 +166,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
@@ -58,7 +58,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -173,7 +173,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -283,7 +283,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       build_name: libtorch-cpu-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -306,7 +306,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -422,7 +422,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -533,7 +533,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       build_name: libtorch-cuda12_6-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -556,7 +556,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -672,7 +672,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -783,261 +783,11 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       build_name: libtorch-cuda12_8-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda12_9-shared-with-deps-debug-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 360
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.9"
-    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      # They are also here because setting them at a workflow level doesn't give us access to the
-      # runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        continue-on-error: true
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
-        shell: bash
-        run: |
-          git config --global core.longpaths true
-          git config --global core.symlinks true
-
-          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
-          # the directory on Windows and prevent GHA from checking out as reported
-          # in https://github.com/actions/checkout/issues/1018
-          git config --global core.fsmonitor false
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
-      # removed once Windows Defender is removed from the AMI
-      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
-        continue-on-error: true
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
-          # Let's both exclude the path and disable Windows Defender completely just to be sure
-          # that it doesn't interfere
-          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v4.4.0
-        if: always()
-        with:
-          name: libtorch-cuda12_9-shared-with-deps-debug
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-
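In the removed build job above, the two "Populate binary env" steps split the work: the first only records paths (`BINARY_ENV_FILE`, `PYTORCH_FINAL_PACKAGE_DIR`) into `$GITHUB_ENV`, while the second runs `binary_populate_env.sh`, which writes the actual build variables into that env file for `binary_windows_build.sh` to consume. A sketch of the handshake, with the file contents invented for illustration:

    # Step one records where the shared env file will live:
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"

    # binary_populate_env.sh later fills ${BINARY_ENV_FILE} with lines like
    #   export DESIRED_CUDA=cu129
    #   export PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts
    # which the build and test scripts pick up by sourcing the file:
    source "${BINARY_ENV_FILE}"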
libtorch-cuda12_9-shared-with-deps-debug-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-cuda12_9-shared-with-deps-debug-build
|
||||
- get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
SKIP_ALL_TESTS: 1
|
||||
LIBTORCH_CONFIG: debug
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
function get_ec2_metadata() {
|
||||
# Pulled from instance metadata endpoint for EC2
|
||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
||||
category=$1
|
||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
||||
}
|
||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
||||
continue-on-error: true
|
||||
with:
|
||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
||||
shell: bash
|
||||
run: |
|
||||
git config --global core.longpaths true
|
||||
git config --global core.symlinks true
|
||||
|
||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
||||
# the directory on Windows and prevent GHA from checking out as reported
|
||||
# in https://github.com/actions/checkout/issues/1018
|
||||
git config --global core.fsmonitor false
|
||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
||||
- name: Enable long paths on Windows
|
||||
shell: powershell
|
||||
run: |
|
||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
||||
# removed once Windows Defender is removed from the AMI
|
||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
||||
continue-on-error: true
|
||||
shell: powershell
|
||||
run: |
|
||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
||||
# that it doesn't interfere
|
||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
||||
# runner.temp variable, which we need.
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_9-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_9-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
@ -1056,7 +806,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -1172,7 +922,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -1283,7 +1033,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/generated-windows-binary-libtorch-release-main.yml (generated, vendored; 4 lines changed)
@ -51,7 +51,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -166,7 +166,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash

.github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored; 274 lines changed)
@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -173,7 +173,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -283,7 +283,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cpu-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -306,7 +306,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -422,7 +422,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -533,7 +533,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_6-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -556,7 +556,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -672,7 +672,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -783,261 +783,11 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_8-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true

# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_9-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1

libtorch-cuda12_9-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true

# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_9-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_9-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
@ -1056,7 +806,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -1172,7 +922,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
@ -1283,7 +1033,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored; 1666 lines changed — diff suppressed because it is too large)
CLAUDE.md (new file; 15 lines)
@ -0,0 +1,15 @@
# Testing

Use our test class and test runner:

```
from torch.testing._internal.common_utils import run_tests, TestCase

class TestFeature(TestCase):
    ...

if __name__ == "__main__":
    run_tests()
```

To test Tensor equality, use assertEqual.
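For illustration only (not part of the new file), a minimal test following this pattern; the `TestFeature` body below is hypothetical:

```python
import torch
from torch.testing._internal.common_utils import run_tests, TestCase

class TestFeature(TestCase):
    def test_add(self):
        a = torch.ones(2, 3)
        # assertEqual compares values (and, for tensors, shape and dtype)
        self.assertEqual(a + a, torch.full((2, 3), 2.0))

if __name__ == "__main__":
    run_tests()
```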
@ -880,10 +880,21 @@ cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
ON
"USE_ROCM"
OFF)

IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()

# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a")
message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100")
set(USE_FBGEMM_GENAI ON)
endif()

# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(

@ -88,13 +88,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows

* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.

* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use
* When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use
the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder)
This way you do not need to repeatedly install after modifying Python files (`.py`).
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...).


One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
One way to avoid running `python -m pip install -e . -v --no-build-isolation` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac,
is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following:
```bash
pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd
@ -116,7 +116,7 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows

Next run `python setup.py clean`. After that, you can install in editable mode again.

* If you run into errors when running `python -m pip install -e .`, here are some debugging steps:
* If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps:
1. Run `printf '#include <stdio.h>\nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure
your CMake works and can compile this simple Hello World program without errors.
2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many
@ -129,10 +129,10 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows
git clean -xdf
python setup.py clean
git submodule update --init --recursive
python -m pip install -r requirements.txt
python -m pip install --group dev
python -m pip install --no-build-isolation -v -e .
```
4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to
4. The main step within `python -m pip install -e . -v --no-build-isolation` is running `make` from the `build` directory. If you want to
experiment with some environment variables, you can pass them into the command:
```bash
ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e .
@ -259,6 +259,7 @@ dependencies as well as the nightly binaries into the repo directory.
support for PyTorch.
* [tools](tools) - Code generation scripts for the PyTorch library.
See [README](tools/README.md) of this directory for more details.
* [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml
* [test](test) - Python unit tests for PyTorch Python frontend.
* [test_torch.py](test/test_torch.py) - Basic tests for PyTorch
functionality.
@ -294,7 +295,7 @@ The following packages should be installed with `pip`:
- `pytest` - recommended to run tests more selectively
Running
```
pip install -r requirements.txt
pip install --group dev
```
will install these dependencies for you.

@ -645,9 +646,9 @@ can be selected interactively with your mouse to zoom in on a particular part of
the program execution timeline. The `--native` command-line option tells
`py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers
for C++ code it may be necessary to compile PyTorch in debug mode by prepending
your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`.
Depending on your operating system it may also be necessary to run `py-spy` with
root privileges.
your `python -m pip install -e . -v --no-build-isolation` call to compile
PyTorch with `DEBUG=1`. Depending on your operating system it may also be
necessary to run `py-spy` with root privileges.

`py-spy` can also work in an `htop`-like "live profiling" mode and can be
tweaked to adjust the stack sampling rate, see the `py-spy` readme for more
@ -655,10 +656,10 @@ details.

## Managing multiple build trees

One downside to using `python -m pip install -e .` is that your development
version of PyTorch will be installed globally on your account (e.g., if
you run `import torch` anywhere else, the development version will be
used).
One downside to using `python -m pip install -e . -v --no-build-isolation` is
that your development version of PyTorch will be installed globally on your
account (e.g., if you run `import torch` anywhere else, the development version
will be used).

If you want to manage multiple builds of PyTorch, you can make use of
[venv environments](https://docs.python.org/3/library/venv.html) to maintain
@ -719,7 +720,7 @@ options.

### Code completion and IDE support

When using `python -m pip install -e .`, PyTorch will generate
When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate
a `compile_commands.json` file that can be used by many editors
to provide command completion and error highlighting for PyTorch's
C++ code. You need to `pip install ninja` to generate accurate

@ -243,7 +243,7 @@ git submodule update --init --recursive

```bash
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
pip install -r requirements.txt
pip install --group dev
```

**On Linux**
@ -394,7 +394,7 @@ On macOS

```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build
ccmake build # or cmake-gui build
```

@ -252,47 +252,80 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()

IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()

# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_CUDA)
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX})

if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp
"${FBGEMM_GENAI_SRCS}/common/*.cpp"
)

# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Combine all source files into a single list
list(APPEND fbgemm_genai_all_sources
${fbgemm_genai_native_cuda_cu}
${fbgemm_genai_native_cuda_cpp}
)

# Now, create the library and provide the sources at the same time
add_library(fbgemm_genai OBJECT ${fbgemm_genai_all_sources})

hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

set(fbgemm_genai_mx8mx8bf16_grouped
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
)

target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${fbgemm_genai_mx8mx8bf16_grouped}
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
else()
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)

# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)

hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
endif()
endif()
endif()

@ -635,12 +668,26 @@ if(USE_CUDA AND NOT USE_ROCM)
add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)

# Add FBGEMM_GENAI include directories for torch_ops.h
if(USE_FBGEMM_GENAI)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
endif()

if($ENV{ATEN_STATIC_CUDA})
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}
CUDA::cusparse_static
CUDA::cufft_static_nocallback
)
if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}
CUDA::cusparse_static
CUDA::cufft_static_nocallback)
else()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}
CUDA::cusparse_static
CUDA::cufft_static)
endif()

if(NOT BUILD_LAZY_CUDA_LINALG)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static

@ -308,17 +308,44 @@ void fillVersion<DLManagedTensorVersioned>(
// constructed out of ATen tensor
template <class T>
T* toDLPackImpl(const Tensor& src) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
auto view = src;

// Detect whether there is need to normalize the strides
// Background: gh-83069
//
// However, normalizing strides can come at a high-cost
// to slow down toDLPack conversion 3x, so we
// only normalize if needed.
//
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet continuous pattern
// and the size is 1, so there is opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}

// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
}

auto view = src.as_strided(shape, strides, src.storage_offset());
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
atDLMTensor->handle = view;
atDLMTensor->tensor.manager_ctx = atDLMTensor;

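A small Python sketch (not part of the diff) of the normalization this hunk makes lazy; it only fires when strides do not already follow the contiguous pattern, and the tensor values here are arbitrary:

```python
import torch
from torch.utils import dlpack

# A size-1 dimension addresses the same elements whatever its stride, but some
# DLPack consumers reject such "non-standard" strides (gh-83069), so the
# exporter normalizes them to 1 before handing the tensor out.
t = torch.arange(6.0).reshape(2, 3)
u = t.as_strided((2, 1), (3, 2))            # odd stride on the size-1 dim
v = dlpack.from_dlpack(dlpack.to_dlpack(u))
print(u.stride(), v.stride())               # (3, 2) -> (3, 1)
```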
aten/src/ATen/DTensorState.cpp (new file; 17 lines)
@ -0,0 +1,17 @@
#include <ATen/DTensorState.h>

namespace at {

namespace {
thread_local bool kDTensorAllowImplicitReplication = false;
}

bool get_dtensor_allow_implicit_replication() {
  return kDTensorAllowImplicitReplication;
}

void set_dtensor_allow_implicit_replication(bool enabled) {
  kDTensorAllowImplicitReplication = enabled;
}

} // namespace at
aten/src/ATen/DTensorState.h (new file; 34 lines)
@ -0,0 +1,34 @@
#pragma once

#include <c10/macros/Macros.h>

namespace at {

TORCH_API bool get_dtensor_allow_implicit_replication();
TORCH_API void set_dtensor_allow_implicit_replication(bool enabled);

struct DTensorAllowImplicitReplication {
  DTensorAllowImplicitReplication()
      : prev_dtensor_allow_implicit_replication_(
            get_dtensor_allow_implicit_replication()) {
    set_dtensor_allow_implicit_replication(true);
  }

  DTensorAllowImplicitReplication(const DTensorAllowImplicitReplication&) =
      delete;
  DTensorAllowImplicitReplication& operator=(
      const DTensorAllowImplicitReplication&) = delete;
  DTensorAllowImplicitReplication(DTensorAllowImplicitReplication&&) = delete;
  DTensorAllowImplicitReplication& operator=(
      DTensorAllowImplicitReplication&&) = delete;

  ~DTensorAllowImplicitReplication() {
    set_dtensor_allow_implicit_replication(
        prev_dtensor_allow_implicit_replication_);
  }

 private:
  bool prev_dtensor_allow_implicit_replication_;
};

} // namespace at
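For orientation (not part of the diff): the Python-level counterpart of this RAII guard is, to my understanding, the experimental `implicit_replication` context manager; the import path and its availability vary by release, so treat this as a sketch:

```python
# Hedged sketch: assumes torch.distributed.tensor.experimental.implicit_replication
# exists in this build. It toggles the same kind of "treat plain Tensors as
# replicated" switch that the C++ guard above manages thread-locally, so ops
# dispatched from C++ (like the functorch embedding batch rule later in this
# diff) can mix DTensor and regular Tensor operands without raising.
from torch.distributed.tensor.experimental import implicit_replication

with implicit_replication():
    pass  # mixed DTensor / plain-Tensor ops would be permitted in this scope
```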
@ -8,6 +8,7 @@
#include <ATen/record_function.h>
#include <ATen/SavedTensorHooks.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/DTensorState.h>

namespace at {

@ -19,6 +20,7 @@ ThreadLocalState::ThreadLocalState()
torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()),
python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()),
saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()),
dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()),
saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) {
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER)
for(size_t i=0; i<autocast_dtypes_.size(); i++) {
@ -52,6 +54,8 @@ void ThreadLocalState::setThreadLocalState(

c10::impl::PythonDispatcherTLS::set_state(state.python_dispatcher_state_);

at::set_dtensor_allow_implicit_replication(state.dtensor_allow_implicit_replication_);

c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_);

c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);

@ -75,6 +75,8 @@ class TORCH_API ThreadLocalState {

bool functionalization_reapply_views_state_;

bool dtensor_allow_implicit_replication_;

// TLS for arbitrary python objects that is registered via hooks
at::impl::ThreadLocalPythonObjects saved_objects_;


@ -1937,11 +1937,11 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
@ -1956,8 +1956,12 @@ void scaled_gemm(
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
// rowwise isn't supported using older cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);

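For context, the row-wise scaling case these version gates protect looks roughly like this from Python; `torch._scaled_mm` is a private op and needs an FP8-capable GPU, so treat the snippet as a sketch rather than a stable API:

```python
import torch

a = torch.randn(32, 64, device="cuda").to(torch.float8_e4m3fn)
# _scaled_mm expects the second operand in column-major layout
b = torch.randn(64, 16, device="cuda").to(torch.float8_e4m3fn).t().contiguous().t()
scale_a = torch.ones(32, 1, device="cuda")  # one scale per row of a -> RowWise
scale_b = torch.ones(1, 16, device="cuda")  # one scale per column of b
out = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b,
                       out_dtype=torch.bfloat16)
```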
@ -117,6 +117,8 @@ namespace at::cuda {
_(nvrtcGetPTXSize) \
_(nvrtcGetPTX) \
_(cuModuleLoadData) \
_(cuModuleLoad) \
_(cuGetErrorString) \
_(cuModuleGetFunction) \
_(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \
_(nvrtcGetErrorString) \

@ -7,6 +7,7 @@
#include <ATen/functorch/BatchRulesHelper.h>
#include <ATen/functorch/PlumbingHelper.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/DTensorState.h>

#include <utility>

@ -44,8 +45,13 @@ static std::tuple<Tensor, std::optional<int64_t>> embedding_batch_rule(
const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
auto indices_ = moveBatchDimToFront(indices, indices_bdim);

const auto range = getStepTensor(indices, batch_size, num_embeddings);
indices_ = indices_ + range;
{
// getStepTensor returns a regular Tensor. If indices_ is a DTensor
// we want to allow this mixed DTensor-Tensor operation.
at::DTensorAllowImplicitReplication guard;
const auto range = getStepTensor(indices, batch_size, num_embeddings);
indices_ = indices_ + range;
}
auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse);
return std::make_tuple(std::move(result), 0);
}

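A repro-style sketch (not part of the diff) of when this batch rule runs: vmapping `F.embedding` over a batch of weight tables, which is the path that offsets indices via `getStepTensor`:

```python
import torch
import torch.nn.functional as F

weights = torch.randn(4, 10, 3)    # a batch of 4 embedding tables
indices = torch.tensor([0, 2, 5])  # shared indices
# Mapping over the weight batch dim hits embedding_batch_rule, whose
# index-offsetting step is what the new guard wraps above.
out = torch.vmap(F.embedding, in_dims=(None, 0))(indices, weights)
print(out.shape)  # torch.Size([4, 3, 3])
```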
@ -9,6 +9,7 @@
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Linear.h>
#include <ATen/native/Resize.h>
#include <ATen/native/GroupedMMUtils.h>
#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif
@ -332,4 +333,23 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b,
return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
}

// TODO(vasiliy, future PR): figure out why we need to declare this function, when
// other functions that live in ATen/native/*.cpp without declarations
// or headers work just fine.
Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype);

Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype) {
_grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
return out;
}

} // namespace at::native

aten/src/ATen/native/GroupedMMUtils.h (new file; 167 lines)
@ -0,0 +1,167 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/CPUFunctions.h>
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/bmm.h>
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_strided.h>
|
||||
#include <ATen/ops/mm.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
inline bool check_valid_strides_and_return_transposed(const Tensor& mat) {
|
||||
IntArrayRef tensor_strides = mat.strides();
|
||||
IntArrayRef tensor_sizes = mat.sizes();
|
||||
int end_dim = mat.dim() - 1;
|
||||
int alignment = 16 / mat.element_size();
|
||||
TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n");
|
||||
if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max<int64_t>(1, tensor_sizes[end_dim - 1]))) {
|
||||
TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes");
|
||||
return true;
|
||||
} else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max<int64_t>(1, tensor_sizes[end_dim]))) {
|
||||
TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes");
|
||||
return false;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes");
|
||||
}
|
||||
}
|
||||
|
||||
inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
c10::ScalarType out_dtype
|
||||
) {
|
||||
c10::SmallVector<int64_t, 3> out_size;
|
||||
const bool a_is_2d = mat_a.dim() == 2;
|
||||
const bool b_is_2d = mat_b.dim() == 2;
|
||||
if (a_is_2d) {
|
||||
if (b_is_2d) {
|
||||
out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)};
|
||||
} else {
|
||||
TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match");
|
||||
out_size = {mat_a.size(0), mat_b.size(-1)};
|
||||
}
|
||||
} else {
|
||||
if (b_is_2d) {
|
||||
// this case is not actually encountered for MoE gemms
|
||||
TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match");
|
||||
out_size = {mat_a.size(1), mat_b.size(1)};
|
||||
} else { // regular bmm
|
||||
TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match");
|
||||
out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)};
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// For TMA transfers, strides of output tensor have to be either
|
||||
// 1, or aligned to 16 bytes.
|
||||
const auto last_dim = out_size.size() - 1;
|
||||
const auto alignment = 16 / c10::elementSize(out_dtype);
|
||||
const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment;
|
||||
std::vector<int64_t> out_stride;
|
||||
if (a_is_2d != b_is_2d) {
|
||||
out_stride = {size_padded, 1};
|
||||
} else {
|
||||
out_stride = {out_size[1] * size_padded, size_padded, 1};
|
||||
}
|
||||
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype));
|
||||
#else
|
||||
return at::empty(out_size, mat_a.options().dtype(out_dtype));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
std::optional<c10::ScalarType> out_dtype) {
|
||||
TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type());
TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type());
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}

// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);
check_valid_strides_and_return_transposed(mat_b);
TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d");

if (offs.has_value()) {
TORCH_CHECK(offs->dim() == 1, "offs has to be 1D");
TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32");
}
TORCH_CHECK(!bias.has_value(), "Bias not supported yet");
}

inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b,
std::optional<c10::ScalarType> out_dtype) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
// TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs
TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype");
return out_dtype_;
}


inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype,
Tensor out) {
LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal";
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (a_is_2d && !b_is_2d) {
// 2d x 3d with offsets
int group_start_idx = 0;
auto offs_cpu = offs.value().cpu();
for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) {
int group_end_idx = offs_cpu[group_idx].item<int>();
auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx);
auto out_slice = out.slice(0, group_start_idx, group_end_idx);
at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]);
group_start_idx = group_end_idx;
}

} else if (!a_is_2d && b_is_2d) {
// 3d x 2d with offsets
int group_start_idx = 0;
auto offs_cpu = offs.value().cpu();
for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) {
int group_end_idx = offs_cpu[group_idx].item<int>();
auto mat_b_slice = mat_b.slice(1, group_start_idx, group_end_idx);
auto out_slice = out.slice(1, group_start_idx, group_end_idx);
at::mm_out(out_slice, mat_a[group_idx], mat_b_slice);
group_start_idx = group_end_idx;
}

} else if (a_is_2d && b_is_2d) {
// 2d x 2d with offsets
int group_start_idx = 0;
auto offs_cpu = offs.value().cpu();
for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) {
int group_end_idx = offs_cpu[group_idx].item<int>();
auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx);
auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx);
auto out_slice = out[group_idx];
at::mm_out(out_slice, mat_a_slice, mat_b_slice);
group_start_idx = group_end_idx;
}

} else {
// 3d x 3d without offsets - regular bmm
at::bmm_out(out, mat_a, mat_b);
}
}


} // namespace at::native
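The offsets convention driving the fallback above is compact enough to state in plain PyTorch. A minimal sketch of the 2d x 3d branch (illustrative only, not part of the diff; `offs` holds cumulative group end indices along the grouped dimension):

```python
import torch

def grouped_mm_2d_3d_reference(mat_a, mat_b, offs):
    # mat_a: (M, K), mat_b: (G, K, N), offs: (G,) int32 cumulative row ends
    out = mat_a.new_empty(mat_a.size(0), mat_b.size(-1))
    start = 0
    for g in range(offs.numel()):
        end = int(offs[g])
        # each group is an independent mm, exactly like the at::mm_out calls above
        out[start:end] = mat_a[start:end] @ mat_b[g]
        start = end
    return out
```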
@ -47,10 +47,14 @@ TORCH_META_FUNC(nll_loss_forward)
TORCH_CHECK(
target.dim() <= 1,
"0D or 1D target tensor expected, multi-target not supported");

auto no_batch_dim = self.dim() == 1 && target.dim() == 0;
if (self.dim() == 1 && target.dim() == 1) {
TORCH_CHECK_VALUE(
target.size(0) == 1,
"For 1D input, 1D target must have size 1, but got target size: ",
target.size(0));
}
TORCH_CHECK(
no_batch_dim || (self.size(0) == target.size(0)),
self.dim() == 1 || (self.size(0) == target.size(0)),
"size mismatch (got input: ",
self.sizes(),
", target: ",

@ -1640,6 +1640,9 @@ Tensor zeros_symint(
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory) {
for (const auto& dim_size : size) {
TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative.");
}
Layout layout_ = layout.value_or(Layout::Strided);
if (at::sparse_csr::is_sparse_compressed(layout_)) {
return zeros_sparse_compressed_symint(

@ -16,6 +16,7 @@
#include <ATen/cuda/tunable/TunableGemm.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>
#include <ATen/native/GroupedMMUtils.h>
#include <ATen/native/cuda/RowwiseScaledMM.h>
#include <ATen/native/cuda/ScaledGroupMM.h>
#include <ATen/native/cuda/GroupMM.h>
@ -1079,6 +1080,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals
#endif
}

static bool _grouped_mm_allowed_device() {
#ifdef USE_ROCM
return false;
#else
auto dprops = at::cuda::getCurrentDeviceProperties();
// CUDA capability 8.0 and greater
return dprops->major >= 8;
#endif
}

#ifdef USE_ROCM
static bool _scaled_mm_is_fnuz() {
return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
@ -1540,71 +1551,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
}

namespace {
at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a,
const Tensor& mat_b,
const std::optional<at::Tensor>& offs,
std::optional<c10::ScalarType> out_dtype
) {
c10::SmallVector<int64_t, 3> out_size;
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (a_is_2d) {
if (b_is_2d) {
out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)};
} else {
TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match");
out_size = {mat_a.size(0), mat_b.size(-1)};
}
} else {
if (b_is_2d) {
// this case is not actually encountered for MoE gemms
TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match");
out_size = {mat_a.size(1), mat_b.size(1)};
} else { // regular bmm
TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match");
out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)};
}
}

const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");

#ifndef USE_ROCM
// For TMA transfers, strides of output tensor have to be either
// 1, or aligned to 16 bytes.
const auto last_dim = out_size.size() - 1;
const auto alignment = 16 / c10::elementSize(out_dtype_);
const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment;
std::vector<int64_t> out_stride;
if (a_is_2d != b_is_2d) {
out_stride = {size_padded, 1};
} else {
out_stride = {out_size[1] * size_padded, size_padded, 1};
}
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
#else
return at::empty(out_size, mat_a.options().dtype(out_dtype_));
#endif
}

bool check_valid_strides_and_return_transposed(const Tensor& mat) {
IntArrayRef tensor_strides = mat.strides();
IntArrayRef tensor_sizes = mat.sizes();
int end_dim = mat.dim() - 1;
int alignment = 16 / mat.element_size();
TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n");
if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max<int64_t>(1, tensor_sizes[end_dim - 1]))) {
TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes");
return true;
} else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max<int64_t>(1, tensor_sizes[end_dim]))) {
TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes");
return false;
} else {
TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes");
}
}
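The stride contract enforced above fits in a few lines of Python; a minimal mirror (illustrative only): exactly one of the two innermost strides must be 1, the other a multiple of 16 bytes, and `True` means the matrix is laid out column-major ("transposed"):

```python
def check_valid_strides_and_return_transposed(t):
    alignment = 16 // t.element_size()
    s, z = t.stride(), t.shape
    if s[-2] == 1 and s[-1] >= max(1, z[-2]):
        assert s[-1] % alignment == 0, "strides should be multiple of 16 bytes"
        return True   # column-major: the matrix is "transposed"
    if s[-1] == 1 and s[-2] >= max(1, z[-1]):
        assert s[-2] % alignment == 0, "strides should be multiple of 16 bytes"
        return False  # row-major
    raise ValueError(f"invalid strides {s} for sizes {tuple(z)}")
```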

void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
// Checks scales for 2d or 3d target tensors (`mat`).
if (mat.dim() == 2) {
TORCH_CHECK(
scale.dim() == 1,
@ -1638,9 +1586,66 @@ namespace {
"scale must have the same first dimension as mat for arg ",
arg_idx);
}
}
}

void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
// Checks scales for 2d or 3d target tensors (`mat`).
if (mat.dim() == 2) {
// For MXFP8, 2d tensors have variable size groups represented as subtensors,
// that are converted to blocked padded format individually,
// so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
TORCH_CHECK(
scale.dim() == mat.dim(),
"for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);

// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
// * weight is transposed prior to the call, scale stays non-transposed.
bool LHS = arg_idx == 0;
int scale_dim_to_check = 0;
int mat_dim_to_check = LHS ? 0 : 1;
TORCH_CHECK(
scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
"for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
"must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
} else {
// For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
// so we can check the exact expected scale sizes here without a d2h sync.
auto round_up = [](auto x, auto y) {
return ((x + y - 1) / y) * y;
};

// TODO: this is for 3d tensor in 2d-3d case specifically.
// We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
int64_t G = mat.size(0);
int64_t K = mat.size(1);
int64_t N = mat.size(2);
int64_t blocked_scale_K = round_up(K/32, 4);
int64_t blocked_scale_N = round_up(N, 128);

// fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
TORCH_CHECK(
scale.dim() == mat.dim() - 1,
"for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
);
TORCH_CHECK(
scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
"for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
);
}
}
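The 3d branch above fully determines the expected scale shape; a small Python sketch of that arithmetic (illustrative only, mirroring the C++ `round_up` lambda):

```python
def round_up(x, y):
    return ((x + y - 1) // y) * y

def expected_mxfp8_scale_shape(G, K, N):
    blocked_scale_K = round_up(K // 32, 4)   # one e8m0 scale per 32 elements of K
    blocked_scale_N = round_up(N, 128)
    # fbgemm expects the per-group blocked scales flattened per group
    return (G, blocked_scale_K * blocked_scale_N)

assert expected_mxfp8_scale_shape(8, 256, 512) == (8, 8 * 512)
```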

void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
bool using_fp8_rowwise = scale.scalar_type() == kFloat;
bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
if (using_fp8_rowwise) {
_check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
} else if (using_mxfp8) {
_check_scales_mxfp8(mat, scale, dim, arg_idx);
} else {
TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
}
}
}

Tensor
@ -1665,8 +1670,8 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+");

TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");
TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed");
@ -1699,16 +1704,47 @@ bool use_fast_accum) {
TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32");
}

// Both Per-Tensor and Row-wise scaling expect fp32 tensors
// FP8 per-tensor and per-row scaling expect fp32 scales.
// MXFP8 expects float8_e8m0fnu scales.
TORCH_CHECK(
scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat,
"Both scale_a and scale_b must be float (fp32) tensors.");
(scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) ||
(scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu),
"For FP8 tensorwise and rowwise, both scales must be float32 tensors. For MXFP8, both scales must be float8_e8m0fnu tensors.");

const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? offs->size(0) : 1;
check_scale(mat_a, scale_a, 0, 0, scale_multiplier);
check_scale(mat_b, scale_b, 1, 1, scale_multiplier);

Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);
const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");

Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);

#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM)
// MXFP8 grouped GEMM dispatching
bool is_mx8mx8bf16 = (
mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn &&
scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu
);
TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm");

if (is_mx8mx8bf16) {
bool b_is_3d = mat_b.dim() == 3;
bool is_2d_2d = a_is_2d && b_is_2d;
bool is_2d_3d = a_is_2d && b_is_3d;
TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases");
TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs require offsets");

fbgemm_gpu::mx8mx8bf16_grouped_mm(
mat_a,
mat_b,
scale_a,
scale_b,
offs.value(),
out);
return out;
}
#endif

#ifndef USE_ROCM
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
@ -1741,6 +1777,7 @@ bool use_fast_accum) {
#else
TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM")
#endif

#endif

}
@ -1750,33 +1787,21 @@ const std::optional<at::Tensor>& offs,
const std::optional<at::Tensor>& bias,
std::optional<c10::ScalarType> out_dtype) {
#ifndef USE_ROCM
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true);
TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0, 10.0");

TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type());
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
_grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
bool a_b_and_out_are_bf16 = (
mat_a.dtype() == at::kBFloat16 &&
mat_b.dtype() == at::kBFloat16 &&
out_dtype.value_or(at::kBFloat16) == at::kBFloat16
);
bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16;
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
if (use_fast_path) {
// fast path, no d2h sync needed
at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
} else {
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
}

// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);
check_valid_strides_and_return_transposed(mat_b);
TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d");

if (offs.has_value()) {
TORCH_CHECK(offs->dim() == 1, "offs has to be 1D");
TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32");
}
TORCH_CHECK(!bias.has_value(), "Bias not supported yet");

Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);

at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
return out;
#else
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")

@ -1412,7 +1412,7 @@
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: cat_sparse
SparseCPU, SparseCUDA: cat_sparse
QuantizedCPU: cat_quantized_cpu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
tags: core
@ -7158,6 +7158,7 @@
- func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
variants: function
dispatch:
CompositeExplicitAutograd: _grouped_mm
CUDA: _grouped_mm_cuda

# NOTE [ Sparse: autograd and API ]

@ -76,13 +76,14 @@ bool priority_order_init_ = false;
// TODO(eqy): more benchmarking to determine whether this should include sm86/89
// Needs to be kept in-sync with test_fused_chocie in test_transformers.py
bool check_prefer_cudnn_attention() {
static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") == true;
static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") != false;
if (!prefer_cudnn) {
return false;
}
#if (defined(CUDNN_VERSION) && (CUDNN_VERSION > 90000))
#if (defined(CUDNN_VERSION) && (CUDNN_VERSION >= 90900))
auto dprops = at::cuda::getCurrentDeviceProperties();
return dprops->major >= 9 && !dprops->minor;
auto major = dprops->major;
return (major == 9 || major == 10) && !dprops->minor;
#else
return false;
#endif
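The `== true` to `!= false` flip above changes the default: `check_env` yields an optional boolean, so cuDNN SDPA is now preferred unless the variable is explicitly set to false. A hedged Python sketch of the resulting behavior (illustrative only; the real parsing lives in `c10::utils::check_env`):

```python
import os

def prefer_cudnn_sdpa():
    val = os.environ.get("TORCH_CUDNN_SDPA_PREFERRED")
    # old behavior: only an explicit truthy value opted in;
    # new behavior: anything but an explicit falsy value stays opted in
    return val != "0"
```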

@ -948,6 +948,7 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),
@ -2033,6 +2034,7 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),

@ -512,6 +512,7 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/TCPStore.cpp",
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
"torch/csrc/distributed/c10d/Types.cpp",
"torch/csrc/distributed/c10d/Utils.cpp",
"torch/csrc/distributed/c10d/Work.cpp",
"torch/csrc/distributed/c10d/comm.cpp",
@ -635,6 +636,12 @@ libtorch_nativert_sources = [
"torch/nativert/graph/passes/pass_manager/GraphPasses.cpp",
"torch/nativert/graph/passes/pass_manager/PassManager.cpp",
"torch/nativert/kernels/KernelHandlerRegistry.cpp",
"torch/nativert/kernels/TritonKernel.cpp",
"torch/nativert/executor/triton/CpuTritonKernelManager.cpp",
]

libtorch_nativert_cuda_sources = [
"torch/nativert/executor/triton/CudaTritonKernelManager.cpp",
]

torch_mobile_tracer_sources = [
@ -770,7 +777,7 @@ libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + lib

libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [
"torch/csrc/cuda/nccl.cpp",
]
] + libtorch_nativert_cuda_sources

torch_cpp_srcs = [
"torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA
@ -1087,6 +1094,7 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/DeviceAccelerator.cpp",
"aten/src/ATen/Context.cpp",
"aten/src/ATen/DLConvertor.cpp",
"aten/src/ATen/DTensorState.cpp",
"aten/src/ATen/EmptyTensor.cpp",
"aten/src/ATen/ExpandUtils.cpp",
"aten/src/ATen/CachedTensorUtils.cpp",

@ -33,7 +33,8 @@ bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
}

// Return a SymBool with underlying symbolic expression that represents
// contiguity. Guaranteed not to add guards.
// contiguity. Guaranteed not to throw a DDE; may return a symbolic expression
// or symbolic True.
inline static c10::SymBool _compute_contiguous_sym(
ArrayRef<c10::SymInt> sizes,
ArrayRef<c10::SymInt> strides,
@ -76,6 +77,8 @@ inline static c10::SymBool _compute_contiguous_sym(
return true;
};

// We try to minimize creating large symbolic expressions when not needed to
// avoid symbolic evaluation perf issues.
if (is_contiguous_or_false()) {
return c10::SymBool(true);
}
@ -94,6 +97,9 @@ inline static c10::SymBool _compute_contiguous_sym(
return is_contiguous_cond.sym_or(is_empty);
}

// When T is SymInt this function may throw a data dependent error.
// _compute_channels_last_contiguous_2d_sym does not. Only use this function
// when inputs are hinted.
template <typename T>
bool _compute_channels_last_contiguous_2d(
ArrayRef<T> sizes,
@ -105,8 +111,8 @@ bool _compute_channels_last_contiguous_2d(
T expected = 1;
for (auto& d : {1, 3, 2, 0}) {
const auto& size_d = sizes[d];
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) {
if (size_d != 1) {
if (strides[d] != expected) {
return false;
}
expected *= size_d;
@ -123,6 +129,65 @@ bool _compute_channels_last_contiguous_2d(
}
}

// Return a SymBool with underlying symbolic expression that represents
// contiguity. Guaranteed not to throw a DDE; may return a symbolic expression
// or symbolic True.
inline static c10::SymBool _compute_channels_last_contiguous_2d_sym(
ArrayRef<c10::SymInt> sizes,
ArrayRef<c10::SymInt> strides) {
switch (sizes.size()) {
case 4: {
// When this function returns True, the result is always true. When it
// returns False, the result could be False or data dependent.
auto guard_or_false = [&]() {
c10::SymInt expected = 1;
for (auto& d : {1, 3, 2, 0}) {
const auto& size_d = sizes[d];
// Not taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
continue;
}
// Taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) {
return false;
}
expected *= size_d;
}
return true;
};

// We try to minimize creating large symbolic expressions when not needed
// to avoid symbolic evaluation perf issues.
if (guard_or_false()) {
return c10::SymBool(true);
}

// Result is either false, or data dependent.
c10::SymInt expected_stride = 1;
c10::SymBool cond = true;

for (auto& d : {1, 3, 2, 0}) {
const auto& size_d = sizes[d];
cond = cond.sym_and(
size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
expected_stride *= size_d;
}
return cond;
}
// NOLINTNEXTLINE(bugprone-branch-clone)
case 3:
// TODO dim == 3 case will be enabled once it is fully tested
return c10::SymBool(false);
default:
return c10::SymBool(false);
}
}
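For reference, the hinted (non-symbolic) condition that both versions encode is small; a Python sketch of the NHWC case (illustrative only):

```python
def is_channels_last_contiguous_2d(sizes, strides):
    # dims visited innermost-first for NHWC: C, W, H, N
    expected = 1
    for d in (1, 3, 2, 0):
        if sizes[d] != 1:
            if strides[d] != expected:
                return False
            expected *= sizes[d]
    return True

# N=2, C=3, H=4, W=5 in channels-last layout
assert is_channels_last_contiguous_2d([2, 3, 4, 5], [60, 1, 15, 3])
```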

// When T is SymInt this function may throw a data dependent error.
// _compute_channels_last_contiguous_3d_sym does not. Only use this function
// when inputs are hinted.
template <typename T>
bool _compute_channels_last_contiguous_3d(
ArrayRef<T> sizes,
@ -134,8 +199,8 @@ bool _compute_channels_last_contiguous_3d(
T expected = 1;
for (auto& d : {1, 4, 3, 2, 0}) {
const auto& size_d = sizes[d];
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) {
if (size_d != 1) {
if (strides[d] != expected) {
return false;
}
expected *= size_d;
@ -152,6 +217,59 @@ bool _compute_channels_last_contiguous_3d(
}
}

inline static c10::SymBool _compute_channels_last_contiguous_3d_sym(
ArrayRef<c10::SymInt> sizes,
ArrayRef<c10::SymInt> strides) {
switch (sizes.size()) {
case 5: {
// When this function returns True, the result is always true. When it
// returns False, the result could be False or data dependent.
auto guard_or_false = [&]() {
c10::SymInt expected = 1;
for (auto& d : {1, 4, 3, 2, 0}) {
const auto& size_d = sizes[d];
// Not taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) {
continue;
}
// Taking this branch could make this return False instead of True
// but not vice-versa, so it's ok.
if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) {
return false;
}
expected *= size_d;
}
return true;
};

// We try to minimize creating large symbolic expressions when not needed
// to avoid symbolic evaluation perf issues.
if (guard_or_false()) {
return c10::SymBool(true);
}

// Result is either false, or data dependent.
c10::SymInt expected_stride = 1;
c10::SymBool cond = true;

for (auto& d : {1, 4, 3, 2, 0}) {
const auto& size_d = sizes[d];
cond = cond.sym_and(
size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride)));
expected_stride *= size_d;
}
return cond;
}
// NOLINTNEXTLINE(bugprone-branch-clone)
case 4:
// TODO dim == 4 case will be enabled once it is fully tested
return c10::SymBool(false);
default:
return c10::SymBool(false);
}
}

template <typename T>
bool _compute_non_overlapping_and_dense(
ArrayRef<T> sizes,

@ -71,6 +71,27 @@ normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) {
return std::tuple<SymNode, std::vector<SymNode>, std::vector<SymNode>>(
std::move(base), std::move(size_nodes), std::move(stride_nodes));
}
namespace {
bool all_hinted(
const c10::SymIntArrayRef& sizes,
const c10::SymIntArrayRef& strides) {
auto all_hinted = true;
for (const auto& s : sizes) {
if (!s.has_hint()) {
return false;
}
}

if (all_hinted) {
for (const auto& s : strides) {
if (!s.has_hint()) {
return false;
}
}
}
return all_hinted;
}
} // namespace

// Special treatment because of numel
SymBool SymbolicShapeMeta::compute_contiguous() const {
@ -88,24 +109,7 @@ SymBool SymbolicShapeMeta::compute_contiguous() const {
return maybe_as_bool.value();
}

auto all_hinted = true;
for (const auto& s : sizes) {
if (!s.has_hint()) {
all_hinted = false;
break;
}
}

if (all_hinted) {
for (const auto& s : strides) {
if (!s.has_hint()) {
all_hinted = false;
break;
}
}
}

if (all_hinted) {
if (all_hinted(sizes, strides)) {
// We avoid going through the slow path if everything is hinted,
// because evaluating a large SymPy expression can be expensive.
// TODO exclude backed_size_oblivious from this path.
@ -115,6 +119,56 @@ SymBool SymbolicShapeMeta::compute_contiguous() const {
return result;
}

SymBool SymbolicShapeMeta::compute_channels_last_contiguous_2d() const {
if (!strides_valid_) {
return false;
}
c10::SymIntArrayRef sizes(sizes_);
c10::SymIntArrayRef strides(strides_);

auto result = _compute_channels_last_contiguous_2d_sym(sizes, strides);

// If the result is already determined without guarding, just return it.
auto maybe_as_bool = result.maybe_as_bool();
if (maybe_as_bool.has_value()) {
return maybe_as_bool.value();
}

if (all_hinted(sizes, strides)) {
// We avoid going through the slow path if everything is hinted,
// because evaluating a large SymPy expression can be expensive.
// TODO exclude backed_size_oblivious from this path.
return _compute_channels_last_contiguous_2d<SymInt>(sizes_, strides_);
}

return result;
}

SymBool SymbolicShapeMeta::compute_channels_last_contiguous_3d() const {
if (!strides_valid_) {
return false;
}
c10::SymIntArrayRef sizes(sizes_);
c10::SymIntArrayRef strides(strides_);

auto result = _compute_channels_last_contiguous_3d_sym(sizes, strides);

// If the result is already determined without guarding, just return it.
auto maybe_as_bool = result.maybe_as_bool();
if (maybe_as_bool.has_value()) {
return maybe_as_bool.value();
}

if (all_hinted(sizes, strides)) {
// We avoid going through the slow path if everything is hinted,
// because evaluating a large SymPy expression can be expensive.
// TODO exclude backed_size_oblivious from this path.
return _compute_channels_last_contiguous_3d<SymInt>(sizes_, strides_);
}

return result;
}

// The rest of them
#define DEFINE_EAGER_SYMBOOL_COMPUTE(name, fallback) \
SymBool SymbolicShapeMeta::name() const { \
@ -143,8 +197,6 @@ SymBool SymbolicShapeMeta::compute_contiguous() const {
}

// clang-format off
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d)
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d)
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d)
DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d)


@ -25,6 +25,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
#endif
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false),
m_graph_capture_record_stream_reuse(false),
m_pinned_use_background_threads(false) {
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}
@ -373,6 +374,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
} else if (config_item_view == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(config, i);
used_native_specific_option = true;
} else if (config_item_view == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(config, i);
used_native_specific_option = true;
} else {
TORCH_CHECK(
false, "Unrecognized CachingAllocator option: ", config_item_view);
@ -406,6 +410,23 @@ size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
return i;
}

size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
TORCH_CHECK(
(config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for graph_capture_record_stream_reuse");
m_graph_capture_record_stream_reuse = (config[i] == "True");
} else {
TORCH_CHECK(
false, "Error, expecting graph_capture_record_stream_reuse value", "");
}

return i;
}
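A hedged usage sketch for the new knob (the option name comes from the parser above; the env-var plumbing via `PYTORCH_CUDA_ALLOC_CONF` and the private `_set_allocator_settings` helper are the usual mechanisms and are assumed here):

```python
import os

# opt in before CUDA initialization...
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "graph_capture_record_stream_reuse:True"

# ...or, on builds that expose it, at runtime via the private settings hook:
# torch.cuda.memory._set_allocator_settings("graph_capture_record_stream_reuse:True")
```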

size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i) {

@ -53,6 +53,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_release_lock_on_cudamalloc;
}

static bool graph_capture_record_stream_reuse() {
return instance().m_graph_capture_record_stream_reuse;
}

/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
@ -142,6 +146,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i);
size_t parseGraphCaptureRecordStreamReuse(
const std::vector<std::string>& config,
size_t i);

std::atomic<size_t> m_max_split_size;
std::atomic<size_t> m_max_non_split_rounding_size;
@ -153,6 +160,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
m_expandable_segments_handle_type;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::atomic<bool> m_graph_capture_record_stream_reuse;
std::atomic<bool> m_pinned_use_background_threads;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;

@ -1167,8 +1167,13 @@ class DeviceCachingAllocator {
// tracks which pools we can use as a last resort before ooming
ska::flat_hash_set<MempoolId_t, MempoolIdHash> use_on_oom_pools;

// See free() for this thing's purpose
std::vector<Block*> needs_events_deferred_until_no_capture;
// Map of blocks whose freeing is deferred until after CUDA graph capture.
// - Key: Block* to be freed.
// - Value: List of "empty nodes" inserted as free markers during capture.
// If the vector is empty, the block must always be deferred until capture
// ends.
ska::flat_hash_map<Block*, std::vector<cudaGraphNode_t>> deferred_blocks;

// outstanding cuda events
ska::flat_hash_map<
cuda::CUDAStream,
@ -1329,6 +1334,11 @@ class DeviceCachingAllocator {
// capture. Cross-stream memory use is uncommon, so the deferral's
// effect on memory use during capture should be small.
process_events(context);
} else {
if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
// We check if there is some block that is safe to reuse on this stream
free_safe_blocks_in_capture(context, stream);
}
}
size_t size = round_size(orig_size);
auto& pool = get_pool(size, stream);
@ -1619,6 +1629,248 @@ class DeviceCachingAllocator {
return block;
}

// Insert "free marker" (empty nodes) into the CUDA graph for all streams that
|
||||
// have used the block, including the allocation stream. These nodes mark the
|
||||
// last use of the block in the capture graph. Returns a vector of the
|
||||
// inserted nodes, or an empty vector if any stream is not capturing.
|
||||
std::vector<cudaGraphNode_t> insert_free_marker(Block* block) {
|
||||
std::vector<cudaGraphNode_t> empty_nodes;
|
||||
|
||||
auto try_add_empty_node = [&](cudaStream_t stream) -> bool {
|
||||
cudaStreamCaptureStatus status{};
|
||||
cudaGraph_t graph{};
|
||||
const cudaGraphNode_t* deps = nullptr;
|
||||
size_t num_deps = 0;
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
|
||||
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
|
||||
stream, &status, nullptr, &graph, &deps, nullptr, &num_deps));
|
||||
#else
|
||||
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
|
||||
stream, &status, nullptr, &graph, &deps, &num_deps));
|
||||
#endif
|
||||
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
status != cudaStreamCaptureStatusInvalidated,
|
||||
"Invalid stream capture status");
|
||||
|
||||
if (status == cudaStreamCaptureStatusNone) {
|
||||
return false;
|
||||
}
|
||||
|
||||
cudaGraphNode_t node{};
|
||||
C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps));
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
|
||||
C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
|
||||
stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies));
|
||||
#else
|
||||
C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies(
|
||||
stream, &node, 1, cudaStreamSetCaptureDependencies));
|
||||
#endif
|
||||
empty_nodes.push_back(node);
|
||||
return true;
|
||||
};
|
||||
|
||||
// If any stream is not currently capturing, return an empty node vector.
|
||||
// An empty vector indicates that the block should be deferred for freeing
|
||||
// until after capture.
|
||||
|
||||
// Attempt to add an empty node for the allocation stream.
|
||||
if (!try_add_empty_node(block->stream)) {
|
||||
return {};
|
||||
}
|
||||
// Attempt to add empty nodes for all streams that have used the block.
|
||||
for (const auto& s : block->stream_uses) {
|
||||
if (!try_add_empty_node(s.stream())) {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
return empty_nodes;
|
||||
}
|
||||
|
||||
// Returns the current set of "terminal" nodes in the CUDA graph for a given
|
||||
// stream. These represent the current endpoints of the stream, and may
|
||||
// include additional nodes if the graph branches. Any new work captured will
|
||||
// be attached after one or more of these terminals.
|
||||
std::vector<cudaGraphNode_t> get_terminals(cudaStream_t stream) {
|
||||
std::vector<cudaGraphNode_t> result;
|
||||
|
||||
cudaStreamCaptureStatus status{};
|
||||
cudaGraph_t graph{};
|
||||
const cudaGraphNode_t* dependencies = nullptr;
|
||||
size_t num_dependencies = 0;
|
||||
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
|
||||
C10_CUDA_CHECK(cudaStreamGetCaptureInfo(
|
||||
stream,
|
||||
&status,
|
||||
nullptr,
|
||||
&graph,
|
||||
&dependencies,
|
||||
nullptr,
|
||||
&num_dependencies));
|
||||
#else
|
||||
C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2(
|
||||
stream, &status, nullptr, &graph, &dependencies, &num_dependencies));
|
||||
#endif
|
||||
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
status == cudaStreamCaptureStatusActive,
|
||||
"Invalid stream capture status");
|
||||
|
||||
for (size_t i = 0; i < num_dependencies; i++) {
|
||||
auto node = dependencies[i];
|
||||
if (node != nullptr) {
|
||||
result.push_back(node);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the set of "reusable" free markers (empty nodes) in the current
|
||||
// CUDA graph capture. A free marker is considered reusable if it is a
|
||||
// predecessor of every terminal node.
|
||||
// This ensures that all future captured work will occur after the free
|
||||
// marker, making it safe to reuse.
|
||||
ska::flat_hash_set<cudaGraphNode_t> get_reusable_empty_nodes(
|
||||
cudaStream_t stream) {
|
||||
auto terminals = get_terminals(stream);
|
||||
if (terminals.empty()) {
|
||||
// No terminal nodes found; nothing to free.
|
||||
return {};
|
||||
}
|
||||
|
||||
auto get_dependencies = [](cudaGraphNode_t node,
|
||||
cudaGraphNode_t* pDependencies,
|
||||
size_t* pNumDependencies) -> void {
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000)
|
||||
C10_CUDA_CHECK(cudaGraphNodeGetDependencies(
|
||||
node, pDependencies, nullptr, pNumDependencies));
|
||||
#else
|
||||
C10_CUDA_CHECK(
|
||||
cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies));
|
||||
#endif
|
||||
};
|
||||
|
||||
// Helper to retrieve all parent nodes (dependencies) of a given node.
|
||||
auto get_parents =
|
||||
[&](cudaGraphNode_t node) -> std::vector<cudaGraphNode_t> {
|
||||
size_t count = 0;
|
||||
get_dependencies(node, nullptr, &count);
|
||||
std::vector<cudaGraphNode_t> out(count);
|
||||
if (count) {
|
||||
get_dependencies(node, out.data(), &count);
|
||||
out.resize(count);
|
||||
}
|
||||
return out;
|
||||
};
|
||||
|
||||
// Helper to determine if a node is an empty node (used as a free marker).
|
||||
auto is_empty_node = [](cudaGraphNode_t n) -> bool {
|
||||
cudaGraphNodeType type{};
|
||||
C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type));
|
||||
return type == cudaGraphNodeTypeEmpty;
|
||||
};
|
||||
|
||||
// For each terminal node, perform a reverse DFS to count, for each empty
|
||||
// node, how many terminals it can reach (i.e., for how many terminals it is
|
||||
// a predecessor). An empty node is reusable if it is a predecessor of all
|
||||
// terminal nodes.
|
||||
ska::flat_hash_map<cudaGraphNode_t, size_t> num_terminals_reachable;
|
||||
|
||||
for (auto terminal : terminals) {
|
||||
ska::flat_hash_set<cudaGraphNode_t> visited;
|
||||
ska::flat_hash_set<cudaGraphNode_t> empty_nodes;
|
||||
|
||||
std::function<void(cudaGraphNode_t)> reverse_dfs =
|
||||
[&](cudaGraphNode_t node) {
|
||||
if (!visited.insert(node).second)
|
||||
return;
|
||||
|
||||
if (is_empty_node(node)) {
|
||||
num_terminals_reachable[node]++;
|
||||
empty_nodes.insert(node);
|
||||
}
|
||||
auto parents = get_parents(node);
|
||||
for (auto p : parents) {
|
||||
reverse_dfs(p);
|
||||
}
|
||||
};
|
||||
|
||||
reverse_dfs(terminal);
|
||||
}
|
||||
|
||||
ska::flat_hash_set<cudaGraphNode_t> reusable_empty_nodes;
|
||||
for (auto [node, count] : num_terminals_reachable) {
|
||||
if (count == terminals.size()) {
|
||||
reusable_empty_nodes.insert(node);
|
||||
}
|
||||
}
|
||||
|
||||
return reusable_empty_nodes;
|
||||
}
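The reachability criterion above is purely topological; a compact Python sketch of the same computation over an abstract DAG (illustrative only):

```python
def reusable_markers(terminals, parents, is_empty):
    # count, for each empty node, how many terminals it is a predecessor of
    reach_count = {}
    for t in terminals:
        seen = set()
        stack = [t]
        while stack:
            n = stack.pop()
            if n in seen:
                continue
            seen.add(n)
            if is_empty(n):
                reach_count[n] = reach_count.get(n, 0) + 1
            stack.extend(parents(n))
    # reusable = predecessor of every terminal
    return {n for n, c in reach_count.items() if c == len(terminals)}
```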

// A block is considered reusable during CUDA graph capture if every free
// marker (empty node) associated with the block is a predecessor of every
// terminal node.
//
// This ensures that any new operation added to the graph will be attached
// after all terminal nodes, which themselves are after all free markers. As a
// result, all future work is guaranteed to occur after the block's last use
// on every stream, so the block's previous lifetime ends before any new
// lifetime begins. This check relies solely on the DAG topology and does not
// require event queries, making it safe to use during capture.
//
// This function iterates over all deferred blocks, determines if their empty
// nodes are reusable according to the above criteria, and frees the block if
// so.
void free_safe_blocks_in_capture(
const std::shared_ptr<GatheredContext>& context,
cudaStream_t stream) {
auto reusable_empty_nodes = get_reusable_empty_nodes(stream);

// If there are no reusable empty nodes (e.g., not currently capturing),
// there is nothing to do.
if (reusable_empty_nodes.empty()) {
return;
}

std::vector<Block*> blocks_to_erase;

for (auto& [block, inserted_empty_nodes] : deferred_blocks) {
// Skip this block if it has no empty nodes, as we defer its freeing until
// after graph capture. Also skip if the block was not allocated on the
// current stream; such blocks will be freed when
// free_safe_blocks_in_capture is attempted on that stream.
if (inserted_empty_nodes.empty() || block->stream != stream) {
continue;
}

bool is_reusable = true;

for (const auto& node : inserted_empty_nodes) {
if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) {
is_reusable = false;
break;
}
}

if (is_reusable) {
// Clear stream uses since the graph ensures proper synchronization.
// No need to insert events.
block->stream_uses.clear();

free_block(block, context);
blocks_to_erase.push_back(block);
}
}

// Remove blocks that were freed from the deferred_blocks map.
for (auto* block : blocks_to_erase) {
deferred_blocks.erase(block);
}
}

void free(Block* block) {
std::shared_ptr<GatheredContext> context =
maybeGatherContext(RecordContext::ALL);
@ -1654,14 +1906,22 @@ class DeviceCachingAllocator {
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1);

// If the block has been used on more than one stream, handle accordingly.
if (!block->stream_uses.empty()) {
if (C10_UNLIKELY(!captures_underway.empty())) {
// It's forbidden to cudaEventQuery an event recorded during CUDA graph
// capture. We conservatively defer recording end-of-life events until
// the next call to process_events() (which won't happen until no
// captures are underway)
needs_events_deferred_until_no_capture.push_back(block);
if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) {
// insert_free_marker returns a vector of free markers,
// or an empty vector if any associated stream is not currently
// capturing. The empty vector means that we will defer the free until
// capture is finished.
deferred_blocks.emplace(block, insert_free_marker(block));
} else {
// If graph_capture_record_stream_reuse is not enabled, always defer
// the free until capture is finished.
deferred_blocks.emplace(block, std::vector<cudaGraphNode_t>{});
}
} else {
// If not in a capture, insert events for the block.
insert_events(block);
}
} else {
@ -3287,8 +3547,8 @@ class DeviceCachingAllocator {

void insert_events_deferred_until_no_capture(
const std::shared_ptr<GatheredContext>& context) {
if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) {
for (auto* block : needs_events_deferred_until_no_capture) {
if (C10_UNLIKELY(!deferred_blocks.empty())) {
for (auto& [block, inserted_empty_nodes] : deferred_blocks) {
TORCH_INTERNAL_ASSERT(!block->stream_uses.empty());
// only streams recorded before cudagraph will be used to insert events
// since we know all streams recorded during cudagraph must have
@ -3300,7 +3560,7 @@ class DeviceCachingAllocator {
free_block(block, context);
}
}
needs_events_deferred_until_no_capture.clear();
deferred_blocks.clear();
}
}

@ -3731,6 +3991,8 @@ class NativeCachingAllocator : public CUDAAllocator {
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.graph_capture_record_stream_reuse =
CUDAAllocatorConfig::graph_capture_record_stream_reuse();
md.roundup_power2_divisions =
CUDAAllocatorConfig::roundup_power2_divisions();


@ -163,6 +163,7 @@ struct AllocatorConfigInfo {
bool expandable_segments;
bool release_lock_on_malloc;
bool pinned_use_host_register;
bool graph_capture_record_stream_reuse;
std::string last_allocator_settings;
std::vector<size_t> roundup_power2_divisions;
};

@ -46,7 +46,7 @@ def define_targets(rules):
"util/typeid_test.cpp",
],
),
copts = ["-Wno-deprecated-declarations"],
copts = ["-Wno-deprecated-declarations", "-Wno-ctad-maybe-unsupported"],
deps = [
":Macros",
":complex_math_test_common",

@ -1638,6 +1638,10 @@ if(USE_CUDA)
# order of the libraries in the linker call matters here when statically
# linking; libculibos and cublas must be last.
target_link_libraries(torch_cuda PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
if(USE_FBGEMM_GENAI)
# Link fbgemm_genai to torch_cuda (only for (1) CUDA build for SM100).
target_link_libraries(torch_cuda PRIVATE fbgemm_genai)
endif()
endif()

# ---[ XPU library.
@ -1759,9 +1763,10 @@ if(USE_ROCM)
target_link_libraries(torch_hip PRIVATE ${Caffe2_HIP_DEPENDENCY_LIBS})

if(USE_FBGEMM_GENAI)
target_link_libraries(torch_hip PRIVATE fbgemm_genai)
if(USE_ROCM)
target_link_libraries(torch_hip PRIVATE fbgemm_genai)
endif()
endif()

# Since PyTorch files contain HIP headers, this is also needed to capture the includes.
# ROCM_INCLUDE_DIRS is defined in LoadHIP.cmake
target_include_directories(torch_hip PRIVATE ${Caffe2_HIP_INCLUDE} ${ROCM_INCLUDE_DIRS})

@ -1666,9 +1666,9 @@ if(USE_KINETO)
set(CMAKE_REQUIRED_LINK_OPTIONS "")
if(NOT EXCEPTIONS_WORK)
message(FATAL_ERROR
"Detected that statically linking against CUPTI causes exceptions to stop working. "
"See https://github.com/pytorch/pytorch/issues/57744 for more details. "
"Perhaps try: USE_CUPTI_SO=1 CMAKE_FRESH=1 python setup.py develop")
"Detected that statically linking against CUPTI causes exceptions to stop working. "
"See https://github.com/pytorch/pytorch/issues/57744 for more details. "
"Perhaps try: USE_CUPTI_SO=1 CMAKE_FRESH=1 python -m pip install -e . -v --no-build-isolation")
endif()
endif()


@ -46,8 +46,8 @@ IF(NOT MKLDNN_FOUND)
endif()
endif()
ExternalProject_Add(xpu_mkldnn_proj
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN
GIT_TAG v3.8.1
GIT_REPOSITORY https://github.com/uxlfoundation/oneDNN
GIT_TAG v3.9.1
PREFIX ${XPU_MKLDNN_DIR_PREFIX}
BUILD_IN_SOURCE 0
CMAKE_ARGS -DCMAKE_C_COMPILER=icx

@ -282,9 +282,15 @@ endif()
# cufft
add_library(caffe2::cufft INTERFACE IMPORTED)
if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::cufft_static_nocallback)
if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::cufft_static_nocallback)
else()
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
CUDA::cufft_static)
endif()
else()
set_property(
TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES

docs/source/compile/programming_model.error_on_graph_break.md (new file, 242 lines)
@ -0,0 +1,242 @@
---
file_format: mystnb
kernelspec:
  name: python3
mystnb:
  execution_timeout: 30
  execution_show_tb: True
  merge_streams: True
---

```{code-cell}
:tags: [remove-cell]
import torch

import header_code
torch._logging.set_logs(graph_breaks=True)
```

# Toggling `error_on_graph_break`

**Summary:**

- When `fullgraph=False`, we can use `torch._dynamo.error_on_graph_break()` for more flexibility in
  dealing with graph breaks.

So far, we have introduced two ways of dealing with graph breaks in `torch.compile`:
1. `fullgraph=True` errors on the first graph break and additionally guarantees that only one graph is traced from the code.
2. `fullgraph=False` continues tracing even when encountering graph breaks.

What if we want to disallow graph breaks for most of the code, but there are a few problematic functions where the graph breaks are hard to remove,
and we are okay with having those graph breaks? We can use `torch._dynamo.error_on_graph_break()` to achieve this.

`torch.compile` has an `error_on_graph_break` setting (initially set to `False`).
If a graph break or compiler error occurs in code while `error_on_graph_break` is set to `False`, then `torch.compile` will attempt to continue compilation after the graph break/error.
If `error_on_graph_break` is set to `True`, then `torch.compile` will abort compilation and propagate the error to user code.

A significant difference between `error_on_graph_break=True` and `fullgraph=True` is that the former **does not guarantee that a single graph will be captured**.
`error_on_graph_break` **can be arbitrarily toggled during compile time** by using the `torch._dynamo.error_on_graph_break()` context manager/decorator.
In comparison, once `fullgraph` is set to `True`, it cannot be set back to `False`.
Finally, `error_on_graph_break` has lower precedence than `fullgraph` - `error_on_graph_break` only takes effect when `fullgraph=False`.


## `error_on_graph_break(False)` example

```{code-cell}
@torch._dynamo.error_on_graph_break(False)
def code_with_a_difficult_graph_break(x):
    x = x + 1
    torch._dynamo.graph_break()
    return x + 2

def inner(x):
    return code_with_a_difficult_graph_break(x)

# NOTE: fullgraph=False
@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
    return inner(x)

# No error, but there is a graph break
fn(torch.randn(3))
```

Using `error_on_graph_break(False)` under `error_on_graph_break(True)` is helpful when we want to minimize graph breaks (i.e. follow the `fullgraph=True` programming model),
but there are some sections of code with non-performance-critical graph breaks that are difficult to work around.

`error_on_graph_break()` can be used as a context manager as well:

```{code-cell}
# NOTE: fullgraph=False
@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
    x = x + 1
    with torch._dynamo.error_on_graph_break(False):
        torch._dynamo.graph_break() # no error
    return x + 2

# No error, but there is a graph break
fn(torch.randn(3))
```

You can use monkey patching to toggle `error_on_graph_break` for code where you cannot edit the source (e.g. framework code):

```{code-cell}
class ThirdPartyModule(torch.nn.Module):
    def forward(self, x):
        x = x + 1
        torch._dynamo.graph_break()
        return x + 2

tp_mod = ThirdPartyModule()
tp_mod.forward = torch._dynamo.error_on_graph_break(False)(tp_mod.forward)

@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
    return tp_mod.forward(x)

# No error, but there is a graph break
fn(torch.randn(3))
```

## `error_on_graph_break(True)` example

```{code-cell}
@torch._dynamo.error_on_graph_break(True)
def inner2(x):
    x = x + 1
    torch._dynamo.graph_break() # error
    return x + 2

def inner(x):
    return inner2(x)

# fullgraph=False, error_on_graph_break=False
@torch.compile
def fn(x):
    x = x + 4
    torch._dynamo.graph_break() # no error
    return inner(x)

try:
    fn(torch.randn(3))
except Exception as e:
    print(e)
```

Using `error_on_graph_break(True)` under `error_on_graph_break(False)` is helpful when we want to use `torch.compile` flexibly (i.e. follow the `fullgraph=False` programming model),
but there are some sections of the code that are performance-critical and we want to ensure that those sections do not contain graph breaks.

## `error_on_graph_break` nesting behavior
|
||||
|
||||
`torch._dynamo.error_on_graph_break()` affects the `error_on_graph_break` setting of nested calls as well:
|
||||
|
||||
```{code-cell}
|
||||
def inner(x):
    x = x + 1
    torch._dynamo.graph_break()
    return x + 2

def inner2(x):
    with torch._dynamo.error_on_graph_break(False):
        return inner(x)

@torch._dynamo.error_on_graph_break(True)
@torch.compile
def fn(x):
    return inner2(x)

# no error
fn(torch.randn(3))
```

`torch._dynamo.error_on_graph_break()` can be used under another `torch._dynamo.error_on_graph_break()` region:

```{code-cell}
def inner(x):
    x = x + 1
    with torch._dynamo.error_on_graph_break(False):
        torch._dynamo.graph_break()
    return x + 2

def inner2(x):
    with torch._dynamo.error_on_graph_break(True):
        return inner(x)

@torch.compile
def fn(x):
    return inner2(x)

# no error
fn(torch.randn(3))
```

## Interaction with `fullgraph`

`fullgraph=True` takes precedence over `error_on_graph_break`:

```{code-cell}
@torch._dynamo.error_on_graph_break(False)
def inner(x):
    x = x + 1
    torch._dynamo.graph_break()
    return x + 2

@torch.compile(fullgraph=True)
def fn(x):
    return inner(x)

try:
    fn(torch.randn(3))
except Exception as e:
    print(e)
```

`fullgraph=True` cannot be toggled back to `fullgraph=False`:

```{code-cell}
@torch.compile(fullgraph=False)
def inner(x):
    x = x + 1
    torch._dynamo.graph_break()
    return x + 2

@torch.compile(fullgraph=True)
def fn(x):
    return inner(x)

try:
    fn(torch.randn(3))
except Exception as e:
    print(e)
```

```{code-cell}
@torch.compile(fullgraph=True)
def inner(x):
    x = x + 1
    torch._dynamo.graph_break()
    return x + 2

@torch.compile(fullgraph=False)
def fn(x):
    return inner(x)

try:
    fn(torch.randn(3))
except Exception as e:
    print(e)
```

## Summary of `fullgraph=True/False` vs `error_on_graph_break`

Here is a table summarizing the differences between `fullgraph=True/False` and `error_on_graph_break`:

| | `error_on_graph_break=True` | `error_on_graph_break=False` (default) |
| --- | --- | --- |
| `fullgraph=True` | Graph breaks result in errors. Only the first graph break will be reported. **One graph guarantee.**<br><br>`fullgraph` cannot be toggled to `False`. `error_on_graph_break` has no effect.<br><br>User code must be fully compatible with `torch.compile`. Guarantees no performance hits from graph breaks (because there are no graph breaks).<br><br>Ideal for code sensitive to graph breaks: framework/library code or cases where getting maximum performance is required. Prevents downstream user code from inadvertently allowing graph breaks. | Same as `fullgraph=True` and `error_on_graph_break=True`, as `error_on_graph_break` has no effect when `fullgraph=True`. |
| `fullgraph=False` (default) | Graph breaks result in errors. Only the first graph break will be reported. **No one graph guarantee.**<br><br>`error_on_graph_break` can be toggled to `False`.<br><br>User code must be fully compatible with `torch.compile`. Guarantees no performance hits from graph breaks (because there are no graph breaks).<br><br>Ideal for user code sensitive to graph breaks. `error_on_graph_break` can be toggled to `False` to deal with sections that have graph breaks that are difficult to work around. | Will continue to compile after encountering graph breaks. All graph breaks will be reported.<br><br>`error_on_graph_break` can be toggled to `True`.<br><br>Doesn’t require many user code changes to work. Performance may be negatively impacted due to graph breaks.<br><br>Ideal for out-of-the-box use cases, on “non-weird” code, or where squeezing maximal performance is not necessary. |

@ -19,6 +19,7 @@ The strategy for using `torch.compile(fullgraph=False)` is as follows:
```{toctree}
programming_model.where_to_apply_compile
programming_model.compiler_disable
programming_model.error_on_graph_break
programming_model.nested_graph_breaks
programming_model.skipped_functions
```

@ -645,6 +645,7 @@ export/programming_model
export/ir_spec
export/pt2_archive
export/draft_export
export/joint_with_descriptors
cond
generated/exportdb/index
torch.compiler_aot_inductor

docs/source/export/joint_with_descriptors.md (new file, 111 lines)
@ -0,0 +1,111 @@

# Joint with descriptors

Joint with descriptors is an experimental API for exporting a traced joint
graph that supports all of torch.compile's features in full generality and,
after processing, can be converted back into a differentiable callable that
can be executed as normal. For example, it is used to implement autoparallel,
a system that takes a model and reshards inputs and parameters to make it
a distributed SPMD program.

```{eval-rst}
.. currentmodule:: torch._functorch.aot_autograd
.. autofunction:: aot_export_joint_with_descriptors
.. autofunction:: aot_compile_joint_with_descriptors
```
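
A minimal end-to-end sketch of the intended workflow. This is illustrative
only: the `ExitStack`-based calling convention and the processing step shown
here are assumptions, and the generated signatures above are authoritative.

```python
import contextlib

import torch
from torch._functorch.aot_autograd import (
    aot_compile_joint_with_descriptors,
    aot_export_joint_with_descriptors,
)

model = torch.nn.Linear(4, 4)
example_inputs = (torch.randn(2, 4),)

with contextlib.ExitStack() as stack:
    # Export the joint forward-backward graph; descriptors identify what each
    # placeholder and output is (parameter, buffer, input, gradient, ...).
    joint = aot_export_joint_with_descriptors(stack, model, example_inputs)

    # Process the joint graph here, e.g. reshard inputs and parameters for
    # SPMD (this is where a system like autoparallel does its rewriting).

    # Convert the processed joint back into a differentiable callable.
    compiled_fn = aot_compile_joint_with_descriptors(joint)
```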

## Descriptors

```{eval-rst}
.. currentmodule:: torch._functorch._aot_autograd.descriptors

.. autoclass:: AOTInput
    :members:

.. autoclass:: AOTOutput
    :members:

.. autoclass:: BackwardTokenAOTInput
    :members:

.. autoclass:: BackwardTokenAOTOutput
    :members:

.. autoclass:: BufferAOTInput
    :members:

.. autoclass:: DummyAOTInput
    :members:

.. autoclass:: DummyAOTOutput
    :members:

.. autoclass:: GradAOTOutput
    :members:

.. autoclass:: InputMutationAOTOutput
    :members:

.. autoclass:: IntermediateBaseAOTOutput
    :members:

.. autoclass:: ParamAOTInput
    :members:

.. autoclass:: PhiloxBackwardBaseOffsetAOTInput
    :members:

.. autoclass:: PhiloxBackwardSeedAOTInput
    :members:

.. autoclass:: PhiloxForwardBaseOffsetAOTInput
    :members:

.. autoclass:: PhiloxForwardSeedAOTInput
    :members:

.. autoclass:: PhiloxUpdatedBackwardOffsetAOTOutput
    :members:

.. autoclass:: PhiloxUpdatedForwardOffsetAOTOutput
    :members:

.. autoclass:: PlainAOTInput
    :members:

.. autoclass:: PlainAOTOutput
    :members:

.. autoclass:: SavedForBackwardsAOTOutput
    :members:

.. autoclass:: SubclassGetAttrAOTInput
    :members:

.. autoclass:: SubclassGetAttrAOTOutput
    :members:

.. autoclass:: SubclassSizeAOTInput
    :members:

.. autoclass:: SubclassSizeAOTOutput
    :members:

.. autoclass:: SubclassStrideAOTInput
    :members:

.. autoclass:: SubclassStrideAOTOutput
    :members:

.. autoclass:: SyntheticBaseAOTInput
    :members:

.. autoclass:: ViewBaseAOTInput
    :members:
```

## FX utilities

```{eval-rst}
.. automodule:: torch._functorch._aot_autograd.fx_utils
    :members:
```

@ -608,6 +608,14 @@ Available options:
  for processing events. This avoids any slow path associated with querying/processing of
  events in the fast allocation path. This feature is disabled by default.

* ``graph_capture_record_stream_reuse`` (experimental, default: `False`)
  If set to `True`, the CUDA caching allocator will attempt to reclaim device memory during
  CUDA Graph capture by using the graph topology (instead of CUDA events) to determine
  when a freed block is safe to reuse. This can reduce peak memory during long captures that free
  and reallocate buffers across multiple streams, especially when the capture DAG frequently
  reaches joined frontiers. Note: enabling this option can significantly increase the time spent
  capturing the graph.
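
  A minimal sketch of opting in, assuming this option is set through the same
  ``PYTORCH_CUDA_ALLOC_CONF`` mechanism as the other options on this page (it
  must be configured before the allocator is first initialized):

  .. code-block:: python

      import os

      # Assumption: option name as documented above; set before the first
      # CUDA allocation so the caching allocator picks it up.
      os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "graph_capture_record_stream_reuse:True"

      import torch  # imported after configuring the allocator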

.. note::

    Some stats reported by the
@ -16,6 +16,39 @@ requires = [
]
build-backend = "setuptools.build_meta"

[dependency-groups]
dev = [
    # This list should be kept in sync with the requirements-build.txt
    # in PyTorch root until the project fully migrates to pyproject.toml
    # after which this can be removed as it is already specified in the
    # [build-system] section
    "setuptools>=70.1.0,<80.0",  # setuptools develop deprecated on 80.0
    "cmake>=3.27",
    "ninja",
    "numpy",
    "packaging",
    "pyyaml",
    "requests",
    "six",  # dependency chain: NNPACK -> PeachPy -> six
    "typing-extensions>=4.10.0",

    # This list should be kept in sync with the requirements.txt in
    # PyTorch root until the project fully migrates to pyproject.toml
    "build[uv]",
    "expecttest>=0.3.0",
    "filelock",
    "fsspec>=0.8.5",
    "hypothesis",
    "jinja2",
    "lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'",
    "networkx>=2.5.1",
    "optree>=0.13.0",
    "psutil",
    "sympy>=1.13.3",
    "typing-extensions>=4.13.2",
    "wheel",
]

[project]
name = "torch"
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"

setup.py (4 changed lines)
@ -58,8 +58,8 @@
# USE_FBGEMM=0
#   disables the FBGEMM build
#
# USE_FBGEMM_GENAI=1
#   enables the FBGEMM GenAI kernels to build
# USE_FBGEMM_GENAI=0
#   disables the FBGEMM GenAI build
#
# USE_KINETO=0
#   disables usage of libkineto library for profiling


@ -40,8 +40,16 @@ set(NATIVERT_TEST_SRCS
    ${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
    ${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
    ${TORCH_ROOT}/torch/nativert/kernels/KernelHandlerRegistry.cpp
    ${TORCH_ROOT}/torch/nativert/kernels/TritonKernel.cpp
    ${TORCH_ROOT}/torch/nativert/executor/triton/CpuTritonKernelManager.cpp
    ${TORCH_ROOT}/torch/nativert/executor/DelegateExecutor.cpp
)

if(USE_CUDA)
  list(APPEND NATIVERT_TEST_SRCS ${TORCH_ROOT}/torch/nativert/executor/triton/CudaTritonKernelManager.cpp)
endif()

add_executable(test_nativert
    ${TORCH_ROOT}/test/cpp/common/main.cpp
    ${NATIVERT_TEST_SRCS}

@ -0,0 +1,14 @@
#include <gtest/gtest.h>

#include <torch/nativert/kernels/TritonKernel.h>

using namespace ::testing;
using namespace torch::nativert;

TEST(TritonKernelManagerRegistrationTests, TestRegister) {
#ifndef USE_CUDA
  EXPECT_TRUE(create_cuda_triton_kernel_manager == nullptr);
#else
  EXPECT_FALSE(create_cuda_triton_kernel_manager == nullptr);
#endif // USE_CUDA
}

@ -12,7 +12,7 @@ from torch.distributed.fsdp import fully_shard
from torch.distributed.tensor.debug import CommDebugMode
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import FSDPTest, get_devtype, MLPStack
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
from torch.testing._internal.distributed._tensor.common_dtensor import (
    ModelArgs,
    Transformer,
@ -123,6 +123,7 @@ class TestClipGradNormWorldSize4(_TestClipGradNormBase):
        return min(torch.get_device_module(device_type).device_count(), 4)

    @skip_if_lt_x_gpu(4)
    @xfailIf(TEST_XPU)  # https://github.com/intel/torch-xpu-ops/issues/1661
    def test_clip_grad_norm_2d(self):
        for norm_type in (2, 1, 3, float("inf")):
            dp_size = 2

@ -5,6 +5,7 @@ import functools
import itertools
import os
import tempfile
import unittest
from typing import Callable, Optional, Union
from unittest.mock import MagicMock

@ -54,7 +55,7 @@ from torch.testing._internal.common_fsdp import (
    patch_reshard,
    patch_unshard,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
from torch.testing._internal.distributed._tensor.common_dtensor import (
    ModelArgs,
    Transformer,
@ -414,6 +415,7 @@ class TestFullyShardCommunication(FSDPTest):
        )

    @skip_if_lt_x_gpu(2)
    @xfailIf(TEST_XPU)  # https://github.com/intel/torch-xpu-ops/issues/1571
    def test_set_reduce_scatter_divide_factor(self):
        self.run_subtests(
            {"divide_factor": [self.world_size * 2, self.world_size]},
@ -1454,6 +1456,9 @@ class TestFullyShardForceSumReduction(FSDPTest):

    # Test reduce-scatter only on plain FSDP on 2 GPUs
    @skip_if_lt_x_gpu(2)
    @unittest.skipIf(
        TEST_XPU, "Related environment variable is not supported with XCCL"
    )
    def test_fully_shard_force_sum_reduce_scatter(self):
        torch.manual_seed(42)
        model_args = ModelArgs()
@ -1506,6 +1511,9 @@ class TestFullyShardForceSumReduction(FSDPTest):

    # Test both reduce-scatter and all-reduce on HSDP (DDP+FSDP) on 4 GPUs
    @skip_if_lt_x_gpu(4)
    @unittest.skipIf(
        TEST_XPU, "Related environment variable is not supported with XCCL"
    )
    def test_fully_shard_force_sum_both_reductions(self):
        mesh = init_device_mesh(
            device_type.type, (2, self.world_size // 2), mesh_dim_names=("ddp", "fsdp")

@ -133,7 +133,7 @@ class TestFullyShardCompile(FSDPTest):
            device_type.type,
            self.rank % torch.get_device_module(device_type).device_count(),
        )
        if not sm_is_or_higher_than(device, 8, 0):
        if device_type.type == "cuda" and not sm_is_or_higher_than(device, 8, 0):
            self.skipTest("bf16 requires sm >= 8.0")

    def test_dynamo_trace_use_training_state(self):

@ -24,7 +24,7 @@ from torch.testing._internal.common_fsdp import (
    patch_register_post_backward_hook_backward,
    reduce_scatter_with_assert,
)
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf


device_type = torch.device(get_devtype())
@ -36,6 +36,7 @@ class TestFullyShardFrozen(FSDPTest):
        return min(4, torch.get_device_module(device_type).device_count())

    @skip_if_lt_x_gpu(2)
    @xfailIf(TEST_XPU)  # https://github.com/pytorch/pytorch/issues/156782
    def test_train_mixed_requires_grad_per_group(self):
        """
        Tests training parity with DDP when mixing frozen and non-frozen

@ -8,7 +8,12 @@ import torch
from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, OffloadPolicy
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import FSDPTest, get_devtype
from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
from torch.testing._internal.common_utils import (
    run_tests,
    TEST_CUDA,
    TEST_HPU,
    TEST_XPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
    ModelArgs,
    Transformer,
@ -236,14 +241,15 @@ class TestFullyShardMemory(FSDPTest):

    def _get_peak_active_memory_mb(self) -> int:
        mem_stats = torch.get_device_module(device_type).memory_stats()
        if TEST_CUDA:
        if TEST_CUDA or TEST_XPU:
            return round(mem_stats["active_bytes.all.peak"] / 1e6)
        if TEST_HPU:
            return round(mem_stats["MaxInUse"] / 1e6)

    def _get_curr_active_memory_mb(self) -> int:
        mem_stats = torch.get_device_module(device_type).memory_stats()
        if TEST_CUDA:
        if TEST_CUDA or TEST_XPU:
            return round(mem_stats["active_bytes.all.current"] / 1e6)
        if TEST_HPU:
            return round(mem_stats["InUse"] / 1e6)

@ -28,7 +28,13 @@ from torch.testing._internal.common_fsdp import (
    patch_reduce_scatter,
    reduce_scatter_with_assert,
)
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TEST_HPU
from torch.testing._internal.common_utils import (
    run_tests,
    skipIfRocmVersionLessThan,
    TEST_HPU,
    TEST_XPU,
    xfailIf,
)


device_type = torch.device(get_devtype())
@ -86,9 +92,10 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
            use_shard_placement_fn_vals.append(True)
        return use_shard_placement_fn_vals

    @skipIfRocm  # regressed in ROCm 6.4, but ROCm 6.5 fixes it
    @skipIfRocmVersionLessThan((7, 0))
    @skip_if_lt_x_gpu(2)
    @requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
    @xfailIf(TEST_XPU)  # https://github.com/pytorch/pytorch/issues/156782
    def test_compute_dtype(self):
        use_shard_placement_fn_vals = (
            self._get_use_shard_placement_fn_vals_for_bf16_reduce()
@ -166,9 +173,10 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
        self.assertEqual(fsdp_loss, ref_loss)
        check_sharded_parity(self, ref_model, model)

    @skipIfRocm  # regressed in ROCm 6.4, but ROCm 6.5 fixes it
    @skipIfRocmVersionLessThan((7, 0))
    @skip_if_lt_x_gpu(2)
    @requires_nccl_version((2, 10), "Need NCCL 2.10+ for bf16 collectives")
    @xfailIf(TEST_XPU)  # https://github.com/pytorch/pytorch/issues/156782
    def test_reduce_dtype(self):
        self.run_subtests(
            {
@ -291,6 +299,7 @@ class TestFullyShardMixedPrecisionTraining(FSDPTest):
        check_sharded_parity(self, ref_model, model)

    @skip_if_lt_x_gpu(2)
    @xfailIf(TEST_XPU)  # https://github.com/pytorch/pytorch/issues/156782
    def test_grad_acc_with_reduce_dtype(self):
        """
        Tests that gradient accumulation without reduce-scatter when using
@ -610,7 +619,7 @@ class TestFullyShardMixedPrecisionCasts(FSDPTestMultiThread):
            torch.bfloat16, torch.bfloat16, torch.bfloat16, True
        )
        model = Model()
        inp = Input(torch.randn(2, 10).cuda())
        inp = Input(torch.randn(2, 10).to(device_type))

        fully_shard(model, mp_policy=mp_policy)
        loss = model(inp).sum()

@ -42,7 +42,9 @@ from torch.testing._internal.common_utils import (
    get_cycles_per_ms,
    run_tests,
    TEST_HPU,
    TEST_XPU,
    wrapSwapTensorsTest,
    xfailIf,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
    ModelArgs,
@ -324,7 +326,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
        self.assertEqual(losses[0], losses[1])

    @skip_if_lt_x_gpu(2)
    @unittest.skipIf(TEST_HPU, "Sleep kernel not supported for HPU")
    @unittest.skipIf(TEST_HPU or TEST_XPU, "Sleep kernel not supported for HPU/XPU")
    @compiled_fsdp_test(compile_compute_on_module=Transformer)
    def test_train_parity_multi_group(self):
        """
@ -347,7 +349,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
        )

    @skip_if_lt_x_gpu(2)
    @unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
    @unittest.skipIf(TEST_HPU or TEST_XPU, "sleep kernel not supported on HPU/XPU")
    def test_train_parity_multi_group_cpu_offload_eager(self):
        """
        Tests train parity against DDP when using multiple parameter groups for
@ -371,7 +373,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
        )

    @skip_if_lt_x_gpu(2)
    @unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
    @unittest.skipIf(TEST_HPU or TEST_XPU, "sleep kernel not supported on HPU/XPU")
    @compiled_fsdp_test(compile_compute_on_module=Transformer)
    def test_train_parity_multi_group_unshard_async_op(self):
        """
@ -495,6 +497,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
        self.assertEqual(losses[0], losses[1])

    @skip_if_lt_x_gpu(2)
    @unittest.skipIf(TEST_XPU, "Sleep is not supported on XPU")
    def test_non_root_forward_backward(self):
        """
        Tests running forward/backward through the root and then through a
@ -625,7 +628,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
        self.assertEqual(losses[0], losses[1])

    @skip_if_lt_x_gpu(2)
    @unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
    @unittest.skipIf(TEST_HPU or TEST_XPU, "Sleep is not supported on HPU/XPU")
    def test_post_optim_event(self):
        torch.manual_seed(42)
        model_args = ModelArgs(dropout_p=0.0)
@ -678,6 +681,7 @@ class TestFullyShard1DTrainingCompose(FSDPTest):

    @skip_if_lt_x_gpu(2)
    @compiled_fsdp_test(compile_compute_on_module=Transformer)
    @xfailIf(TEST_XPU)  # https://github.com/intel/torch-xpu-ops/issues/1661
    def test_train_parity_with_activation_checkpointing(self):
        """
        Tests train parity against DDP when composing with activation
@ -930,6 +934,7 @@ class TestFullyShardGradientAccumulation(FSDPTest):
        return min(4, torch.get_device_module(device_type).device_count())

    @skip_if_lt_x_gpu(2)
    @xfailIf(TEST_XPU)  # https://github.com/pytorch/pytorch/issues/156782
    def test_gradient_accumulation(self):
        """
        Tests gradient accumulation with/without gradient reduction and
@ -1111,6 +1116,7 @@ class TestFullyShardGradientAccumulation(FSDPTest):
            _optim.zero_grad(set_to_none=(iter_idx % 2))

    @skip_if_lt_x_gpu(2)
    @xfailIf(TEST_XPU)  # https://github.com/pytorch/pytorch/issues/156782
    def test_1f1b_microbatching(self):
        self.run_subtests(
            {

@ -7,7 +7,6 @@ import torch.nn as nn
from torch.distributed._tools.mem_tracker import MemTracker
from torch.testing._internal.common_utils import (
    run_tests,
    skipIfRocm,
    skipIfTorchDynamo,
    TEST_CUDA,
    TEST_XPU,
@ -34,7 +33,6 @@ class TestMemTracker(TestCase):
    @unittest.skipIf(
        not TEST_CUDA and not TEST_XPU, "Neither CUDA nor XPU is available"
    )
    @skipIfRocm()
    def test_accelerator_tracker_equivalence(
        self,
    ):

@ -1,7 +1,7 @@
# Owner(s): ["oncall: distributed checkpointing"]

import tempfile
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch

import torch
from torch.distributed.checkpoint.metadata import MetadataIndex
@ -23,38 +23,70 @@ class TestQuantizedHfStorage(TestCase):
        self.temp_dir.cleanup()

    def test_dequantization(self):
        """Test that quantized tensors are properly dequantized during read operations."""
        """Test quantized tensors with weights and scales in both same and different files."""
        reader = QuantizedHuggingFaceStorageReader(self.path, thread_count=1)

        # Test data
        quantized_tensor = torch.ones(4, 4, dtype=torch.float32)
        scale_inv = torch.tensor([[2.0]], dtype=torch.float32)
        # Test data for two different weights
        quantized_tensor1 = torch.ones(4, 4, dtype=torch.float32)
        quantized_tensor2 = (
            torch.ones(4, 4, dtype=torch.float32) * 3.0
        )  # Different values
        scale_inv1 = torch.tensor([[2.0]], dtype=torch.float32)
        scale_inv2 = torch.tensor([[0.5]], dtype=torch.float32)  # Different scale

        # Mock the safetensors file for reading data
        mock_file = MagicMock()
        # Define weight and scale tensor names
        weight1_fqn = "model.layers.0.self_attn.q_proj.weight"  # Scale in same file
        scale1_fqn = "model.layers.0.self_attn.q_proj.weight_scale_inv"
        weight2_fqn = (
            "model.layers.0.self_attn.k_proj.weight"  # Scale in different file
        )
        scale2_fqn = "model.layers.0.self_attn.k_proj.weight_scale_inv"

        # Mock get_slice to return a tensor that can be sliced
        def mock_get_slice(tensor_name):
            mock_tensor = MagicMock()
            mock_tensor.__getitem__ = lambda self, slices: quantized_tensor
            return mock_tensor

        mock_file.get_slice = mock_get_slice
        mock_file.get_tensor.return_value = scale_inv
        file1_name = "model-00001-of-00002.safetensors"
        file2_name = "model-00002-of-00002.safetensors"

        # Setup weight-scale mapping and file locations
        reader._weight_scale_mapping = {
            "model.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
            weight1_fqn: scale1_fqn,
            weight2_fqn: scale2_fqn,
        }
        reader._weight_map = {
            weight1_fqn: file1_name,  # Weight in file 1
            scale1_fqn: file1_name,  # Scale also in file 1 (same file scenario)
            weight2_fqn: file1_name,  # Weight in file 1
            scale2_fqn: file2_name,  # Scale in file 2 (different file scenario)
        }

        # Create a read request for quantized tensor
        read_item = ReadItem(
        # Mock the main safetensors file (file1)
        mock_file1 = MagicMock()

        # Mock get_slice to return different tensors based on tensor name
        def mock_get_slice(tensor_name):
            mock_tensor = MagicMock()
            if tensor_name == weight1_fqn:
                mock_tensor.__getitem__ = lambda _, __: quantized_tensor1
            elif tensor_name == weight2_fqn:
                mock_tensor.__getitem__ = lambda _, __: quantized_tensor2
            return mock_tensor

        mock_file1.get_slice = mock_get_slice

        # Mock get_tensor for same-file scale (scale1)
        mock_file1.get_tensor.return_value = scale_inv1

        # Mock the cross-file safetensors file (file2) for scale2
        mock_file2 = MagicMock()
        mock_file2.get_tensor.return_value = scale_inv2

        # Test 1: Same-file scenario (weight1 + scale1 both in file1)
        read_item1 = ReadItem(
            type=LoadItemType.TENSOR,
            storage_index=MetadataIndex(
                fqn="model.layers.0.self_attn.kv_b_proj.weight",
                fqn=weight1_fqn,
                offset=torch.Size([0, 0]),
            ),
            dest_index=MetadataIndex(
                fqn="model.layers.0.self_attn.kv_b_proj.weight",
                fqn=weight1_fqn,
                offset=torch.Size([0, 0]),
            ),
            storage_offsets=[0, 0],
@ -62,22 +94,73 @@ class TestQuantizedHfStorage(TestCase):
            lengths=[4, 4],
        )

        # Mock planner
        target_tensor = torch.zeros(4, 4, dtype=torch.float32)
        mock_planner = MagicMock()
        mock_planner.resolve_tensor.return_value = target_tensor
        target_tensor1 = torch.zeros(4, 4, dtype=torch.float32)
        mock_planner1 = MagicMock()
        mock_planner1.resolve_tensor.return_value = target_tensor1

        # Test the _process_read_request method
        reader._process_read_request(mock_file, read_item, mock_planner)
        # Process first weight (same file scenario)
        reader._process_read_request(mock_file1, read_item1, mock_planner1)

        # Verify the tensor was dequantized (ones * 2.0 = twos)
        expected_result = torch.ones(4, 4, dtype=torch.float32) * 2.0
        mock_planner.commit_tensor.assert_called_once()
        # Verify first tensor was dequantized (ones * 2.0 = twos)
        expected_result1 = torch.ones(4, 4, dtype=torch.float32) * 2.0
        mock_planner1.commit_tensor.assert_called_once()

        # Check that target_tensor was updated correctly
        args, _ = mock_planner.commit_tensor.call_args
        committed_tensor = args[1]  # second argument is the tensor
        torch.testing.assert_close(committed_tensor, expected_result)
        # Check that target_tensor1 was updated correctly
        args1, _ = mock_planner1.commit_tensor.call_args
        committed_tensor1 = args1[1]
        torch.testing.assert_close(committed_tensor1, expected_result1)

        # Test 2: Cross-file scenario (weight2 in file1, scale2 in file2)
        read_item2 = ReadItem(
            type=LoadItemType.TENSOR,
            storage_index=MetadataIndex(
                fqn=weight2_fqn,
                offset=torch.Size([0, 0]),
            ),
            dest_index=MetadataIndex(
                fqn=weight2_fqn,
                offset=torch.Size([0, 0]),
            ),
            storage_offsets=[0, 0],
            dest_offsets=[0, 0],
            lengths=[4, 4],
        )

        target_tensor2 = torch.zeros(4, 4, dtype=torch.float32)
        mock_planner2 = MagicMock()
        mock_planner2.resolve_tensor.return_value = target_tensor2

        # Mock the entire safetensors module since it may not be available in test environment
        mock_safetensors = MagicMock()
        mock_safe_open = MagicMock()
        mock_safetensors.safe_open = mock_safe_open

        # Set up the mock to return a context manager that yields mock_file2
        mock_safe_open.return_value.__enter__.return_value = mock_file2
        mock_safe_open.return_value.__exit__.return_value = False

        # Mock the module import and safe_open function
        with patch.dict("sys.modules", {"safetensors": mock_safetensors}):
            # Process second weight (cross-file scenario)
            reader._process_read_request(mock_file1, read_item2, mock_planner2)

        # Verify safe_open was called with the correct file path
        expected_path = f"{self.path}/{file2_name}"
        mock_safe_open.assert_called_once()
        call_args = mock_safe_open.call_args[0]
        self.assertEqual(str(call_args[0]), expected_path)

        # Verify the scale tensor was loaded from the correct file
        mock_file2.get_tensor.assert_called_once_with(scale2_fqn)

        # Verify second tensor was dequantized (3.0 * 0.5 = 1.5)
        expected_result2 = torch.ones(4, 4, dtype=torch.float32) * 3.0 * 0.5  # 1.5
        mock_planner2.commit_tensor.assert_called_once()

        # Check that target_tensor2 was updated correctly
        args2, _ = mock_planner2.commit_tensor.call_args
        committed_tensor2 = args2[1]
        torch.testing.assert_close(committed_tensor2, expected_result2)


if __name__ == "__main__":

@ -116,7 +116,6 @@ class DistributedUtilTest(TestCase):
            timeout=1,
        )

    @skipIfRocm
    def test_create_store_timeout_on_worker(self):
        with self.assertRaises(DistNetworkError):
            # use any available port (port 0) since timeout is expected

@ -44,8 +44,11 @@ class TestFlattenParams(FSDPTest):
        return 1

    def _get_default_config(self):
        device_type = (
            acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
        )
        return {
            "device": torch.device("cuda"),
            "device": torch.device(device_type),
            "sharding_strategy": HandleShardingStrategy.FULL_SHARD,
            "offload_params": False,
            "mp_param_dtype": None,

@ -31,6 +31,8 @@ if TEST_WITH_DEV_DBG_ASAN:
    )
    sys.exit(0)

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


class Model(nn.Module):
    def __init__(
@ -47,7 +49,6 @@ class Model(nn.Module):
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
            nn.Flatten(),
        )
        self.device = torch.cuda.current_device()
        self.head = nn.Linear(64, 10)
        if with_fsdp and freeze_after_wrap_fsdp:
            self.fsdp_wrap(fsdp_kwargs)
@ -145,7 +146,7 @@ class TestFreezingWeights(FSDPTest):
        forward_prefetch,
    ):
        torch.manual_seed(0)
        batch = torch.randn(size=(2, 3, 224, 224)).cuda()
        batch = torch.randn(size=(2, 3, 224, 224)).to(device_type)

        fsdp_kwargs = {
            "device_id": self.rank,
@ -164,7 +165,7 @@ class TestFreezingWeights(FSDPTest):
            disable_autograd,
            fsdp_kwargs,
        )
        model = model.cuda()
        model = model.to(device_type)

        # freezing the trunk using requires_grad.
        if freezing_method == FreezingMethod.RequiresGrad:
@ -178,7 +179,7 @@ class TestFreezingWeights(FSDPTest):
        else:
            model = DistributedDataParallel(model, **ddp_kwargs)

        target = torch.tensor([0, 1], dtype=torch.long).cuda()
        target = torch.tensor([0, 1], dtype=torch.long).to(device_type)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)


@ -49,6 +49,8 @@ if TEST_WITH_DEV_DBG_ASAN:
    )
    sys.exit(0)

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


@contextlib.contextmanager
def patch_allreduce(new_allreduce):
@ -97,7 +99,7 @@ class ShardingStrategyMode(Enum):
class TestFSDPHybridShard(FSDPTest):
    @property
    def world_size(self):
        return max(torch.cuda.device_count(), 2)
        return max(torch.accelerator.device_count(), 2)

    @property
    def process_group(self):
@ -105,7 +107,7 @@ class TestFSDPHybridShard(FSDPTest):

    @skip_if_lt_x_gpu(2)
    def test_raises_manual_wrap_hybrid_shard_when_none_policy(self):
        model = MyModel().cuda()
        model = MyModel().to(device_type)
        err_ctx = self.assertRaisesRegex(
            ValueError,
            "requires explicit specification of process group or device_mesh.",
@ -119,8 +121,8 @@ class TestFSDPHybridShard(FSDPTest):

    @skip_if_lt_x_gpu(4)
    def test_hsdp_save_load_state_dict(self):
        model = MyModel().cuda()
        num_node_devices = torch.cuda.device_count()
        model = MyModel().to(device_type)
        num_node_devices = torch.accelerator.device_count()
        shard_rank_lists = (
            list(range(0, num_node_devices // 2)),
            list(range(num_node_devices // 2, num_node_devices)),
@ -161,7 +163,7 @@ class TestFSDPHybridShard(FSDPTest):
        msd = model.state_dict()
        osd = FSDP.optim_state_dict(model, optim)

        load_model = fsdp_ctor(MyModel().cuda())
        load_model = fsdp_ctor(MyModel().to(device_type))
        load_optim = torch.optim.AdamW(load_model.parameters())
        with FSDP.state_dict_type(load_model, StateDictType.SHARDED_STATE_DICT):
            load_model.load_state_dict(msd)
@ -170,8 +172,8 @@ class TestFSDPHybridShard(FSDPTest):

    @skip_if_lt_x_gpu(4)
    def test_hsdp_sync_module_state(self):
        model = MyModel().cuda()
        num_node_devices = torch.cuda.device_count()
        model = MyModel().to(device_type)
        num_node_devices = torch.accelerator.device_count()
        shard_rank_lists = (
            list(range(0, num_node_devices // 2)),
            list(range(num_node_devices // 2, num_node_devices)),
@ -214,7 +216,7 @@ class TestFSDPHybridShard(FSDPTest):
    @skip_if_lt_x_gpu(2)
    def test_invalid_pg_specification_raises(self):
        pol = ModuleWrapPolicy({nn.Linear})
        model = MyModel().cuda()
        model = MyModel().to(device_type)
        with self.assertRaisesRegex(
            ValueError, "Expected process_group to be passed in"
        ):
@ -260,7 +262,7 @@ class TestFSDPHybridShard(FSDPTest):
        use_device_mesh: bool,
    ):
        if use_device_mesh:
            device_mesh = init_device_mesh("cuda", (1, self.world_size))
            device_mesh = init_device_mesh(device_type, (1, self.world_size))
        else:
            device_mesh = None
        hsdp_model = self._init_hsdp_model(
@ -316,7 +318,7 @@ class TestFSDPHybridShard(FSDPTest):
            patch_allreduce(patched_allreduce),
            patch_reduce_scatter(patched_reduce_scatter),
        ):
            inp = hsdp_model.get_input(device=torch.cuda.current_device())
            inp = hsdp_model.get_input(device=torch.accelerator.current_device_index())
            out = hsdp_model(inp[0], inp[1])
            loss = hsdp_model.get_loss(inp, out)
            loss.backward()
@ -365,7 +367,7 @@ class TestFSDPHybridShard(FSDPTest):
        hsdp_optim = torch.optim.Adam(hsdp_model.parameters(), lr=1e-2)
        torch.manual_seed(global_pg.rank() + 1)
        for _ in range(5):
            inp = fsdp_model.module.get_input(torch.device("cuda"))
            inp = fsdp_model.module.get_input(torch.device(device_type))
            losses: list[torch.Tensor] = []
            for model, optim in ((fsdp_model, fsdp_optim), (hsdp_model, hsdp_optim)):
                optim.zero_grad()
@ -381,7 +383,7 @@ class TestFSDPHybridShard(FSDPTest):
        )
        hsdp_kwargs = {
            "auto_wrap_policy": auto_wrap_policy,
            "device_id": torch.cuda.current_device(),
            "device_id": torch.accelerator.current_device_index(),
            "use_orig_params": use_orig_params,
        }
        fsdp_model = TransformerWithSharedParams.init(
@ -408,7 +410,7 @@ class TestFSDPHybridShard(FSDPTest):
            {TransformerEncoderLayer, TransformerDecoderLayer},
        )
        hsdp_kwargs = {
            "device_id": torch.cuda.current_device(),
            "device_id": torch.accelerator.current_device_index(),
            "auto_wrap_policy": auto_wrap_policy,
            "sharding_strategy": hsdp_sharding_strategy,
            "use_orig_params": use_orig_params,
@ -435,7 +437,7 @@ class TestFSDPHybridShard(FSDPTest):
        # Use `FULL_SHARD` for the embedding and output projection
        hsdp_model = FSDP(
            model,
            device_id=torch.cuda.current_device(),
            device_id=torch.accelerator.current_device_index(),
            sharding_strategy=ShardingStrategy.FULL_SHARD,
            use_orig_params=use_orig_params,
        )

@ -36,6 +36,8 @@ if TEST_WITH_DEV_DBG_ASAN:
    )
    sys.exit(0)

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


class Model(torch.nn.Module):
    def __init__(self) -> None:
@ -94,9 +96,9 @@ class ModelWithIgnoredModules(Model):
class TestFSDPIgnoredModules(FSDPTest):
    @property
    def world_size(self):
        return min(torch.cuda.device_count(), 2)
        return min(torch.accelerator.device_count(), 2)

    def _train_model(self, model, optim, num_iters, device=torch.device("cuda")):
    def _train_model(self, model, optim, num_iters, device=torch.device(device_type)):
        for _ in range(num_iters):
            module = model.module if isinstance(model, FSDP) else model
            inp = module.get_input(device)
@ -198,7 +200,7 @@ class TestFSDPIgnoredModules(FSDPTest):
        # Initialize an FSDP-wrapped nested model that first wraps the nested
        # sequential's second linear layer (`layer1[1]`) and then wraps the
        # overall model while ignoring the nested sequential (`layer1`)
        model = Model().cuda()
        model = Model().to(device_type)
        fsdp_fn = functools.partial(FSDP, use_orig_params=use_orig_params)
        model.layer1[1] = fsdp_fn(model.layer1[1])
        if ignore_modules:
@ -246,7 +248,7 @@ class TestFSDPIgnoredModules(FSDPTest):
        )

    def _test_ignored_states_auto_wrap(self, policy, ignore_bias: bool):
        model = Model().cuda()
        model = Model().to(device_type)
        ignored_states = [model.layer1[1].weight]
        if ignore_bias:
            ignored_states.append(model.layer1[1].bias)
@ -285,7 +287,7 @@ class TestFSDPIgnoredModules(FSDPTest):
    def test_ignored_modules_invalid(self):
        """Tests that passing an FSDP module as an ignored module or the
        top-level module itself errors."""
        model = Model().cuda()
        model = Model().to(device_type)
        wrap_cls = FSDP
        model.layer1 = wrap_cls(model.layer1)
        # Passing an FSDP module as an ignored module should error
@ -302,7 +304,7 @@ class TestFSDPIgnoredModules(FSDPTest):
        ):
            # FSDP does not allow to wrap the same model twice, so create
            # a new local model here.
            new_model = Model().cuda()
            new_model = Model().to(device_type)
            wrap_cls(new_model, ignored_modules=[new_model])

    @skip_if_lt_x_gpu(2)
@ -334,7 +336,7 @@ class TestFSDPIgnoredModules(FSDPTest):
        # we wrap `layer3` with FSDP, where `layer3` is registered as a module
        # after `layer1`, which has the variable number of ignored modules
        wrap_cls = FSDP
        model = ModelWithIgnoredModules(num_ignored=self.rank + 1).cuda()
        model = ModelWithIgnoredModules(num_ignored=self.rank + 1).to(device_type)
        layer1_ignored_modules = [
            m for m in model.layer1.modules() if isinstance(m, IgnoredModule)
        ]
@ -370,7 +372,7 @@ class TestFSDPIgnoredModules(FSDPTest):
    @skip_if_lt_x_gpu(2)
    @parametrize("ignore_modules", [True, False])
    def test_ignored_modules_not_under_wrapped_root(self, ignore_modules: bool):
        model = Model().cuda()
        model = Model().to(device_type)
        ignored_modules = list(model.layer1.children())[1:]

        ignore_kwargs = (
@ -409,7 +411,7 @@ class TestFSDPIgnoredModules(FSDPTest):
        )

    def _test_ignored_states_check(self, ignore_modules: bool):
        model = Model().cuda()
        model = Model().to(device_type)
        ignored_modules = list(model.layer1.children())[1:]
        ignored_params = {p for m in ignored_modules for p in m.parameters()}
        ignored_states = ignored_params.union(set(ignored_modules))

@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
    TEST_CUDA,
    TEST_HPU,
    TEST_WITH_DEV_DBG_ASAN,
)
@ -31,11 +32,14 @@ if TEST_WITH_DEV_DBG_ASAN:
    )
    sys.exit(0)

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


def get_cur_mem(rank, result, prefix):
    """Collect memory allocated values in a result dict in MB"""
    torch._C._cuda_clearCublasWorkspaces()
    result[prefix] = round(torch.cuda.memory_allocated() / 1024 / 1024)
    if TEST_CUDA:
        torch._C._cuda_clearCublasWorkspaces()
    result[prefix] = round(torch.accelerator.memory_allocated() / 1024 / 1024)


class Model(nn.Module):
@ -110,14 +114,14 @@ class TestFSDPMemory(FSDPTest):

    def _dist_train(self, with_checkpoint, expected, model_hidden_dim, iterations):
        gpu_id = self.rank
        batch = torch.randn(size=(2, 3, 224, 224)).cuda()
        batch = torch.randn(size=(2, 3, 224, 224)).to(device_type)

        model = create_model(
            with_fsdp=True,
            with_checkpoint=with_checkpoint,
            model_hidden_dim=model_hidden_dim,
        )
        model = model.cuda()
        model = model.to(device_type)
        model = FSDP(model)

        # We enable momentum so that after the first iteration, the optimizer state is added
@ -133,7 +137,7 @@ class TestFSDPMemory(FSDPTest):
            get_cur_mem(gpu_id, results, f"iter {iteration}: after fwd")

            out = sum(o.sum() for o in out[0])
            fake_loss = criterion(out, torch.tensor(0.0).cuda())
            fake_loss = criterion(out, torch.tensor(0.0).to(device_type))
            get_cur_mem(gpu_id, results, f"iter {iteration}: after loss")

            fake_loss.backward()
@ -167,8 +171,8 @@ class TestFSDPMemory(FSDPTest):

        model = create_model(
            with_fsdp=False, with_checkpoint=False, model_hidden_dim=model_hidden_dim
        ).cuda()
        model_size_mb = round(torch.cuda.memory_allocated() / 1024 / 1024)
        ).to(device_type)
        model_size_mb = round(torch.accelerator.memory_allocated() / 1024 / 1024)
        del model

        sharded_model_size_mb = int(model_size_mb / self.world_size)

@ -43,6 +43,8 @@ if TEST_WITH_DEV_DBG_ASAN:
    )
    sys.exit(0)

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


def _reset_params_if_meta(is_meta: bool, model: nn.Module):
    # For torchdistX init, we don't need to call reset_params, as
@ -117,7 +119,7 @@ def _init_with_reset_params(module: nn.Module):
            )
        )
    if has_meta_states:
        device = torch.device("cuda", torch.cuda.current_device())
        device = torch.device(device_type, torch.accelerator.current_device_index())
        module.to_empty(device=device, recurse=False)
        module.reset_parameters()

@ -164,13 +166,13 @@ class TestFSDPWithMetaDevice(FSDPTest):

        # Test to make sure it is the same model parameters as regular FSDP
        # approach.
        regular = MyModel(device="cuda")
        regular = MyModel(device=device_type)
        _reset_params_if_meta(is_meta, regular)
        fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap)
        regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3)

        self._compare_fsdp(fsdp_meta, fsdp_regular)
        inp = torch.randn(10, 2, device="cuda")
        inp = torch.randn(10, 2, device=device_type)
        fsdp_meta(inp).sum().backward()
        fsdp_regular(inp).sum().backward()
        meta_opt.step()
@ -182,7 +184,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
        model = meta_module_fn()
        fsdp_meta = FSDP(model, param_init_fn=init_fn)
        meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3)
        regular = MyModel(device="cuda")
        regular = MyModel(device=device_type)
        _reset_params_if_meta(is_meta, regular)
        fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap)
        regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3)
@ -217,7 +219,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
    )
    def test_simple_model_with_torchdistX_default_init(self):
        def meta_module_fn():
            return deferred_init.deferred_init(MyModel, device="cuda")
            return deferred_init.deferred_init(MyModel, device=device_type)

        self._test_simple_model_with_meta_device(meta_module_fn)

@ -228,7 +230,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
    )
    def test_simple_model_with_torchdistX_init_fn(self):
        def meta_module_fn():
            return deferred_init.deferred_init(MyModel, device="cuda")
            return deferred_init.deferred_init(MyModel, device=device_type)

        self._test_simple_model_with_meta_device(
            meta_module_fn, init_fn=_init_with_torchdistX
@ -248,7 +250,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
            param_init_fn=init_fn,
        )
        meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3)
        module_regular = NestedModel(device="cuda")
        module_regular = NestedModel(device=device_type)
        _reset_params_if_meta(is_meta, module_regular)
        fsdp_regular = FSDP(
            module_regular,
@ -269,7 +271,7 @@ class TestFSDPWithMetaDevice(FSDPTest):

        # Init and reset parameters before wrapping so that reset_params
        # matches up with meta device's initialization.
        module_regular = NestedModel(device="cuda")
        module_regular = NestedModel(device=device_type)
        _reset_params_if_meta(is_meta, module_regular)
        with enable_wrap(wrapper_cls=FSDP):
            module_regular.lin1 = wrap(module_regular.lin1)
@ -279,7 +281,7 @@ class TestFSDPWithMetaDevice(FSDPTest):

        # Compare it before training
        self._compare_fsdp(fsdp_meta, fsdp_regular)
        inp = torch.randn(10, 2, device="cuda")
        inp = torch.randn(10, 2, device=device_type)
        fsdp_meta(inp).sum().backward()
        fsdp_regular(inp).sum().backward()
        meta_opt.step()
@ -317,7 +319,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
    @parametrize("auto_wrap", [True, False])
    def test_nested_model_with_torchdistX_default_init(self, auto_wrap):
        def meta_module_fn():
            return deferred_init.deferred_init(NestedModel, device="cuda")
            return deferred_init.deferred_init(NestedModel, device=device_type)

        self._test_nested_model_with_meta_device(
            auto_wrap=auto_wrap, meta_module_fn=meta_module_fn
@ -331,7 +333,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
    @parametrize("auto_wrap", [True, False])
    def test_nested_model_with_torchdistX_init_fn(self, auto_wrap):
        def meta_module_fn():
            return deferred_init.deferred_init(NestedModel, device="cuda")
            return deferred_init.deferred_init(NestedModel, device=device_type)

        self._test_nested_model_with_meta_device(
            auto_wrap=auto_wrap,
@ -351,7 +353,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
    )
    def test_bad_arg_torchdistx(self):
        def meta_module_fn():
            return deferred_init.deferred_init(NestedModel, "cuda")
            return deferred_init.deferred_init(NestedModel, device_type)

        self._test_bad_arg(meta_module_fn)

@ -401,7 +403,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
            # TODO: `module.to_empty()` is not generally correct for meta
            # device initialization.
            # https://github.com/pytorch/pytorch/issues/90465
            module.to_empty(device=torch.device("cuda"))
            module.to_empty(device=torch.device(device_type))
            module.apply(model._module_init_fn)

        model = Model()
@ -414,7 +416,7 @@ class TestFSDPWithMetaDevice(FSDPTest):
                param_dtype=torch.float32, reduce_dtype=torch.float16
            ),
            param_init_fn=_param_init_fn,
            device_id=torch.cuda.current_device(),
            device_id=torch.accelerator.current_device_index(),
        )


@ -38,7 +38,6 @@ from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
    skipIfRocm,
    TEST_WITH_DEV_DBG_ASAN,
)

@ -514,7 +513,6 @@ class TestFSDPOptimState(FSDPTest):
                continue
            self.assertEqual(full_osd_value, ref_osd_pg[name])

    @skipIfRocm
    @skip_if_lt_x_gpu(2)
    @parametrize("state_dict_type", STATE_DICT_TYPES)
    @parametrize("use_multiple_param_groups", [False, True])

@ -678,6 +678,9 @@ class RingFlexAttentionTest(DTensorTestBase):

    @skip_if_lt_x_gpu(2)
    @with_comms
    @unittest.skipIf(
        not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
    )
    def test_ring_flex_attention(self) -> None:
        self.run_subtests(
            {"qkv_size": [128 * self.world_size, 2048]},
@ -694,6 +697,9 @@ class RingFlexAttentionTest(DTensorTestBase):
    # TODO: merge with the above test
    @skip_if_lt_x_gpu(2)
    @with_comms
    @unittest.skipIf(
        not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
    )
    def test_ring_flex_attention_document_mask(self) -> None:
        random.seed(10)

@ -848,6 +848,30 @@ class DTensorMeshTest(DTensorTestBase):
        self.assertEqual(local_shard.shape, (4, 3))
        self.assertEqual(local_shard, torch.ones(4, 3) + torch.ones(3))

    @with_comms
    def test_vmap_embedding(self):
        mesh = self.build_device_mesh()
        batch_size, seq_len = 2, 6
        output_dim = 32

        indices = torch.zeros(*(batch_size, seq_len), dtype=torch.int64)
        indices[0, 1] = 1
        indices[1, 3] = 1
        indices[1, 5] = 1
        indices = DTensor.from_local(indices, mesh, [Shard(0)])

        emb = torch.randn(
            *(batch_size, 8, output_dim),
            dtype=torch.float32,
        )
        emb = DTensor.from_local(emb, mesh, [Shard(0)])
        result = torch.vmap(F.embedding)(indices, emb)
        expected = [F.embedding(indices[i], emb[i]) for i in range(batch_size)]
        expected = torch.stack(expected)
        local_result = result.to_local()
        local_expected = expected.to_local()
        self.assertEqual(local_result, local_expected)

    @with_comms
    def test_auto_implicit_replication(self):
        mesh = self.build_device_mesh()

@ -131,7 +131,7 @@ class DTensorConstructorTest(DTensorTestBase):

    @with_comms
    def test_zeros_full_mesh(self):
        # construct a cuda device 1d mesh
        # construct a gpu device 1d mesh
        mesh = self.build_device_mesh()
        placements = [Shard(0)]
        size = [32, 3]
@ -157,7 +157,7 @@ class DTensorConstructorTest(DTensorTestBase):
        self.assertEqual(local_tensor.size(), torch.Size([7, 3]))
        self.assertEqual(torch.zeros(7, 3), local_tensor)

        # construct a cuda device mesh with 2d: shard, replicate
        # construct a gpu device mesh with 2d: shard, replicate
        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size).reshape(2, 2))
        placements = [Shard(0), Replicate()]
        size = [32, 4]
@ -168,7 +168,7 @@ class DTensorConstructorTest(DTensorTestBase):
        self.assertEqual(local_tensor.size(), torch.Size([16, 4]))
        self.assertEqual(local_tensor, torch.zeros([16, 4]))

        # construct a cuda device mesh with 2d: shard, shard
        # construct a gpu device mesh with 2d: shard, shard
        placements = [Shard(0), Shard(1)]
        size = [32, 4]
        dist_tensor = zeros(size, device_mesh=mesh, placements=placements)
@ -197,7 +197,7 @@ class DTensorConstructorTest(DTensorTestBase):
    @with_comms
    def test_zeros_submesh(self):
        # default world_size is 4
        # construct a cuda device 1d mesh, with no sub pg initialized
        # construct a gpu device 1d mesh, with no sub pg initialized
        sub_mesh_list = [0, 3]
        mesh = DeviceMesh(self.device_type, sub_mesh_list)
        placements = [Shard(0)]
@ -213,7 +213,7 @@ class DTensorConstructorTest(DTensorTestBase):
        self.assertEqual(local_tensor.size(), torch.Size([0]))
        self.assertEqual(local_tensor, torch.zeros(0))

        # construct a cuda device 1d mesh: unevenly, with subpg initialized
        # construct a gpu device 1d mesh: unevenly, with subpg initialized
        sub_mesh_list = [0, 1, 3]
        mesh = DeviceMesh(self.device_type, sub_mesh_list)
        placements = [Shard(0)]
@ -233,7 +233,7 @@ class DTensorConstructorTest(DTensorTestBase):
        self.assertEqual(local_tensor.size(), torch.Size([0]))
        self.assertEqual(local_tensor, torch.tensor([]))

        # construct a cuda device 2d mesh, with no subpg initialized
        # construct a gpu device 2d mesh, with no subpg initialized
        sub_mesh_list = [[0], [3]]
        mesh = DeviceMesh(self.device_type, sub_mesh_list)
        placements = [Shard(0), Shard(1)]

@ -24,7 +24,7 @@ from torch.distributed.tensor.parallel import (
    RowwiseParallel,
    SequenceParallel,
)
from torch.testing._internal.common_utils import run_tests, skipIfRocm
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
    DTensorTestBase,
    skip_unless_torch_gpu,
@ -695,7 +695,6 @@ class DistMathOpsTest(DTensorTestBase):
        self.assertEqual(grad1_norm.device_mesh, mesh_y)

    @with_comms
    @skipIfRocm
    def test_foreach_add_different_mesh(self):
        mesh_shape = (2, self.world_size // 2)
        mesh_2d = init_device_mesh(

@ -44,7 +44,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
shard_spec = [Shard(0)]
input_size = (8, 4)

# NOTE: currently random initialization on cuda device has different
# NOTE: currently random initialization on gpu device has different
# behavior from other devices. Unify the test once the behavior is unified.
if not is_rng_supported_mesh(device_mesh):
input_tensor = torch.randn(*input_size, device=self.device_type)
@ -97,7 +97,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
def test_init_with_user_generator(self):
device_mesh = self.build_device_mesh()
torch.manual_seed(42)
rng = torch.Generator(device="cuda").manual_seed(42)
rng = torch.Generator(device=self.device_type).manual_seed(42)
t1 = torch.distributed.tensor.empty(
(8, 3), device_mesh=device_mesh, placements=[Shard(0)]
)
@ -126,7 +126,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
# The DTensor random ops will use the same generator as the default one on the device.

# Note: this behavior changed, and now the guideline is to set the same RNG seed on all SPMD ranks.
torch.cuda.manual_seed(0)
torch.get_device_module(self.device_type).manual_seed(0)
device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
size = [1024, 2048]
meta_dtensor = distribute_tensor(
@ -592,8 +592,8 @@ class DistTensorRandomOpsTest3D(DTensorTestBase):
def world_size(self):
return 8

@with_comms
@skip_if_lt_x_gpu(8)
@with_comms
def test_hsdp_tp_model_meta_init(self):
# initialize the 3-d device mesh
global_mesh = init_device_mesh(

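A short sketch (assumptions noted in comments) of the device-agnostic RNG calls these hunks switch to; torch.get_device_module maps a device-type string to the matching backend module, so one seeding call serves every accelerator.

import torch

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
if device_type != "cpu":
    # torch.get_device_module("cuda") is torch.cuda; for XPU it is torch.xpu.
    torch.get_device_module(device_type).manual_seed(0)
# torch.Generator accepts any device-type string, including "cpu".
rng = torch.Generator(device=device_type).manual_seed(42)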
@ -43,6 +43,7 @@ from torch.testing._internal.common_utils import (
retry_on_connect_failures,
run_tests,
TEST_WITH_DEV_DBG_ASAN,
TEST_XPU,
TestCase,
)
from torch.utils.checkpoint import checkpoint
@ -63,6 +64,8 @@ else:

torch.backends.cuda.matmul.allow_tf32 = False

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


def gpus_for_rank(world_size):
"""Multigpu tests are designed to simulate the multi nodes with multi
@ -70,8 +73,9 @@ def gpus_for_rank(world_size):
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
device_count = torch.accelerator.device_count()
visible_devices = list(range(device_count))
gpus_per_process = device_count // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
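A sketch reconstructing the full device-agnostic helper from the hunk above (the hunk truncates the function body, so the returned slicing is an assumption based on the visible logic); the only substantive change is counting devices via torch.accelerator instead of torch.cuda.

import torch

def gpus_for_rank(world_size):
    # Evenly split all visible accelerator devices across the ranks.
    device_count = torch.accelerator.device_count()
    visible_devices = list(range(device_count))
    gpus_per_process = device_count // world_size
    return [
        visible_devices[rank * gpus_per_process : (rank + 1) * gpus_per_process]
        for rank in range(world_size)
    ]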
@ -293,6 +297,23 @@ class ConvNet(nn.Module):
return self.conv3(x)


# A model involving FFTs, used to test DDP with complex tensors
class FFTModel(nn.Module):
def __init__(self, hin, win, n_features):
super().__init__()
self.hin = hin
self.win = win
self.weight = nn.Parameter(
torch.ones((n_features, n_features, hin, win // 2 + 1), dtype=torch.cfloat)
)

def forward(self, x):
xc = torch.fft.rfft2(x, s=(self.hin, self.win), dim=(-2, -1), norm="ortho")
xcw = torch.einsum("nchw,cohw->nohw", xc, self.weight)
x = torch.fft.irfft2(xcw, dim=(-2, -1), norm="ortho")
return x


class Task(nn.Module):
def __init__(self) -> None:
super().__init__()
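A quick single-process sketch exercising FFTModel as defined in the hunk above (shapes follow the N, C, H, W = 1, 16, 64, 64 the DDP tests use); a real-valued loss over the irfft2 output still produces a gradient for the complex-valued weight.

import torch

model = FFTModel(hin=64, win=64, n_features=16)  # class from the hunk above
out = model(torch.ones((1, 16, 64, 64)))
out.sum().backward()
assert model.weight.grad is not None and model.weight.grad.dtype == torch.cfloat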
@ -384,7 +405,7 @@ class CommonDistributedDataParallelTest:
gradient_as_bucket_view=gradient_as_bucket_view,
)

input = torch.randn(global_batch_size, 2).cuda(devices[0])
input = torch.randn(global_batch_size, 2).to(devices[0])
target = torch.randn(global_batch_size, 4)

return model, ddp_model, input, target
@ -418,10 +439,10 @@ class CommonDistributedDataParallelTest:
allow_none_grads=False,
):
# to reproduce the same training results
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
torch.manual_seed(31415)
model = copy.deepcopy(input_model).cuda()
ddp_model = copy.deepcopy(input_model).cuda()
model = copy.deepcopy(input_model).to(device_type)
ddp_model = copy.deepcopy(input_model).to(device_type)
ddp_model = nn.parallel.DistributedDataParallel(
ddp_model,
bucket_cap_mb=1,
@ -537,8 +558,8 @@ class CommonDistributedDataParallelTest:
def _prepare_dummy_data(self):
ddp_bs = 16
bs = ddp_bs * self.world_size
input = torch.rand((bs, 20), device="cuda", requires_grad=True)
target = torch.randn((bs, 20), device="cuda")
input = torch.rand((bs, 20), device=device_type, requires_grad=True)
target = torch.randn((bs, 20), device=device_type)
offset = self.rank * ddp_bs
ddp_input = input[offset : offset + ddp_bs]
ddp_target = target[offset : offset + ddp_bs]
@ -698,7 +719,7 @@ class CommonDistributedDataParallelTest:
Test that checkpointing with weight sharing works.
"""
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
for use_bucket_view, static_graph in product((False, True), (False, True)):
torch.manual_seed(31415)
l1 = nn.Linear(20, 20)
@ -721,7 +742,7 @@ class CommonDistributedDataParallelTest:
same layer twice and having weights shared across layers.
"""
process_group = self._get_process_group()
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
for use_bucket_view in (True, False):
self._test_ddp_checkpointing(
self.CheckpointTwiceModuleWeightSharing(),
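A tiny sketch of the per-rank pinning these hunks standardize on; torch.accelerator.set_device_index is the backend-neutral spelling of torch.cuda.set_device, and the guard keeps it safe on CPU-only hosts.

import torch

rank = 0  # illustrative; the tests use self.rank here
if torch.accelerator.is_available():
    torch.accelerator.set_device_index(rank)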
@ -1145,7 +1166,7 @@ class AbstractCommTest:

# Verify sequence numbers are appropriately incremented
for i in range(10):
t = torch.ones(1, device=torch.cuda.current_device())
t = torch.ones(1, device=device_type)
dist.all_reduce(t, group=process_group)
if not c10d._rank_not_in_group(process_group):
seq_num = self._verify_sequence_number_across_pg(
@ -1176,7 +1197,7 @@ class AbstractCommTest:
self.assertEqual(rank_to_seq_num[0] + 1, rank_to_seq_num[1])

def _test_sequence_num_incremented_default_group(self, backend_name):
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend_name,
@ -1190,7 +1211,7 @@ class AbstractCommTest:
)

def _test_sequence_num_incremented_subgroup(self, backend_name):
torch.cuda.set_device(self.rank)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend_name,
@ -1245,8 +1266,8 @@ class AbstractCommTest:
in_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size)))
group = dist.new_group(in_group_ranks)

x = torch.zeros(2, 2).cuda(self.rank)
xs = [torch.zeros(2, 2).cuda(self.rank) for _ in range(len(in_group_ranks))]
x = torch.zeros(2, 2).to(self.rank)
xs = [torch.zeros(2, 2).to(self.rank) for _ in range(len(in_group_ranks))]
if self.rank not in in_group_ranks:
msg = ".*{}.*does not belong to.*"
with self.assertWarnsOnceRegex(UserWarning, msg.format("all_gather")):
@ -1375,7 +1396,7 @@ class AbstractCommTest:
rank=self.rank,
store=store,
)
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# test alltoall_base
tensor = torch.tensor([1, 0, 0, 1], dtype=torch.bool, device=device)
zeros = torch.tensor([0, 0, 0, 0], dtype=torch.bool, device=device)
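The backend-to-device ternary above recurs throughout this file; a sketch of the mapping it encodes: nccl maps to cuda, xccl to xpu, and everything else falls back to cpu.

def device_for_backend(backend: str) -> str:
    # nccl is CUDA-only and xccl is the XPU collective backend.
    return "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"

assert device_for_backend("nccl") == "cuda"
assert device_for_backend("xccl") == "xpu"
assert device_for_backend("gloo") == "cpu"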
@ -1557,8 +1578,8 @@ class CommTest(AbstractCommTest, MultiProcessTestCase):

class DummyWork(dist._Work):
def wait(self, timeout=5.0):
if torch.cuda.is_available():
torch.cuda.current_stream().synchronize()
if torch.accelerator.is_available():
torch.accelerator.current_stream().synchronize()
return True

@ -1773,6 +1794,18 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
("cpu:gloo,cuda:nccl", "cpu:gloo,cuda:nccl"),
]

if TEST_XPU:
# Override backend_config_strings_and_expected_values for Intel GPU.
backend_config_strings_and_expected_values[4:10] = [
(dist.Backend.DUMMY, "cpu:dummy,cuda:dummy,xpu:dummy"),
("DUMMY", "cpu:dummy,cuda:dummy,xpu:dummy"),
("dummy", "cpu:dummy,cuda:dummy,xpu:dummy"),
("cpu:dummy,xpu:dummy", "cpu:dummy,xpu:dummy"),
("cpu:dummy,xpu:xccl", "cpu:dummy,xpu:xccl"),
("cpu:gloo,xpu:dummy", "cpu:gloo,xpu:dummy"),
("cpu:gloo,xpu:xccl", "cpu:gloo,xpu:xccl"),
]

for config_str, expected_value in backend_config_strings_and_expected_values:
with self.subTest(config_str):
# ensures these configs strings are valid and no ValueError is raised
@ -1783,6 +1816,8 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
invalid_backend_config_strings = [
"cpu:gloo,cuda:nccl,", # trailing comma
"cpu:gloo,cuda:nccl,cpu:dummy", # duplicate device
"cpu:gloo,xpu:xccl,", # trailing comma
"cpu:gloo,xpu:xccl,cpu:dummy", # duplicate device
]
for config_str in invalid_backend_config_strings:
with self.subTest(config_str):
@ -1797,7 +1832,7 @@ class PythonProcessGroupExtensionTest(MultiProcessTestCase):
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "6789"
dist.init_process_group(
"cpu:dummy,cuda:dummy", rank=self.rank, world_size=self.world_size
"cpu:dummy,cuda:dummy,xpu:dummy", rank=self.rank, world_size=self.world_size
)

# test all_gather
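A minimal single-rank sketch of the "device:backend" config strings these tests validate; a comma-separated list pins one backend per device type, while a bare backend name (e.g. "dummy" above) expands to every device type that backend supports.

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")  # port chosen arbitrarily
# One process group, gloo pinned to the cpu device type.
dist.init_process_group("cpu:gloo", rank=0, world_size=1)
dist.all_reduce(torch.ones(2))  # dispatches to gloo for cpu tensors
dist.destroy_process_group()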
@ -2036,7 +2071,7 @@ dist.init_process_group(rank=0, world_size=1, store=dist.HashStore())
# correctly dispatched

# TODO: this will be updated in the future to not be backend specific
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# ensure supported devices (cpu, cuda) succeeds during dispatch call
tensor = torch.zeros(2, 2, device=torch.device(device))
# multi tensor collectives
@ -2102,7 +2137,7 @@ dist.init_process_group(rank=0, world_size=1, store=dist.HashStore())
rank=self.rank,
store=store,
)
device = "cuda" if backend == "nccl" else "cpu"
device = "cuda" if backend == "nccl" else "xpu" if backend == "xccl" else "cpu"
# test alltoall_base
input_tensor = torch.ones(2, 2, device=torch.device(device))
output_tensor = torch.zeros(2, 2, device=torch.device(device))
@ -2234,8 +2269,9 @@ class LocalRankTest(MultiProcessTestCase):


if __name__ == "__main__":
assert not torch.cuda._initialized, (
"test_distributed must not have initialized CUDA context on main process"
)
if device_type != "cpu":
assert not torch.get_device_module()._initialized, (
f"test_distributed must not have initialized {device_type} context on main process"
)

run_tests()

@ -24,7 +24,7 @@ from torch.distributed._functional_collectives import (
from torch.testing._internal.common_cuda import SM90OrLater
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
requires_nccl,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import ( # type: ignore[attr-defined]
@ -59,7 +59,7 @@ if not dist.is_available():
sys.exit(0)


@requires_nccl()
@requires_accelerator_dist_backend(["nccl", "xccl"])
class TestWithNCCL(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
@ -75,13 +75,15 @@ class TestWithNCCL(MultiProcessTestCase):

@property
def device(self) -> torch.device:
return torch.device(f"cuda:{self.rank}")
return torch.device(self.rank)

def _init_process_group(self) -> None:
torch.cuda.set_device(self.device)
torch.accelerator.set_device_idx(self.device.index)
store = dist.FileStore(self.file_name, self.world_size)
backend = dist.get_default_backend_for_device(self.device.type)

dist.init_process_group(
backend="nccl",
backend=backend,
world_size=self.world_size,
rank=self.rank,
store=store,
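A sketch of the backend resolution adopted here; dist.get_default_backend_for_device picks the canonical collective backend for a device type ("nccl" for cuda, "xccl" for xpu, "gloo" for cpu), so the test class no longer hard-codes "nccl".

import torch
import torch.distributed as dist

device = torch.accelerator.current_accelerator() or torch.device("cpu")
backend = dist.get_default_backend_for_device(device.type)
# e.g. backend == "nccl" on a CUDA box, "xccl" on XPU, "gloo" on CPU-only.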
@ -273,7 +275,7 @@ class TestWithNCCL(MultiProcessTestCase):
)
# check memory leak
for i in range(1, 10):
mem_usage[i] = torch.cuda.max_memory_allocated()
mem_usage[i] = torch.accelerator.max_memory_allocated()
compiled(arg)

assert mem_usage[9] == mem_usage[8]
@ -370,14 +372,16 @@ class TestWithNCCL(MultiProcessTestCase):
@skip_if_lt_x_gpu(2)
def test_all_to_all_single(self) -> None:
self._init_process_group()
torch.cuda.set_device(self.device)
torch.accelerator.set_device_index(self.rank)

torch.manual_seed(42)
send_sz_matrix = torch.randint(0, 20, (self.world_size, self.world_size))

input_split_sizes = send_sz_matrix[self.rank].tolist()
output_split_sizes = send_sz_matrix[:, self.rank].tolist()
input = torch.full((sum(input_split_sizes),), float(self.rank)).cuda()
input = torch.full((sum(input_split_sizes),), float(self.rank)).to(
self.device.type
)

output = torch.ops._c10d_functional.all_to_all_single(
input,
@ -388,7 +392,7 @@ class TestWithNCCL(MultiProcessTestCase):
output = torch.ops._c10d_functional.wait_tensor(output)
expect = torch.cat(
[
torch.full((sz,), float(rank)).cuda()
torch.full((sz,), float(rank)).to(self.device.type)
for rank, sz in enumerate(output_split_sizes)
]
)
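A CPU-only sketch of the split-size bookkeeping behind test_all_to_all_single: row r of the random matrix is what rank r sends to each peer, and column r is what every peer sends to rank r, so the two lists always describe matching buffers.

import torch

world_size, rank = 4, 1  # illustrative values; the test derives them per process
torch.manual_seed(42)
send_sz_matrix = torch.randint(0, 20, (world_size, world_size))
input_split_sizes = send_sz_matrix[rank].tolist()       # what this rank sends
output_split_sizes = send_sz_matrix[:, rank].tolist()   # what this rank receives
assert input_split_sizes[rank] == output_split_sizes[rank]  # diagonal element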
@ -464,7 +468,7 @@ class TestWithNCCL(MultiProcessTestCase):
@fresh_cache()
def test_threading(self):
self._init_process_group()
device = torch.device(f"cuda:{self.rank}")
device = self.device

def func(arg: torch.Tensor) -> torch.Tensor:
buf0 = arg + 42
@ -546,9 +550,9 @@ class TestWithNCCL(MultiProcessTestCase):
return in_grad, w_grad

m, n, k = 128, 256, 64
in_ = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
w = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
out_grad = torch.randn((m, n), device="cuda", dtype=torch.bfloat16)
in_ = torch.randn((m, k), device=self.device.type, dtype=torch.bfloat16)
w = torch.randn((n, k), device=self.device.type, dtype=torch.bfloat16)
out_grad = torch.randn((m, n), device=self.device.type, dtype=torch.bfloat16)

eager_in_grad, eager_w_grad = fp8_rowwise_backward(in_, w, out_grad)
compile_in_grad, compile_w_grad = torch.compile(fp8_rowwise_backward)(
@ -777,7 +781,8 @@ class CompileTest(TestCase):

self.rank = 0
self.world_size = 2
torch.cuda.set_device("cuda:0")
torch.accelerator.set_device_index(0)
self.device = torch.accelerator.current_accelerator()

store = FakeStore()
dist.init_process_group(
@ -803,7 +808,7 @@ class CompileTest(TestCase):
ar1 = funcol.wait_tensor(ar1)
return ar0, ar1

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device)
compiled = torch.compile(func)

code = run_and_get_triton_code(compiled, arg)
@ -836,7 +841,7 @@ class CompileTest(TestCase):

# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -851,7 +856,7 @@ class CompileTest(TestCase):
ar1 = [funcol.wait_tensor(out) for out in ar1]
return ar0, ar1

args = [torch.rand(4, 4, device="cuda") for _ in range(2)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(2)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
buf0, buf1, buf2, buf3 = find_buffer_assignments(code)
@ -881,7 +886,7 @@ class CompileTest(TestCase):

# Test aoti
out = AOTIRunnerUtil.run(func, (args,)) # noqa: F841
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -892,7 +897,7 @@ class CompileTest(TestCase):
ar0 = funcol.wait_tensor(ar0)
return ar0

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)

code = run_and_get_triton_code(compiled, arg)
@ -917,7 +922,7 @@ class CompileTest(TestCase):
# Expect allocation
return ar0

arg = torch.rand(4, 4, device="cuda").T
arg = torch.rand(4, 4, device=self.device.type).T
compiled = torch.compile(func)

code = run_and_get_triton_code(compiled, arg)
@ -948,7 +953,7 @@ class CompileTest(TestCase):
buf2 = torch.mm(arg, buf1)
return buf1, buf2

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
buf0, buf1 = find_buffer_assignments(code)
@ -978,7 +983,7 @@ class CompileTest(TestCase):
ag0 = funcol.wait_tensor(ag0)
return ag0

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -995,7 +1000,7 @@ class CompileTest(TestCase):

# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1005,7 +1010,7 @@ class CompileTest(TestCase):
ag0 = [funcol.wait_tensor(out) for out in ag0]
return ag0

args = [torch.rand(4, 4, device="cuda") for _ in range(4)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(4)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
(
@ -1029,7 +1034,7 @@ class CompileTest(TestCase):

# Test aoti
out = AOTIRunnerUtil.run(func, (args,)) # noqa: F841
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "This is a GPU test!")
@fresh_cache()
@ -1039,7 +1044,7 @@ class CompileTest(TestCase):
return funcol.wait_tensor(t)

# Test aoti
arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -1051,7 +1056,7 @@ class CompileTest(TestCase):

# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1061,7 +1066,7 @@ class CompileTest(TestCase):
rs0 = funcol.wait_tensor(rs0)
return rs0

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, arg)
(
@ -1077,7 +1082,7 @@ class CompileTest(TestCase):

# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1089,7 +1094,7 @@ class CompileTest(TestCase):
rs0 = [funcol.wait_tensor(out) for out in rs0]
return rs0

args = [torch.rand(4, 4, device="cuda") for _ in range(4)]
args = [torch.rand(4, 4, device=self.device.type) for _ in range(4)]
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, args)
(
@ -1113,7 +1118,7 @@ class CompileTest(TestCase):

# Test aoti
AOTIRunnerUtil.run(func, (args,))
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1142,7 +1147,9 @@ class CompileTest(TestCase):

input_split_sizes = send_sz_matrix[self.rank]
output_split_sizes = send_sz_matrix[:, self.rank].contiguous()
input = torch.full((input_split_sizes.sum().item(),), float(self.rank)).cuda()
input = torch.full((input_split_sizes.sum().item(),), float(self.rank)).to(
self.device.type
)

with torch._dynamo.config.patch(
dynamic_shapes=True,
@ -1176,7 +1183,7 @@ class CompileTest(TestCase):
br1 = funcol.wait_tensor(br1)
return br0, br1

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func)

code = run_and_get_triton_code(compiled, arg)
@ -1199,7 +1206,7 @@ class CompileTest(TestCase):

# Test aoti
AOTIRunnerUtil.run(func, (arg,))
torch.cuda.synchronize()
torch.accelerator.synchronize()

@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@fresh_cache()
@ -1214,7 +1221,7 @@ class CompileTest(TestCase):
ar1 = funcol.wait_tensor(ar1)
return ar0, ar1

arg = torch.rand(4, 4, device="cuda")
arg = torch.rand(4, 4, device=self.device.type)
compiled = torch.compile(func, fullgraph=True)

code = run_and_get_triton_code(compiled, arg)

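A compact sketch of the eager pattern every CompileTest case above compiles: issue an async functional collective, then materialize it with wait_tensor. The single-rank FakeStore setup and the "0" group name mirror the test fixture; treat the exact eager behavior under the fake backend as an assumption.

import torch
import torch.distributed as dist
import torch.distributed._functional_collectives as funcol
from torch.testing._internal.distributed.fake_pg import FakeStore  # test-internal helper

dist.init_process_group(backend="fake", rank=0, world_size=2, store=FakeStore())
device = torch.accelerator.current_accelerator() or torch.device("cpu")

def func(arg: torch.Tensor) -> torch.Tensor:
    ar = funcol.all_reduce(arg, "avg", "0")  # async; returns an AsyncCollectiveTensor
    return funcol.wait_tensor(ar)            # force completion before use

out = func(torch.rand(4, 4, device=device.type))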
@ -25,6 +25,7 @@ if not c10d.is_available() or not c10d.is_gloo_available():

import test_c10d_common
from test_c10d_common import (
FFTModel,
gpus_for_rank,
LOOPBACK,
ModuleForDdpCommHook,
@ -134,6 +135,32 @@ def simple_reduce_tests(rank, world_size):
),
)

# Extend tests for cfloat dtype
tests.extend(
(
(
c10d.ReduceOp.SUM,
torch.tensor([complex(rank + 1.0, rank + 1.0)], dtype=torch.cfloat),
torch.tensor(
[
complex(
world_size * (world_size + 1) / 2,
world_size * (world_size + 1) / 2,
)
],
dtype=torch.cfloat,
),
),
(
c10d.ReduceOp.AVG,
torch.tensor([complex(rank + 1.0, rank + 1.0)], dtype=torch.cfloat),
torch.tensor(
[complex(float((world_size + 1) / 2), float((world_size + 1) / 2))],
dtype=torch.cfloat,
),
),
)
)
return tests

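A worked check (plain Python, no process group needed) of the cfloat expectations added above: summing complex(r + 1, r + 1) over ranks r = 0 .. world_size - 1 yields the triangular number world_size * (world_size + 1) / 2 in both components, and AVG divides that by world_size.

world_size = 4  # illustrative; the tests take it from the fixture
total = sum(complex(r + 1.0, r + 1.0) for r in range(world_size))
assert total == complex(world_size * (world_size + 1) / 2,
                        world_size * (world_size + 1) / 2)
assert total / world_size == complex((world_size + 1) / 2, (world_size + 1) / 2)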
@ -373,6 +400,13 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
torch.tensor([i * num + j], dtype=torch.float32), output[1]
)

# Run with 1 input tensor of cfloat dtype
x = fn(torch.tensor([complex(self.rank, self.rank)], dtype=torch.cfloat))
output = broadcast([x], i, 0)
self.assertEqual(
torch.tensor([complex(i, i)], dtype=torch.cfloat), output[0]
)

# Test overloaded convenience function
x = torch.tensor([self.rank + 1.0])
fut = pg.broadcast(x, root=0).get_future()
@ -1605,6 +1639,22 @@ class ProcessGroupGlooTest(MultiProcessTestCase):

work.wait()

@requires_gloo()
def test_send_recv_complex(self):
store = c10d.FileStore(self.file_name, self.world_size)
pg = self._create_process_group_gloo(
store, self.rank, self.world_size, self.opts()
)
# Generate the same random tensor
torch.manual_seed(0)
send_tensor = torch.rand(10, 10, dtype=torch.cfloat)
if self.rank == 0:
pg.send([send_tensor], 1, 0).wait()
if self.rank == 1:
recv_tensor = torch.rand(10, 10, dtype=torch.cfloat)
pg.recv([recv_tensor], 0, 0).wait()
self.assertEqual(send_tensor, recv_tensor)


class DistributedDataParallelTest(
test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
@ -2270,6 +2320,24 @@ class DistributedDataParallelTest(

self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)

@requires_gloo()
def test_ddp_complex_params(self):
process_group = self._get_process_group()
N, C, H, W = 1, 16, 64, 64
ddp_model = DistributedDataParallel(
FFTModel(hin=H, win=W, n_features=C),
process_group=process_group,
)
optimizer = torch.optim.Adam(ddp_model.parameters(), lr=0.001)

inp = torch.ones((N, C, H, W), dtype=torch.float32)

# train step
out = ddp_model(inp)
loss = torch.sum(out)
loss.backward()
optimizer.step()


class ReducerModule(nn.Module):
def __init__(self) -> None:

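The send/recv pairing in test_send_recv_complex leans on seeded determinism; a local sketch of why the final assertEqual holds: both ranks seed identically before the first draw, so rank 1's local send_tensor already equals the tensor rank 0 transmits.

import torch

torch.manual_seed(0)
a = torch.rand(10, 10, dtype=torch.cfloat)  # rank 0's first draw
torch.manual_seed(0)
b = torch.rand(10, 10, dtype=torch.cfloat)  # rank 1's first draw
assert torch.equal(a, b)  # identical seed, identical cfloat tensor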
@ -29,7 +29,13 @@ if not c10d.is_available() or not c10d.is_nccl_available():


import test_c10d_common
from test_c10d_common import ConvNet, DoubleGpuNet, gpus_for_rank, ModuleForDdpCommHook
from test_c10d_common import (
ConvNet,
DoubleGpuNet,
FFTModel,
gpus_for_rank,
ModuleForDdpCommHook,
)

import torch.distributed as dist
import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default
@ -2552,25 +2558,6 @@ class DistributedDataParallelTest(
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_complex_params(self):
class FFTModel(nn.Module):
def __init__(self, hin, win, n_features):
super().__init__()
self.hin = hin
self.win = win
self.weight = nn.Parameter(
torch.ones(
(n_features, n_features, hin, win // 2 + 1), dtype=torch.cfloat
)
)

def forward(self, x):
xc = torch.fft.rfft2(
x, s=(self.hin, self.win), dim=(-2, -1), norm="ortho"
)
xcw = torch.einsum("nchw,cohw->nohw", xc, self.weight)
x = torch.fft.irfft2(xcw, dim=(-2, -1), norm="ortho")
return x

process_group = self._get_process_group()
device_id = gpus_for_rank(self.world_size)[self.rank][0]
N, C, H, W = 1, 16, 64, 64