Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-12 14:54:55 +08:00)

Compare commits (1 commit): eqy-patch-... csl/remove

| Author | SHA1 | Date |
|---|---|---|
|  | 37cfd41f0d |  |
@@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8

ARG DEVTOOLSET_VERSION=13
ARG DEVTOOLSET_VERSION=11

RUN yum -y update
RUN yum -y install epel-release
# install glibc-langpack-en make sure en_US.UTF-8 locale is available
RUN yum -y install glibc-langpack-en
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
# Just add everything as a safe.directory for git since these will be used in multiple places with git
RUN git config --global --add safe.directory '*'
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@@ -41,7 +41,6 @@ RUN bash ./install_conda.sh && rm install_conda.sh
# Install CUDA
FROM base as cuda
ARG CUDA_VERSION=12.6
ARG DEVTOOLSET_VERSION=13
RUN rm -rf /usr/local/cuda-*
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
@@ -51,8 +50,7 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
# Preserve CUDA_VERSION for the builds
ENV CUDA_VERSION=${CUDA_VERSION}
# Make things in our path by default
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH

ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH

FROM cuda as cuda12.6
RUN bash ./install_cuda.sh 12.6
@@ -70,22 +68,8 @@ FROM cuda as cuda13.0
RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0

FROM ${ROCM_IMAGE} as rocm_base
ARG DEVTOOLSET_VERSION=13
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
# Install devtoolset on ROCm base image
RUN yum -y update && \
yum -y install epel-release && \
yum -y install glibc-langpack-en && \
yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
RUN git config --global --add safe.directory '*'
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH

FROM rocm_base as rocm
FROM ${ROCM_IMAGE} as rocm
ARG PYTORCH_ROCM_ARCH
ARG DEVTOOLSET_VERSION=13
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
@@ -104,7 +88,6 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0

# Final step
FROM ${BASE_TARGET} as final
ARG DEVTOOLSET_VERSION=13
COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=patchelf /patchelf /usr/local/bin/patchelf
COPY --from=conda /opt/conda /opt/conda

@@ -63,7 +63,7 @@ docker build \
--target final \
--progress plain \
--build-arg "BASE_TARGET=${BASE_TARGET}" \
--build-arg "DEVTOOLSET_VERSION=13" \
--build-arg "DEVTOOLSET_VERSION=11" \
${EXTRA_BUILD_ARGS} \
-t ${tmp_tag} \
$@ \
@@ -168,18 +168,6 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=no
TRITON=no
;;
pytorch-linux-jammy-py3.12-clang12)
ANACONDA_PYTHON_VERSION=3.12
CLANG_VERSION=12
VISION=no
TRITON=no
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
@@ -207,9 +195,9 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=13
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
@@ -273,9 +261,9 @@ case "$tag" in
PYTHON_VERSION=3.10
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc13)
pytorch-linux-jammy-aarch64-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=13
GCC_VERSION=11
ACL=yes
VISION=yes
OPENBLAS=yes
@@ -293,9 +281,9 @@ case "$tag" in
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks)
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=13
GCC_VERSION=11
ACL=yes
VISION=yes
OPENBLAS=yes

@@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then
# Need the official toolchain repo to get alternate packages
add-apt-repository ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION
apt-get install -y g++-$GCC_VERSION
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50

# Cleanup package manager
apt-get autoclean && apt-get clean
@@ -1,56 +0,0 @@
#!/bin/bash
# Script used only in CD pipeline

set -ex

# install dependencies
dnf -y install gmp-devel libmpc-devel texinfo flex bison

cd /usr/local/src
# fetch source for gcc 13
git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0

mkdir -p gcc-13.3.0/build-gomp
cd gcc-13.3.0/build-gomp

# configure gcc build
# I got these flags by:
# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
# dnf install -y dnf-plugins-core rpmdevtools
# dnf download --source libgomp
# 2. extracting the gcc.spec from the source.
# rpmdev-extract gcc-xx.src.rpm
# 3. extracting optflags and ld_flags from gcc.spec:
# rpm --eval '%{optflags}'
# rpm --eval '%{build_ldflags}'
#
# I had to remove the following flags because they didn't compile for this version of libgomp:
# -Werror=format-security
# -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
# -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
#
# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.

OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
' -fstack-protector-strong -fasynchronous-unwind-tables'\
' -fstack-clash-protection'

LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'

CFLAGS="$OPT_FLAGS" \
CXXFLAGS="$OPT_FLAGS" \
LDFLAGS="$LDFLAGS" \
../configure \
--prefix=/usr \
--libdir=/usr/lib64 \
--enable-languages=c,c++ \
--disable-multilib \
--disable-bootstrap \
--enable-libgomp

# only build libgomp
make -j$(nproc) all-target-libgomp

make install-target-libgomp
@@ -9,7 +9,7 @@ set -xe

function install_ubuntu() {
. /etc/os-release
if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
echo "Ubuntu version ${VERSION_CODENAME} not supported"
exit
fi
@@ -35,24 +35,25 @@ function install_ubuntu() {
# The xpu-smi packages
apt-get install -y flex bison xpu-smi

# Compute and Media Runtimes
if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
else # jammy
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
else # rolling driver
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
fi
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev

# Install Intel Support Packages
apt-get install -y ${XPU_PACKAGES}
@@ -65,7 +66,7 @@ function install_ubuntu() {

function install_rhel() {
. /etc/os-release
if [[ "${ID}" == "rhel" ]]; then
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
echo "RHEL version ${VERSION_ID} not supported"
exit
fi
@@ -146,7 +147,7 @@ function install_sles() {

XPU_DRIVER_VERSION=""
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2523"
XPU_DRIVER_VERSION="/lts/2350"
fi

# Default use Intel® oneAPI Deep Learning Essentials 2025.1
@@ -50,10 +50,6 @@ RUN rm install_ninja.sh
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

# Build a newer version of libgomp than that supported in in Almalinux 8.
COPY ./common/install_libgomp.sh install_libgomp.sh
RUN bash ./install_libgomp.sh && rm install_libgomp.sh

# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe
@@ -1,11 +1,15 @@
sphinx==7.2.6
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0

pytorch_sphinx_theme2==0.2.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.2.0
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.

-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
@@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0

breathe==4.36.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.36.0
#Pinned versions: 4.34.0

exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3

docutils==0.20
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.20
#Pinned versions: 0.16

bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@@ -52,13 +56,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0

myst-nb==1.3.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 1.3.0
#Pinned versions: 0.17.2

# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.6.1
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==4.0.1
myst-parser==0.18.1
@@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then

make coverage
# Now we have the coverage report, we need to make sure it is empty.
# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
# showing the undocumented count in the third column.
# Example: | TOTAL | 99.83% | 2 |
# Count the number of lines in the file and turn that number into a variable
# $lines. The `cut -f1 ...` is to only parse the number, not the filename
# Skip the report header by subtracting 2: the header will be output even if
# there are no undocumented items.
#
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
# be documented then removed from there.

# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
# The table format is: | Module | Coverage | Undocumented |
# Extract the third column (undocumented count) from the TOTAL row
undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')

if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
undocumented=$((lines - 2))
if [ $undocumented -lt 0 ]; then
echo coverage output not found
exit 1
elif [ "$undocumented" -gt 0 ]; then
set +x # Disable command echoing for cleaner output
echo ""
echo "====================="
echo "UNDOCUMENTED OBJECTS:"
echo "====================="
echo ""
# Find the line number of the TOTAL row and print only what comes after it
total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
if [ -n "$total_line" ]; then
# Print only the detailed list (skip the statistics table)
tail -n +$((total_line + 2)) build/coverage/python.txt
else
# Fallback to showing entire file if TOTAL line not found
cat build/coverage/python.txt
fi
echo ""
elif [ $undocumented -gt 0 ]; then
echo undocumented objects found:
cat build/coverage/python.txt
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
set -x # Re-enable command echoing
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1
fi
else
@@ -70,7 +70,7 @@ sccache --zero-stats
sccache --show-stats

# Build the wheel
python -m build --wheel --no-isolation
python -m build --wheel --no-build-isolation
if ($LASTEXITCODE -ne 0) { exit 1 }

# Install the wheel locally
@@ -1,11 +1,11 @@
name: 🚀 New Feature for Release
name: 🚀 Release highlight for proposed Feature
description: Submit a Release highlight for proposed Feature
labels: ["release-feature-request"]

body:
- type: textarea
attributes:
label: New Feature for Release
label: Release highlight for proposed Feature
description: >
Example: “A torch.special module, analogous to SciPy's special module.”
- type: input
.github/ci_commit_pins/audio.txt (vendored, 2 changes)
@@ -1 +1 @@
ad5816f0eee1c873df1b7d371c69f1f811a89387
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2

.github/ci_commit_pins/vision.txt (vendored, 2 changes)
@@ -1 +1 @@
ca2212438fdd8ce29b66999ed70ed54b0f9372d1
cfbc5c2f1c798991715a6b06bb3ce46478c4487c
.github/labeler.yml (vendored, 9 changes)
@@ -138,8 +138,7 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
@@ -149,8 +148,7 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
@@ -160,8 +158,7 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
.github/pytorch-probot.yml (vendored, 6 changes)
@@ -2,8 +2,8 @@ tracking_issue: 24422
ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-distributed
- ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
@@ -22,8 +22,6 @@ ciflow_push_tags:
- ciflow/inductor-perf-test-nightly-xpu
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/inductor-rocm-mi200
- ciflow/inductor-rocm-mi300
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
@@ -35,13 +33,11 @@ ciflow_push_tags:
- ciflow/quantization-periodic
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi200
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/slow-rocm-mi200
- ciflow/torchbench
- ciflow/triton_binaries
- ciflow/trunk
.github/workflows/docker-builds.yml (vendored, 10 changes)
@@ -56,8 +56,6 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-py3.14-clang12,
pytorch-linux-jammy-rocm-n-py3,
@@ -68,8 +66,8 @@ jobs:
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-noble-xpu-n-py3,
pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
@@ -79,11 +77,11 @@ jobs:
pytorch-linux-noble-riscv64-py3.12-gcc14
]
include:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
runner: linux.arm64.m7g.4xlarge
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21
runner: linux.arm64.m7g.4xlarge
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600
# Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358
@@ -72,7 +72,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.arm64.m7g.4xlarge
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" },

@@ -83,8 +83,8 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-noble-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
@@ -117,7 +117,7 @@ jobs:
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-noble-xpu-n-py3.10
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
@@ -137,7 +137,7 @@ jobs:
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-noble-xpu-n-py3.10
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
.github/workflows/inductor-rocm-mi300.yml (vendored, 1 change)
@@ -7,7 +7,6 @@ on:
- release/*
tags:
- ciflow/inductor-rocm/*
- ciflow/inductor-rocm-mi300/*
workflow_dispatch:

concurrency:

@@ -2,12 +2,12 @@ name: inductor-rocm

on:
schedule:
- cron: 0 */3 * * *
- cron: 0 * * * *
push:
branches:
- release/*
tags:
- ciflow/inductor-rocm-mi200/*
- ciflow/inductor-rocm/*
workflow_dispatch:

concurrency:
.github/workflows/linux-aarch64.yml (vendored, 2 changes)
@@ -33,7 +33,7 @@ jobs:
with:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
runner: linux.arm64.m7g.4xlarge
test-matrix: |
{ include: [

.github/workflows/operator_benchmark.yml (vendored, 2 changes)
@@ -60,7 +60,7 @@ jobs:
with:
build-environment: linux-jammy-aarch64-py3.10
runner: linux.arm64.m7g.4xlarge
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
.github/workflows/periodic-rocm-mi200.yml (vendored, 1 change)
@@ -11,6 +11,7 @@ on:
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi200/*
branches:
- release/*

.github/workflows/periodic-rocm-mi300.yml (vendored, 1 change)
@@ -11,7 +11,6 @@ on:
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi300/*
branches:
- release/*
.github/workflows/pull.yml (vendored, 8 changes)
@@ -342,16 +342,16 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-noble-xpu-n-py3_10-build:
name: linux-noble-xpu-n-py3.10
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# This should sync with the build in xpu.yml but xpu uses a larger runner
# sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-noble-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
.github/workflows/rocm-mi300.yml (vendored, 1 change)
@@ -6,7 +6,6 @@ on:
- main
- release/*
tags:
- ciflow/rocm/*
- ciflow/rocm-mi300/*
workflow_dispatch:
schedule:

@@ -5,12 +5,11 @@ on:
branches:
- release/*
tags:
- ciflow/rocm-mi200/*
- ciflow/rocm/*
workflow_dispatch:
schedule:
- cron: 29 8 * * * # about 1:29am PDT
- cron: 0 */3 * * *

- cron: 0 * * * *

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
.github/workflows/slow-rocm-mi200.yml (vendored, 81 changes)
@@ -1,81 +0,0 @@
# This workflow is dedicated to host slow jobs that are run only periodically because
# they are too slow to run in every commit. The list of slow tests can be found in
# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
name: slow-rocm-mi200

on:
push:
branches:
- release/*
tags:
- ciflow/slow/*
- ciflow/slow-rocm-mi200/*
schedule:
- cron: 0 */3 * * *
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read

target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read

get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
]}
secrets: inherit

linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
.github/workflows/slow.yml (vendored, 30 changes)
@@ -105,6 +105,36 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
]}
secrets: inherit

linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-py3_10-clang18-asan-build:
name: linux-jammy-py3.10-clang18-asan
uses: ./.github/workflows/_linux-build.yml
.github/workflows/upload-test-stats.yml (vendored, 5 changes)
@@ -11,16 +11,15 @@ on:
- inductor
- unstable
- slow
- slow-rocm-mi200
- unstable-periodic
- inductor-periodic
- rocm-mi200
- rocm
- rocm-mi300
- rocm-mi355
- inductor-micro-benchmark
- inductor-micro-benchmark-x86
- inductor-cu124
- inductor-rocm-mi200
- inductor-rocm
- inductor-rocm-mi300
- mac-mps
- linux-aarch64
.github/workflows/xpu.yml (vendored, 20 changes)
@@ -47,15 +47,15 @@ jobs:
]}
secrets: inherit

linux-noble-xpu-n-py3_10-build:
name: linux-noble-xpu-n-py3.10
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-noble-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
@@ -74,17 +74,17 @@ jobs:
]}
secrets: inherit

linux-noble-xpu-n-py3_10-test:
name: linux-noble-xpu-n-py3.10
linux-jammy-xpu-n-py3_10-test:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_xpu-test.yml
needs: linux-noble-xpu-n-py3_10-build
needs: linux-jammy-xpu-n-py3_10-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-noble-xpu-n-py3.10
docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
secrets: inherit

windows-xpu-n-1-build:
@@ -143,8 +143,7 @@ init_command = [
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"',
'numpy==2.3.4 ; python_version >= "3.14"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'pyrefly==0.36.2',
'sympy==1.13.3',
@@ -210,12 +210,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg

# Low Precision & Grouped GEMMs
# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58
@@ -174,12 +174,6 @@ class TORCH_API Context {
static long versionCuDNN() {
return detail::getCUDAHooks().versionCuDNN();
}
static long versionRuntimeCuDNN() {
return detail::getCUDAHooks().versionRuntimeCuDNN();
}
static long versionCuDNNFrontend() {
return detail::getCUDAHooks().versionCuDNNFrontend();
}
static bool hasCuSOLVER() {
return detail::getCUDAHooks().hasCuSOLVER();
}
@@ -6,7 +6,6 @@
#include <c10/util/Half.h>
#include <c10/util/Metaprogramming.h>
#include <c10/util/complex.h>
#include <torch/headeronly/core/Dispatch.h>

#ifdef __CUDACC__
#include <cuda.h> // For CUDA_VERSION
@@ -62,9 +61,12 @@ TORCH_API void record_kernel_function_dtype(std::string name);
} \
} while (0)

#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \
AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__)
#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
case enum_type: { \
AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
return __VA_ARGS__(); \
}

#define AT_DISPATCH_CASE(enum_type, ...) \
AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
@@ -93,6 +95,14 @@ TORCH_API void record_kernel_function_dtype(std::string name);
return __VA_ARGS__(); \
}

namespace detail {

inline at::ScalarType scalar_type(at::ScalarType s) {
return s;
}

} // namespace detail

// The AT_DISPATCH_* family of macros provides the ability to
// conveniently generate specializations of a kernel over all of the
// dtypes we care about in PyTorch. We call it "dispatch" because
@@ -180,13 +190,27 @@ TORCH_API void record_kernel_function_dtype(std::string name);
// but we're just being safe (and it doesn't hurt.) Note we must
// use it to shut up warnings about unused store.

#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
THO_DISPATCH_SWITCH_TMPL( \
RECORD_KERNEL_FUNCTION_DTYPE, \
TORCH_CHECK_NOT_IMPLEMENTED, \
TYPE, \
NAME, \
__VA_ARGS__)
#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
[&] { \
const auto& the_type = TYPE; \
constexpr const char* at_dispatch_name = NAME; \
/* don't use TYPE again in case it is an expensive or side-effect op */ \
at::ScalarType _st = ::detail::scalar_type(the_type); \
RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \
switch (_st) { \
__VA_ARGS__ \
default: \
TORCH_CHECK_NOT_IMPLEMENTED( \
false, \
'"', \
at_dispatch_name, \
"\" not implemented for '", \
toString(_st), \
"'"); \
} \
C10_DIAGNOSTIC_POP() \
}()

#define AT_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
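For readers unfamiliar with the dispatch pattern restored in the hunk above, here is a minimal, self-contained sketch of the idea. It is not the ATen code; the `DISPATCH_SWITCH`/`DISPATCH_CASE` names and the `ScalarType` enum below are hypothetical, and the example only illustrates how a runtime dtype tag is mapped to a compile-time type alias so one generic lambda body can be reused per dtype:

```cpp
// Illustrative sketch only (hypothetical macros, not the ATen ones).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

enum class ScalarType { Float, Double, Int };

// Each case maps a runtime enum value to a compile-time alias (`scalar_t`)
// and then invokes the user-supplied lambda.
#define DISPATCH_CASE(enum_val, cpp_type, ...) \
  case ScalarType::enum_val: {                 \
    using scalar_t = cpp_type;                 \
    return __VA_ARGS__();                      \
  }

// The switch is wrapped in an immediately-invoked lambda so the whole
// construct is an expression, mirroring the AT_DISPATCH_SWITCH shape above.
#define DISPATCH_SWITCH(TYPE, NAME, ...)                                   \
  [&] {                                                                    \
    switch (TYPE) {                                                        \
      __VA_ARGS__                                                          \
      default:                                                             \
        throw std::runtime_error(std::string(NAME) + ": not implemented"); \
    }                                                                      \
  }()

int main() {
  ScalarType st = ScalarType::Double;
  // One generic body, specialized per dtype by whichever case fires.
  std::size_t nbytes = DISPATCH_SWITCH(st, "element_size",
      DISPATCH_CASE(Float, float, [&] { return sizeof(scalar_t); })
      DISPATCH_CASE(Double, double, [&] { return sizeof(scalar_t); })
      DISPATCH_CASE(Int, std::int32_t, [&] { return sizeof(scalar_t); }));
  std::cout << nbytes << "\n";  // prints 8
}
```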
@@ -1,8 +1,3 @@
#pragma once

#include <torch/headeronly/core/Dispatch_v2.h>

// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
#include <ATen/Dispatch.h>

// This is a new implementation of the AT_DISPATCH macro family from
@@ -79,19 +74,41 @@
// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly
// relied on GPT4 to help me get it right.

// Public API macros

// See documentation above
#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
THO_DISPATCH_V2_TMPL( \
AT_DISPATCH_SWITCH, \
AT_DISPATCH_CASE, \
TYPE, \
NAME, \
AT_WRAP(BODY), \
__VA_ARGS__)
AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))

// This macro lets you pass an arbitrary expression that may contain internal
// commas to another macro without having the commas causing the expression
// to be interpreted as being multiple arguments
#define AT_WRAP(...) __VA_ARGS__

#define AT_FLOAT8_TYPES \
c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu

#define AT_INTEGRAL_TYPES \
c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
#define AT_INTEGRAL_TYPES_V2 \
AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
// NB: not *actually* all types
#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
#define AT_ALL_TYPES_AND_COMPLEX \
AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)

// Helper macros

// Unused helper macros, kept for BC:
#define AT_AP_VAR(N, T, ...) \
AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
#define AT_CONCAT_AUX(a, b) a##b
#define AT_EXPAND(X) X

// Ensure we never have too many scalar types for the expansion here to
// support. To bump this, you must regenerate the macros below.
@@ -102,6 +119,12 @@ static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 60);

num_args = 60

nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
args = ', '.join(f'_{i}' for i in range(1, num_args+1))

print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')

for i in range(1, num_args+1):
args = ', '.join(f'_{i}' for i in range(1, i+1))
cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
@@ -112,6 +135,8 @@ for i in range(1, num_args+1):
// Begin generated code
// clang-format off

#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N
#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
@@ -388,7 +388,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
#ifndef USE_ROCM
at::Half halpha;
at::Half hbeta;
uint32_t mask = -1;
#endif
void * alpha_ptr = &alpha;
void * beta_ptr = &beta;
@@ -428,7 +427,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
mask =
uint32_t mask =
fp16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
@@ -445,7 +444,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
mask =
uint32_t mask =
bf16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
@@ -512,41 +511,17 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
// on Blackwell+, we fake a n > 1 matmul when querying heuristics
// to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance
#ifndef USE_ROCM
const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10;
#else
const bool lie_to_cublaslt = false;
#endif
if (lie_to_cublaslt) {
CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T);
CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc);

TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
FakeBdesc.descriptor(),
FakeCdesc.descriptor(),
FakeCdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
} else {
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Cdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
}
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Cdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
if (returnedResult == 0) {
cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED;
}
@@ -1597,7 +1572,7 @@ bool gemm_and_bias(
}

using opmath_t = at::opmath_type<Dtype>;
opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr
opmath_t beta_val = 0; // bias is added in epilogue

cudaDataType_t abType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
@@ -1686,22 +1661,15 @@
_syncCurrentWithCarveoutStream(stream, true);
}
#endif
const auto epilogue = [&]() -> cublasLtEpilogue_t {
// The cuBLAS documentation indicates that
// *_<ACTIVATION>_BIAS = *_<ACTIVATION>,
// but we keep it verbose here for clarity.
switch (activation) {
case GEMMAndBiasActivationEpilogue::RELU:
return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU;
case GEMMAndBiasActivationEpilogue::GELU:
return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU;
default:
return bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT;
}
}();
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
}

if (bias) {
if (bias != nullptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
}
@@ -175,7 +175,7 @@ void CUDAGraph::instantiate() {
// who prefer not to report error message through these arguments moving forward
// (they prefer return value, or errors on api calls internal to the capture)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
#else
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
#endif
@@ -184,7 +184,7 @@ void CUDAGraph::instantiate() {
} else {
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
graph_,
cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
cudaGraphInstantiateFlagAutoFreeOnLaunch));
}
has_graph_exec_ = true;
}
@@ -24,13 +24,7 @@ namespace detail {
// radix_sort_pairs doesn't interact with value_t other than to copy
// the data, so we can save template instantiations by reinterpreting
// it as an opaque type.
// We use native integer types for 1/2/4/8-byte values to reduce
// register usage in CUDA kernels. For sizes > 8 fall back to char array.
template <int N> struct alignas(N) OpaqueType { char data[N]; };
template <> struct alignas(1) OpaqueType<1> { uint8_t data; };
template <> struct alignas(2) OpaqueType<2> { uint16_t data; };
template <> struct alignas(4) OpaqueType<4> { uint32_t data; };
template <> struct alignas(8) OpaqueType<8> { uint64_t data; };

template<typename key_t, int value_size>
void radix_sort_pairs_impl(
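As a side note on the "opaque type" comment retained in the hunk above, the following standalone sketch (not the ATen/CUB code itself; `copy_pairs` and the data below are illustrative) shows the trick: values are moved through a POD wrapper keyed only on their byte size, so a single template instantiation covers every value type of that size:

```cpp
// Illustrative sketch of copying values through a size-keyed opaque POD.
#include <cstdint>
#include <iostream>

template <int N> struct alignas(N) OpaqueType { char data[N]; };

// A toy "copy kernel" templated only on the value *size*, not the value type.
template <typename key_t, int value_size>
void copy_pairs(const key_t* keys_in, key_t* keys_out,
                const OpaqueType<value_size>* values_in,
                OpaqueType<value_size>* values_out, int n) {
  for (int i = 0; i < n; ++i) {
    keys_out[i] = keys_in[i];
    values_out[i] = values_in[i];  // bitwise copy; the value type is irrelevant
  }
}

int main() {
  int keys[2] = {1, 2};
  double vals[2] = {3.5, 4.5};
  int keys_out[2];
  double vals_out[2];
  // double and int64_t values both reuse this one instantiation (value_size == 8),
  // mirroring the reinterpretation done by the real radix_sort_pairs wrapper.
  copy_pairs<int, 8>(keys, keys_out,
                     reinterpret_cast<const OpaqueType<8>*>(vals),
                     reinterpret_cast<OpaqueType<8>*>(vals_out), 2);
  std::cout << vals_out[1] << "\n";  // 4.5
}
```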
@@ -21,7 +21,6 @@

#if AT_CUDNN_ENABLED()
#include <ATen/cudnn/cudnn-wrapper.h>
#include <cudnn_frontend.h>
#endif

#if AT_MAGMA_ENABLED()
@@ -352,26 +351,6 @@ long CUDAHooks::versionCuDNN() const {
#endif
}

long CUDAHooks::versionRuntimeCuDNN() const {
#if AT_CUDNN_ENABLED()
#ifndef USE_STATIC_CUDNN
return cudnnGetVersion();
#else
return CUDNN_VERSION;
#endif
#else
TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
#endif
}

long CUDAHooks::versionCuDNNFrontend() const {
#if AT_CUDNN_ENABLED()
return CUDNN_FRONTEND_VERSION;
#else
TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
#endif
}

long CUDAHooks::versionMIOpen() const {
#if AT_ROCM_ENABLED()
return MIOPEN_VERSION_MAJOR * 10000 +

@@ -49,8 +49,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool hasCUDART() const override;
long versionCUDART() const override;
long versionCuDNN() const override;
long versionRuntimeCuDNN() const override;
long versionCuDNNFrontend() const override;
long versionMIOpen() const override;
std::string showConfig() const override;
double batchnormMinEpsilonCuDNN() const override;
@@ -174,14 +174,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}

virtual long versionRuntimeCuDNN() const {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}

virtual long versionCuDNNFrontend() const {
TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
}

virtual long versionMIOpen() const {
TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
}
@@ -1009,25 +1009,12 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) {
}
}

static Tensor send_to_meta(const Tensor& self, const Device& device) {
Tensor out_meta;
if (self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number()) {
out_meta = at::_efficientzerotensor(self.sizes(), self.options().device(device));
out_meta.unsafeGetTensorImpl()->set_wrapped_number(true);
} else {
out_meta = self.to(device);
}
return out_meta;
}

Tensor mul_zerotensor(const Tensor& self, const Tensor& other) {
auto out_device = correct_out_device(self, other);
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
auto self_meta = send_to_meta(self, device_);
auto other_meta = send_to_meta(other, device_);
auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta);
auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));
return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device));
}

@@ -1036,9 +1023,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
auto self_meta = send_to_meta(self, device_);
auto other_meta = send_to_meta(other, device_);
auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta);
auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));

if (self._is_zerotensor()) {
if (other._is_zerotensor()) {
@@ -1067,9 +1052,8 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
auto self_meta = send_to_meta(self, device_);
auto other_meta = send_to_meta(other, device_);
auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha);
auto meta_out = at::_ops::add_Tensor::redispatch(
meta_dks, self.to(device_), other.to(device_), alpha);

auto get_out_like = [&] (const Tensor& tensor)
{
@@ -409,7 +409,7 @@ struct ConvParams {
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
return false;
}
static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
// broken on cuDNN 9.8 - 9.14
if (cudnn_version >= 90800 && cudnn_version < 91500) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@@ -453,7 +453,7 @@ struct ConvParams {
}
// native kernel doesn't support 64-bit non-splittable case
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
if (cudnn_version < 0 || cudnn_version > 91000) {
@ -50,35 +50,18 @@ static inline bool parseLinearFlatten3d() {
|
||||
// `_flatten_nd_linear` flattens all but the last dimension of the input tensor
|
||||
// before passing it to linear operation
|
||||
static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
|
||||
const auto input_sizes = input.sym_sizes();
|
||||
|
||||
const auto result_flattened = [&]() -> Tensor {
|
||||
const auto input_ncols = input_sizes.back();
|
||||
const auto input_flattened_nrows = [&]() -> c10::SymInt {
|
||||
// can't use -1 in reshape because it errors when a dimension is 0
|
||||
auto flattened_nrows = c10::SymInt{1};
|
||||
for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) {
|
||||
flattened_nrows *= size;
|
||||
}
|
||||
return flattened_nrows;
|
||||
}();
|
||||
|
||||
const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols});
|
||||
if (weight.layout() == c10::kStrided) {
|
||||
return at::addmm(bias, input_flattened, weight.t());
|
||||
} else {
|
||||
// weight is sparse, and addmm for sparse expects matmul lhs to be sparse,
|
||||
// so we transpose the problem.
|
||||
// NOTE: at::matmul handles (dense @ sparse) similarly.
|
||||
const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1);
|
||||
return at::addmm(bias_t, weight, input_flattened.t()).t();
|
||||
const auto input_sizes = input.sym_sizes();
|
||||
// can't use -1 in reshape because it errors when a dimension is 0
|
||||
c10::SymInt flattened_dim = 1;
|
||||
for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) {
|
||||
flattened_dim = flattened_dim * input_sizes[i];
|
||||
}
|
||||
}();
|
||||
|
||||
// Unflatten flattened row dims
|
||||
auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()};
|
||||
result_sizes.back() = result_flattened.sym_size(1);
|
||||
return result_flattened.view_symint(result_sizes);
|
||||
auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)});
|
||||
const auto result = at::addmm(bias, inp_reshape, weight.t());
|
||||
auto new_size = input_sizes.slice(0, input_sizes.size() - 1);
|
||||
c10::SymDimVector sizes_vec(new_size.begin(), new_size.end());
|
||||
sizes_vec.push_back(result.sym_size(1));
|
||||
return result.view_symint(sizes_vec);
|
||||
}
|
||||
|
||||
|
||||
@ -107,23 +90,15 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optional<Ten
|
||||
// Fused op is marginally faster.
|
||||
return at::addmm(*bias, input, weight.t());
|
||||
}
|
||||
|
||||
const auto is_bias_likely_fusable = (
|
||||
bias->defined() &&
|
||||
// cuBLASLt: will fuse in the epilogue without copies
|
||||
// when input/weight/bias are all strided.
|
||||
// When weight is not strided, bias will not be fused,
|
||||
// but we can still dispatch here to avoid at::matmul
|
||||
// path which will probably use a very similar
|
||||
// flattening optimization.
|
||||
((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false())
|
||||
);
|
||||
if (is_bias_likely_fusable && !input.is_xla()) {
|
||||
// Also hit the fused path for contiguous nD input, if not using xla
|
||||
if (bias->defined() && !input.is_xla()) {
|
||||
// Also hit the fused path for contiguous 3D input, if not using xla
|
||||
// backend. Reshaping/flattening has some performance implications on xla.
|
||||
if (input.is_contiguous_or_false()) {
|
||||
bool is_contiguous = input.is_contiguous_or_false();
|
||||
if (is_contiguous && input_dim == 3) {
|
||||
return _flatten_nd_linear(input, weight, *bias);
|
||||
} else if (parseLinearFlatten3d()) {
|
||||
} else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) {
|
||||
return _flatten_nd_linear(input, weight, *bias);
|
||||
} else if (parseLinearFlatten3d() && input_dim == 3) {
|
||||
// If user forces flattening via env var
|
||||
const Tensor input_cont = input.contiguous();
|
||||
return _flatten_nd_linear(input_cont, weight, *bias);
|
||||
|
||||
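Note (editor's sketch, not part of the diff): the `_flatten_nd_linear` path above collapses every leading dimension into a single row dimension, runs one `addmm`, and restores the leading dimensions afterwards. A rough Python equivalent of that reshape-addmm-reshape sequence, assuming a dense weight and a 1-D bias:

```python
import math
import torch

def flatten_nd_linear_reference(x, weight, bias):
    # Product of the leading dims, computed explicitly because reshape(-1, n)
    # is ambiguous when one of the dims is 0 (mirrors the SymInt loop above).
    rows = math.prod(x.shape[:-1])
    out = torch.addmm(bias, x.reshape(rows, x.shape[-1]), weight.t())
    return out.view(*x.shape[:-1], out.shape[-1])

x, w, b = torch.randn(2, 3, 5), torch.randn(4, 5), torch.randn(4)
torch.testing.assert_close(flatten_nd_linear_reference(x, w, b),
                           torch.nn.functional.linear(x, w, b))
```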
@ -1,6 +1,5 @@
#include <ATen/core/ATen_fwd.h>
#include <c10/core/ScalarType.h>
#include <c10/core/SymInt.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
@ -1711,37 +1710,11 @@ Tensor narrow_symint(
"], but got ",
start,
")")

auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0));
auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0));

if (cond1 || cond2) {
if (cond1) {
start = start + cur_size;
}

TORCH_SYM_CHECK(
start.sym_le(cur_size - length),
"start (",
start,
") + length (",
length,
") exceeds dimension size (",
cur_size,
").");
return at::slice_symint(self, dim, start, start + length, 1);
if (start < 0) {
start = start + cur_size;
}

// Unbacked start handling!

// Bounds check without converting start:
// - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start +
// length <= 0
// - If start >= 0: need start + length <= cur_size
auto end = start + length;
TORCH_SYM_CHECK(
(start.sym_lt(0).sym_and((end).sym_le(0)))
.sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))),
start.sym_le(cur_size - length),
"start (",
start,
") + length (",
@ -1749,28 +1722,7 @@ Tensor narrow_symint(
") exceeds dimension size (",
cur_size,
").");

if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) {
return at::slice_symint(self, dim, start, end, 1);
} else {
// Cannot statically determine the condition due to unbacked.
// This is an interesting situation; when start is negative and
// start + length == 0, slice and narrow do different things.
// i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to
// pass curr_size instead of 0. Otherwise, they would do the same thing.
// This says at runtime: if start < 0 and end == 0, then pass curr_size
// instead of 0.

auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt();
auto result =
at::slice_symint(self, dim, start, end + use_different * cur_size, 1);

// Ensure slice allocated unbacked size is specialized to length.
SymInt new_size = result.sym_size(dim);
TORCH_SYM_CHECK(new_size.sym_eq(length), "")

return result;
}
return at::slice_symint(self, dim, start, start + length, 1);
}
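Note (editor's sketch, not part of the diff): the unbacked branch above exists because `narrow` and Python slicing disagree precisely when `start` is negative and `start + length == 0`. The difference is easy to see in eager mode:

```python
import torch

x = torch.arange(6).reshape(3, 2)
print(x.narrow(0, -2, 2))   # rows 1 and 2: negative start wraps, length is kept
print(x[-2:0])              # empty tensor: a slice ending at 0 stops before row 0
```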
// This overload exists purely for XLA, because they wanted to pass in
@ -1784,8 +1736,8 @@ Tensor narrow_tensor_symint(
start.dim() == 0 &&
isIntegralType(start.scalar_type(), /*includeBool=*/false),
"start must be an 0-dim integral Tensor.");
c10::SymInt st = start.item().toSymInt();
return at::narrow_symint(self, dim, std::move(st), std::move(length));
int64_t st = start.item<int64_t>();
return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length));
}

std::

@ -247,8 +247,8 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
});
}

template <typename func_t, typename vec_func_t>
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
template <typename func_t, typename vec_func_t, typename ident_t = double>
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, ident_t ident = static_cast<ident_t>(0)) {
using traits = binary_function_traits<func_t>;
static_assert(
all_same<

@ -339,33 +339,13 @@ void or_kernel_impl(TensorIterator& iter) {
}
}

template<typename scalar_t>
struct MinValuesOps: public at::native::MinOps<scalar_t> {
using arg_t = typename MinOps<scalar_t>::arg_t;
static scalar_t project(arg_t arg) {
return arg.first;
}
};

void min_values_kernel_impl(TensorIterator& iter) {
// This case is special because of Vectorized<int64_t> does not
// handle upper_bound<int64_t>().
// See: https://github.com/pytorch/pytorch/issues/43254
if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
binary_kernel_reduce(
iter,
MinValuesOps<scalar_t>{},
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
}), kLong, kUInt64);
return;
}
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
binary_kernel_reduce_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
static_cast<double>(upper_bound<scalar_t>()));
upper_bound<scalar_t>());
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
}
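Note (editor's sketch, not part of the diff): the templated `ident_t` above lets the int64/uint64 reduction hand `upper_bound<scalar_t>()` through unchanged instead of round-tripping it via `double`, which cannot represent the largest 64-bit integers:

```python
# double has 53 bits of mantissa, so the int64 maximum rounds up to 2**63.
INT64_MAX = 2**63 - 1
print(float(INT64_MAX) == 2.0**63)        # True: the value was rounded
print(int(float(INT64_MAX)) - INT64_MAX)  # 1: the round trip is off by one
```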
@ -147,24 +147,14 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) {
/*
* Check whether for the given input we want to enable the Lt interface
*/
static bool isInputCompliesAddmmCudaLt(
Tensor& result,
const Tensor& self,
const Tensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
Activation activation
) {
#ifdef USE_ROCM
static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) {
// Implies 2D bias which we currently not send through Lt.
// TODO: this check is done pre col-major input preparation,
// so, this condition can be ralexed in cases when a col-major
// copy of result is needed.
if (self.is_same(result) || self.dim() == 2) {
if (result.is_same(self)) {
return false;
}
#endif

#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -179,33 +169,13 @@ static bool isInputCompliesAddmmCudaLt(
#if defined(CUDA_VERSION) || defined(USE_ROCM)
const auto scalar_type = mat1.scalar_type();
return (beta.toComplexDouble() == 1.0
// NOTE: row-major result is important when bias is 1D.
// This is because Lt broadcasts 1D bias over the columns
// while the aten::addmm API broadcasts it over the rows,
// and this is in conjuction with the data preparation
// procedure that does not transpose arguments with
// col-major result. For col-major result we need
// to explicitly transpose the problem so that bias is
// correctly applied.
// TODO: enable col-major result if needed.
// TODO: no need to check result's layout when
// !result.is_same(self) and self.dim() == 2, because
// self needs to be copied into result and the bias ptr
// will be ignored.
&& result.dim() == 2 && result.is_contiguous()
// Conditions for bias to be fusable
&& (
( // Conditions for bias to be fusable -- implies direct Lt path without copies.
self.is_contiguous() &&
// NOTE: fine to have 1-len dims to the left from the right-most one
(self.dim() == 1 || self.squeeze().dim() == 1) &&
self.sizes().back() == mat2_sizes[1]
)
|| ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self),
// and we need to copy self into result otherwise, so the self's layout becomes irrelevant.
// See also TODO from above.
activation != Activation::None && // Lt is faster when activation is fused
(self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]}))
)
self.is_contiguous() &&
// NOTE: fine to have 1-len dims to the left from the right-most one
(self.dim() == 1 || self.squeeze().dim() == 1) &&
self.sizes().back() == mat2_sizes[1]
)
&& ( // some dtype restrictions
#ifndef USE_ROCM
@ -300,16 +270,7 @@ bool launchGemmAndBiasCublasLt(
const Scalar& alpha,
Activation activation = Activation::None
) {
// We apply bias in the epilogue only when it is 1D,
// or when it can be squeezed to 1D.
// self_ptr == nullptr implies ignore bias epilogue
// and use standard gemm-like API.
const auto* self_ptr = [&]() -> auto {
if (self.dim() == 1 || self.squeeze().dim() == 1) {
return self.const_data_ptr<scalar_t>();
}
return static_cast<const scalar_t*>(nullptr);
}();
const auto* self_ptr = self.const_data_ptr<scalar_t>();

const auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
@ -395,7 +356,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
#endif
// Condition on the input
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt;
// }

at::ScalarType scalar_type = mat1.scalar_type();
@ -405,20 +366,19 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
if (!result.is_same(self)) {
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

// We use bias ptr in the Lt path only when bias is 1D
const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
if (!use_bias_ptr_lt) {
// We do expand self even before
if (disable_addmm_cuda_lt) {
// When in non-Lt path we do expand self even before
// check for beta != 0.0 to make sure that
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
// runs green.
return expand_size(self, result.sizes(), "addmm");
}
// copy next, should broadcast
return c10::MaybeOwned<Tensor>::borrowed(self);
}();
// We do not copy bias only when we need the bias ptr
if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
// We copy bias when in the non-Lt path
if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) {
// NOTE: self should broadcast over result
at::native::copy_(result, *self_maybe_expanded);
}
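Note (editor's sketch, not part of the diff): the relaxed check above targets a bias that is 1-D (or squeezes to 1-D) and contiguous, with length equal to `mat2.size(1)`; a 2-D bias is only considered when an activation is fused. Whichever path ends up being taken, the result of `addmm` is the same:

```python
import torch

mat1, mat2 = torch.randn(8, 16), torch.randn(16, 32)
for bias in (torch.randn(32),        # 1-D: candidate for the fused Lt epilogue
             torch.randn(1, 32),     # squeezes to 1-D
             torch.randn(8, 32)):    # full 2-D bias
    out = torch.addmm(bias, mat1, mat2)
    print(tuple(bias.shape), tuple(out.shape))
```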
@ -884,69 +884,6 @@ struct type_specialized_kernel_launcher {
}
};

template <int arg_index>
struct type_specialized_broadcast_kernel_launcher {
template <
typename func_t,
typename array_t,
typename dtypes_t,
typename calc_t>
static void apply(
int64_t numel,
func_t f,
array_t data,
dtypes_t dtypes,
calc_t offset_calc) {
using traits = function_traits<func_t>;
using ret_t = typename traits::result_type;
using arg0_t = typename traits::template arg<0>::type;
using arg1_t = typename traits::template arg<1>::type;
if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
dtypes[1] == rt_binary_specializations[arg_index][1] &&
dtypes[2] == rt_binary_specializations[arg_index][2]) {
using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
*(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
*(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
*(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
*(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
*(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
}
});
}
}
};

} // namespace
#endif

@ -1065,32 +1002,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
if (check_binary_rt_types_for_specialization(iter)) {
// constexpr to reduce the amount of kernels generated for
// broadcast elementwise with mexed dtypes and limit which functors are actually
// applied to the load and store at compile time.
using func_tuple = typename traits::ArgsTuple;
if constexpr (
std::is_same_v<float, arg0_t> && traits::arity == 2 &&
check_binary_functor_types_for_specialization<
func_tuple,
float,
float,
traits::arity,
/*arg_num=*/0>::check()) {
memory::detail::static_unroll<
type_specialized_broadcast_kernel_launcher,
rt_binary_specializations.size()>::with_args(
numel,
f,
data,
dtypes,
offset_calc
);
return;
}
}

constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {

@ -133,7 +133,7 @@ at::Tensor quantized_convolution(
// supported in conv.
mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
if (groups > 1 && weight_zero_points.numel() > 1)
mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel)
mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel)
dnnl::primitive_attr pattr;

bool src_need_zp = (act_zero_point != 0);
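Note (editor's sketch, not part of the diff): in C++ (as in Python) `^` is bitwise XOR, not exponentiation, so `(2 ^ 0) | (2 ^ 1)` evaluates to `2 | 3`; it lands on the intended mask value 3 only by coincidence. The shift form states the intent of setting bits 0 and 1 directly:

```python
old = (2 ^ 0) | (2 ^ 1)    # XOR: 2 | 3 == 3 (accidentally the right value)
new = (1 << 0) | (1 << 1)  # bit 0 (group) | bit 1 (output channel) == 3
print(old, new)            # 3 3
```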
@ -212,12 +212,17 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({}));
TORCH_CHECK(loss.is_mps());

Tensor loss_squeezed = loss.squeeze();
Tensor input_squeezed = input.squeeze();
Tensor target_squeezed = target.squeeze();

@autoreleasepool {
std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight});
std::string key =
op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight});

auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed);
newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed);

MPSGraphTensor* bceLossUnweighted = nil;
// if grad_output is defined, then it's a backward pass
@ -247,12 +252,12 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
newCachedGraph->gradInputTensor = bceLoss;
}
} else {
newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size());
newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size());
}
});
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input);
Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target);
Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss);
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed);
Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed);
Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed);

NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];

@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
const auto s_k = params.key.sym_size(2);
const auto d_qk = params.query.sym_size(3);
const auto d_v = params.value.sym_size(3);
long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
if (cudnn_version < 8903) {
if (debug) {
TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
return false;
#endif
#if defined(CUDNN_VERSION)
static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
static auto cudnn_version = cudnnGetVersion();
if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
if (debug) {
TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");

@ -53,8 +53,10 @@ class AddmmBenchmark(op_bench.TorchBenchmarkBase):
return torch.addmm(input_one, mat1, mat2)

op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark)
op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(
addmm_long_configs + addmm_long_configs, AddmmBenchmark
)

"""Mircobenchmark for addbmm operator."""

@ -105,7 +107,9 @@ addbmm_short_configs = op_bench.cross_product_configs(
)

op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark)
op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark)
op_bench.generate_pt_gradient_test(
addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark
)

if __name__ == "__main__":
op_bench.benchmark_runner.main()

@ -1,5 +1,4 @@
#include <c10/core/SymBool.h>
#include <c10/core/SymInt.h>
#include <c10/core/SymNodeImpl.h>

namespace c10 {
@ -112,17 +111,4 @@ bool SymBool::has_hint() const {
return toSymNodeImpl()->has_hint();
}

SymInt SymBool::toSymInt() const {
// If concrete bool, return concrete SymInt
if (auto ma = maybe_as_bool()) {
return SymInt(*ma ? 1 : 0);
}

// Symbolic case: use sym_ite to convert bool to int (0 or 1)
auto node = toSymNodeImpl();
auto one_node = node->wrap_int(1);
auto zero_node = node->wrap_int(0);
return SymInt(node->sym_ite(one_node, zero_node));
}

} // namespace c10

@ -12,8 +12,6 @@

namespace c10 {

class SymInt;

class C10_API SymBool {
public:
/*implicit*/ SymBool(bool b) : data_(b) {}
@ -82,10 +80,6 @@ class C10_API SymBool {
return toSymNodeImplUnowned()->constant_bool();
}

// Convert SymBool to SymInt (0 or 1)
// This is the C++ equivalent of Python's cast_symbool_to_symint_guardless
SymInt toSymInt() const;

bool is_heap_allocated() const {
return ptr_;
}

@ -106,9 +106,6 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
} else if (key == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
used_native_specific_option = true;
} else if (key == "per_process_memory_fraction") {
i = parsePerProcessMemoryFraction(tokenizer, i);
used_native_specific_option = true;
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
@ -149,18 +146,6 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
return i;
}

double CUDAAllocatorConfig::parsePerProcessMemoryFraction(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK_VALUE(
val_env >= 0.0 && val_env <= 1.0,
"per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
m_per_process_memory_fraction = val_env;
return i;
}

size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
@ -61,10 +61,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_graph_capture_record_stream_reuse;
}

static double per_process_memory_fraction() {
return instance().m_per_process_memory_fraction;
}

/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
@ -156,8 +152,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
"pinned_use_hip_host_register",
"graph_capture_record_stream_reuse",
"pinned_reserve_segment_size_mb",
"pinned_num_register_threads",
"per_process_memory_fraction"};
"pinned_num_register_threads"};
return keys;
}

@ -182,9 +177,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parseGraphCaptureRecordStreamReuse(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
double parsePerProcessMemoryFraction(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);

std::atomic<size_t> m_pinned_num_register_threads{1};
std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
@ -197,7 +189,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
std::atomic<bool> m_release_lock_on_cudamalloc{false};
std::atomic<bool> m_pinned_use_cuda_host_register{false};
std::atomic<bool> m_graph_capture_record_stream_reuse{false};
std::atomic<double> m_per_process_memory_fraction{1.0};
};

// Keep this for backwards compatibility

@ -1100,7 +1100,7 @@ class RingBuffer {
} // anonymous namespace
} // namespace Native

static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
#ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
void* nvml_handle = DriverAPI::get_nvml_handle();
if (!nvml_handle) {
@ -1111,6 +1111,9 @@ static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
return true;
}();

cudaDeviceProp prop{};
C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

// NOLINTNEXTLINE(*-c-arrays)
char pci_id[80];
snprintf(
@ -1212,16 +1215,14 @@ class DeviceCachingAllocator {
// record used memory.
size_t total_allocated_memory = 0;

cudaDeviceProp device_prop;

// maximum amount of memory that device is allowed to
// allocate. This is set iff memory fraction is less than 1
std::optional<size_t> allowed_memory_maximum{std::nullopt};
size_t allowed_memory_maximum = 0;

// all live expandable segments
std::vector<ExpandableSegment*> expandable_segments_;
std::vector<c10::DeviceIndex> devices_with_peer_access_;

bool set_fraction = false;

bool record_history = false;

std::atomic<CreateContextFn> context_recorder_;
@ -1263,9 +1264,6 @@ class DeviceCachingAllocator {
: device_id(id),
large_blocks(/*small=*/false),
small_blocks(/*small=*/true) {
C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));

setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
stats.max_split_size =
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
@ -1401,7 +1399,7 @@ class DeviceCachingAllocator {
if (!block_found) {
// Do garbage collection if the flag is set.
if (C10_UNLIKELY(
allowed_memory_maximum.has_value() &&
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
garbage_collect_cached_blocks(context);
@ -1458,12 +1456,11 @@ class DeviceCachingAllocator {
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
std::string allowed_info;

if (allowed_memory_maximum.has_value()) {
allowed_info =
format_size(allowed_memory_maximum.value()) + " allowed; ";
if (set_fraction) {
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
}

std::string proc_info = reportProcessMemoryInfo(device_prop);
std::string proc_info = reportProcessMemoryInfo(device_id);

record_trace(
TraceEntry::OOM,
@ -1521,7 +1518,7 @@ class DeviceCachingAllocator {
for (const auto& obs : observers_local) {
obs(device_id,
alloc_size,
allowed_memory_maximum.value_or(device_total),
set_fraction ? allowed_memory_maximum : device_total,
device_free);
}

@ -2018,26 +2015,25 @@ class DeviceCachingAllocator {

/** get memory fraction limiting maximum allocated memory **/
double getMemoryFraction() {
if (!allowed_memory_maximum.has_value()) {
if (!set_fraction) {
return 1.0;
}

return static_cast<double>(allowed_memory_maximum.value()) /
static_cast<double>(device_prop.totalGlobalMem);
size_t device_free = 0;
size_t device_total = 0;
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
return static_cast<double>(allowed_memory_maximum) /
static_cast<double>(device_total);
}

/** set memory fraction to limit maximum allocated memory **/
void setMemoryFraction(double fraction) {
TORCH_CHECK(
0 <= fraction && fraction <= 1,
"invalid fraction:",
fraction,
". Please set within [0, 1].");
allowed_memory_maximum = std::nullopt;
if (fraction < 1.0) {
allowed_memory_maximum = static_cast<size_t>(
fraction * static_cast<double>(device_prop.totalGlobalMem));
}
size_t device_free = 0;
size_t device_total = 0;
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
allowed_memory_maximum =
static_cast<size_t>(fraction * static_cast<double>(device_total));
set_fraction = true;
}
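Note (editor's sketch, not part of the diff): in one direction of this change the cap is derived from the memory reported by `cudaMemGetInfo` at the time the fraction is set, in the other from the device's `totalGlobalMem`. From Python, the same knob is exposed through the CUDA memory API:

```python
import torch

if torch.cuda.is_available():
    # Cap this process at roughly half of device 0's memory; allocations
    # beyond the cap raise an out-of-memory error from the caching allocator.
    torch.cuda.set_per_process_memory_fraction(0.5, device=0)
```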
/** get expandable segment size for all the streams on device **/
@ -3014,7 +3010,7 @@ class DeviceCachingAllocator {
BlockPool& pool = *p.pool;

if (C10_UNLIKELY(
allowed_memory_maximum.has_value() &&
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count;
@ -3087,7 +3083,7 @@ class DeviceCachingAllocator {

size_t gc_threshold = static_cast<size_t>(
AcceleratorAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum.value()));
static_cast<double>(allowed_memory_maximum));
// No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) {
return;
@ -3165,8 +3161,8 @@ class DeviceCachingAllocator {

bool active_pool =
p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
if (allowed_memory_maximum.has_value() &&
total_allocated_memory + size > allowed_memory_maximum.value()) {
if (set_fraction &&
total_allocated_memory + size > allowed_memory_maximum) {
p.err = cudaErrorMemoryAllocation;
return false;
// Temporarily disable checkpointing & cudagraphs internally
@ -3863,6 +3859,7 @@ class NativeCachingAllocator : public CUDAAllocator {
"Allocator not initialized for device ",
device,
": did you call init?");
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
return device_allocator[device]->getMemoryFraction();
}

@ -3872,6 +3869,12 @@ class NativeCachingAllocator : public CUDAAllocator {
"Allocator not initialized for device ",
device,
": did you call init?");
TORCH_CHECK(
0 <= fraction && fraction <= 1,
"invalid fraction:",
fraction,
". Please set within [0, 1].");
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
device_allocator[device]->setMemoryFraction(fraction);
}

@ -2,7 +2,6 @@

#include <c10/core/AllocatorConfig.h>
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>

@ -427,6 +427,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
// on the current device each later call sees.
void init(int dev_count) override {
static bool called = [](int dev_count) {
;
// Are there external guarantees init will be called before
// any of the allocator's other functions?
// std::lock_guard<std::mutex> lk(general_mutex);

@ -1941,7 +1941,6 @@ if(BUILD_TEST)
foreach(test_src ${Caffe2_XPU_TEST_SRCS})
get_filename_component(test_name ${test_src} NAME_WE)
add_executable(${test_name} "${test_src}")
torch_compile_options(${test_name})
target_link_libraries(${test_name} torch_library gtest_main)
target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})

@ -73,19 +73,6 @@ void box_cox_zero_lambda(
}
}

template <typename T>
at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
at::vec::Vectorized<T> data,
at::vec::Vectorized<T> lambda1,
at::vec::Vectorized<T> lambda2,
at::vec::Vectorized<T> k_eps) {
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
}

template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
@ -101,18 +88,21 @@ void box_cox_nonzero_lambda(
auto k_eps_vec = Vec(k_eps);
for(; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(data_ptr + j);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
res.store(out + j);
}
if (j < D) {
auto remaining = D - j;
auto data = Vec::loadu(data_ptr + j, remaining);
auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j, remaining);
for ( ;j < D; ++j) {
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
auto pow = std::pow(max, lambda1_ptr[j]);
out[j] = pow * lambda_over_1 - lambda_over_1;
}
}
#else
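Note (editor's addition): for reference, the helper factored out above computes the standard Box-Cox transform for a nonzero lambda, with the shifted input clamped to a small epsilon. In the notation of the code:

```latex
y_j = \frac{\bigl(\max(x_j + \lambda_{2,j},\ \varepsilon)\bigr)^{\lambda_{1,j}} - 1}{\lambda_{1,j}},
\qquad \lambda_{1,j} \neq 0
```

The fused `pow * (1/\lambda_1) - (1/\lambda_1)` in the kernel is the same expression with the division replaced by a multiply by the reciprocal.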
@ -206,41 +206,6 @@ templates_path = [
os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
]
# TODO: document these and remove them from here.
# Fixes the duplicated
autosummary_filename_map = {
"torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
"torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
"torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
"torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
"torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
"torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
"torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
"torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
"torch.optim.radam.radam": "torch.optim.radam.radam_function",
"torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
"torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
"torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
"torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
"torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
"torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
"torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
"torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
"torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
"torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
"torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
"torch.optim.adam.adam": "torch.optim.adam.adam_function",
"torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
"torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
"torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
"torch.mtia.stream": "torch.mtia.stream_function",
"torch.mtia.Stream": "torch.mtia.Stream_class",
"torch.cpu.stream": "torch.cpu.stream_function",
"torch.cpu.Stream": "torch.cpu.Stream_class",
"torch.cuda.stream": "torch.cuda.stream_function",
"torch.cuda.Stream": "torch.cuda.Stream_class",
"torch.xpu.stream": "torch.xpu.stream_function",
"torch.xpu.Stream": "torch.xpu.Stream_class",
}

coverage_ignore_functions = [
# torch
@ -3230,11 +3195,6 @@ autodoc_type_aliases = {
# Enable overriding of function signatures in the first line of the docstring.
autodoc_docstring_signature = True

# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
autodoc_default_options = {
"exclude-members": "from_bytes, to_bytes",
}

# -- katex javascript in header
#
# def setup(app):

@ -619,10 +619,6 @@ Available options:
and reallocate buffers across multiple streams, especially when the capture DAG frequently
reaches joined frontiers.

* ``per_process_memory_fraction`` option limits the amount of memory that can be allocated
on all the CUDA devices to a specified fraction of the available memory. This is a value
between 0 and 1. Attempting to allocate more memory will raise an out of memory error.

.. note::

Some stats reported by the
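Note (editor's sketch, not part of the diff): the documented option corresponds to a `key:value` entry in the allocator config string parsed earlier in this change; the environment-variable spelling below is an assumption based on the existing allocator options.

```python
import os

# Cap every process at 60% of each visible CUDA device before torch
# initializes its allocator (key name taken from the parser above).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "per_process_memory_fraction:0.6"
import torch  # noqa: E402  (imported after setting the env var on purpose)
```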
@ -46,108 +46,6 @@ These headers are promised to be ABI stable across releases and adhere to a stro
Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
which will handle all the rough edges of the C API for the user.

## Migrating your kernel to the LibTorch stable ABI

If you'd like your kernel to be ABI stable with LibTorch, meaning you'd the ability to build for one version and run on another, your kernel must only use the limited stable ABI. This following section goes through some steps of migrating an existing kernel and APIs we imagine you would need to swap over.

Firstly, instead of registering kernels through `TORCH_LIBRARY`, LibTorch ABI stable kernels must be registered via `STABLE_TORCH_LIBRARY`. Note that, for the time being, implementations registered via `STABLE_TORCH_LIBRARY` must be boxed unlike `TORCH_LIBRARY`. See the simple example below or our docs on [Stack-based APIs](stack-based-apis) for more details. For kernels that are registered via `pybind`, before using the stable ABI, it would be useful to migrate to register them via `TORCH_LIBRARY`.

While previously your kernels might have included APIs from `<torch/*.h>` (for example, `<torch/all.h>`), they are now limited to including from the 3 categories of headers mentioned above (`torch/csrc/stable/*.h`, `torch/headeronly/*.h` and the stable C headers). This means that your extension should no longer use any utilities from the `at::` or `c10::` namespaces but instead use their replacements in `torch::stable` and `torch::headeronly`. To provide a couple examples of the necessary migrations:
- all uses of `at::Tensor` must be replaced with `torch::stable::Tensor`
- all uses of `TORCH_CHECK` must be replaced with `STD_TORCH_CHECK`
- all uses of `at::kCUDA` must be replaced with `torch::headeronly::kCUDA` etc.
- native functions such as `at::pad` must be replaced with `torch::stable::pad`
- native functions that are called as Tensor methods (e.g., `Tensor.pad`) must be replaced with the ATen variant through `torch::stable::pad`.

As mentioned above, the LibTorch stable ABI is still under development. If there is any API or feature you would like to see added to the stable ABI/`torch::headeronly`/`torch::stable`, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).

Below is a simple example of migrating an existing kernel that uses `TORCH_LIBRARY` to the stable ABI (`TORCH_STABLE_LIBRARY`). For a larger end to end example you can take a look at the FA3 repository. Specifically the diff between [`flash_api.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api.cpp#L1) and the stable variant [`flash_api_stable.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api_stable.cpp#L1).

### Original Version with `TORCH_LIBRARY`

```cpp
// original_kernel.cpp - Using TORCH_LIBRARY (not stable ABI)
#include <torch/torch.h>
#include <ATen/ATen.h>

namespace myops {

// Simple kernel that adds a scalar value to each element of a tensor
at::Tensor add_scalar(const at::Tensor& input, double scalar) {
TORCH_CHECK(input.scalar_type() == at::kFloat, "Input must be float32");

return input.add(scalar);
}

// Register the operator
TORCH_LIBRARY(myops, m) {
m.def("add_scalar(Tensor input, float scalar) -> Tensor", &add_scalar);
}

// Register the implementation
TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
m.impl("add_scalar", &add_scalar);
}

} // namespace myops
```

### Migrated Version with `STABLE_TORCH_LIBRARY`

```cpp
// stable_kernel.cpp - Using STABLE_TORCH_LIBRARY (stable ABI)

// (1) Don't include <torch/torch.h> <ATen/ATen.h>
// only include APIs from torch/csrc/stable, torch/headeronly and C-shims
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor_struct.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/stableivalue_conversions.h>
#include <torch/headeronly/core/ScalarType.h>
#include <torch/headeronly/macros/Macros.h>

namespace myops {

// Simple kernel that adds a scalar value to each element of a tensor
torch::stable::Tensor add_scalar(const torch::stable::Tensor& input, double scalar) {
// (2) use STD_TORCH_CHECK instead of TORCH_CHECK
STD_TORCH_CHECK(
// (3) use torch::headeronly::kFloat instead of at:kFloat
input.scalar_type() == torch::headeronly::kFloat,
"Input must be float32");

// (4) Use stable ops namespace instead of input.add
return torch::stable::add(input, scalar);
}

// (5) Add Boxed wrapper required for STABLE_TORCH_LIBRARY
void boxed_add_scalar(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
// Extract arguments from stack using `to<T>`
auto input = to<torch::stable::Tensor>(stack[0]);
auto scalar = to<double>(stack[1]);

// Call the actual kernel
auto result = add_scalar(input, scalar);

// Put result back on stack using `from()`
// Stack slot 0 now holds the return value
stack[0] = from(result);
}

// (6) Register the operator using STABLE_TORCH_LIBRARY
STABLE_TORCH_LIBRARY(myops, m) {
m.def("add_scalar(Tensor input, float scalar) -> Tensor", &boxed_add_scalar);
}

// (7) Register the implementation using STABLE_TORCH_LIBRARY_IMPL
STABLE_TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
m.impl("add_scalar", &boxed_add_scalar);
}

} // namespace myops
```

## How are objects passed across the ABI boundary when interacting with the dispatcher?
@ -211,7 +109,6 @@ There are two invariants for the stack:
|
||||
a. When calling a stack-based API, you must give owning references to the calling stack and steal references from the returned stack.
|
||||
b. When registering your function to be called with a stack, you must steal references from your argument stack and push onto the stack new references.
|
||||
|
||||
(stack-based-apis)=
|
||||
### Stack-based APIs
|
||||
|
||||
The above is relevant in two places:
|
||||
|
||||
@ -253,6 +253,7 @@ regular full-precision tensor.
|
||||
.. autosummary::
|
||||
:toctree: generated
|
||||
:nosignatures:
|
||||
:template: classtemplate.rst
|
||||
|
||||
view
|
||||
as_strided
|
||||
|
||||
@ -75,7 +75,6 @@ class TestScheduler(TestCase):
|
||||
|
||||
class TestCubicScheduler(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.model_sparse_config = [
|
||||
{"tensor_fqn": "0.weight", "sparsity_level": 0.8},
|
||||
{"tensor_fqn": "2.weight", "sparsity_level": 0.4},
|
||||
|
||||
@ -11,7 +11,6 @@ from torch.testing._internal.common_utils import IS_LINUX, run_tests, TestCase
|
||||
@unittest.skipIf(not IS_LINUX, "Only works on linux")
|
||||
class TestTorchrun(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self._test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__)
|
||||
|
||||
def tearDown(self):
|
||||
|
||||
@ -10,8 +10,6 @@ set(AOTI_ABI_CHECK_TEST_SRCS
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/main.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_cast.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch_v2.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
|
||||
@ -47,10 +45,6 @@ endif()
|
||||
# Disable unused-variable warnings for variables that are only used to test compilation
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
|
||||
# Add -Wno-dangling-pointer for GCC 13
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-dangling-pointer)
|
||||
endif()
|
||||
|
||||
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
|
||||
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
|
||||
|
||||
@ -1,82 +0,0 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <torch/headeronly/core/Dispatch.h>
|
||||
#include <torch/headeronly/core/Dispatch_v2.h>
|
||||
|
||||
// MY_PRIVATE_CHECK_SELECTIVE_BUILD is a prelude to case block. For
|
||||
// testing, we do nothing:
|
||||
#define MY_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type) /* empty */
|
||||
|
||||
#define MY_PRIVATE_CASE_TYPE_USING_HINT(...) \
|
||||
THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \
|
||||
MY_PRIVATE_CHECK_SELECTIVE_BUILD, __VA_ARGS__)
|
||||
|
||||
#define MY_DISPATCH_CASE(...) \
|
||||
THO_DISPATCH_CASE_TMPL(MY_PRIVATE_CASE_TYPE_USING_HINT, __VA_ARGS__)
|
||||
|
||||
// MY_RECORD_KERNEL_FUNCTION_DTYPE is a prelude to switch
|
||||
// statement. For testing, we just avoid unused variable warning:
|
||||
#define MY_RECORD_KERNEL_FUNCTION_DTYPE(DISPATCHNAME, ENUMTYPE) \
|
||||
(void)DISPATCHNAME
|
||||
|
||||
// MY_CHECK_NOT_IMPLEMENTED is called in switch default block. For
|
||||
// testing, we count case mismatches:
|
||||
#define MY_CHECK_NOT_IMPLEMENTED(...) default_count++
|
||||
|
||||
#define MY_DISPATCH_SWITCH(...) \
|
||||
THO_DISPATCH_SWITCH_TMPL( \
|
||||
MY_RECORD_KERNEL_FUNCTION_DTYPE, MY_CHECK_NOT_IMPLEMENTED, __VA_ARGS__)
|
||||
|
||||
// MY_CASE_FUNCTION is called in a case block. For testing, we count
|
||||
// case matches and ensure that scalar_t/index_t type is defined:
|
||||
#define MY_CASE_FUNCTION \
|
||||
[&] { \
|
||||
count++; \
|
||||
scalar_t tmp; \
|
||||
(void)tmp; \
|
||||
}
|
||||
#define MY_INDEX_CASE_FUNCTION \
|
||||
[&] { \
|
||||
count++; \
|
||||
index_t tmp; \
|
||||
(void)tmp; \
|
||||
}
|
||||
|
||||
#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
|
||||
|
||||
#define MY_DISPATCH_V2(TYPE, NAME, BODY, ...) \
|
||||
THO_DISPATCH_V2_TMPL( \
|
||||
MY_DISPATCH_SWITCH, \
|
||||
MY_DISPATCH_CASE, \
|
||||
TYPE, \
|
||||
NAME, \
|
||||
AT_WRAP(BODY), \
|
||||
__VA_ARGS__)
|
||||
|
||||
#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...) \
|
||||
TEST(TestDispatchV2, NAME) { \
|
||||
using torch::headeronly::ScalarType; \
|
||||
using torch::headeronly::impl::ScalarTypeToCPPTypeT; \
|
||||
int8_t total_count = 0; \
|
||||
int8_t count = 0; \
|
||||
int8_t default_count = 0; \
|
||||
for (ScalarType t : \
|
||||
{AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) { \
|
||||
total_count++; \
|
||||
MY_DISPATCH_V2(t, "test_my_dispatch_v2", MY_CASE_FUNCTION, __VA_ARGS__); \
|
||||
} \
|
||||
EXPECT_EQ(count, EXPECTEDCOUNT); \
|
||||
EXPECT_EQ(default_count + count, total_count); \
|
||||
}
|
||||
|
||||
TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
|
||||
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
|
||||
TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
|
||||
TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
|
||||
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
|
||||
TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
|
||||
TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
|
||||
TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
|
||||
TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
|
||||
|
||||
#undef DEFINE_ITEM
|
||||
@ -1,45 +0,0 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/headeronly/core/Dispatch_v2.h>
|
||||
#include <torch/headeronly/util/Exception.h>
|
||||
|
||||
#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
|
||||
|
||||
#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...) \
|
||||
TEST(TestThoDispatchV2, NAME) { \
|
||||
using torch::headeronly::ScalarType; \
|
||||
using torch::headeronly::impl::ScalarTypeToCPPTypeT; \
|
||||
int8_t total_count = 0; \
|
||||
int8_t count = 0; \
|
||||
int8_t default_count = 0; \
|
||||
for (ScalarType t : \
|
||||
{AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) { \
|
||||
total_count++; \
|
||||
try { \
|
||||
THO_DISPATCH_V2( \
|
||||
t, \
|
||||
"test_tho_dispatch_v2", \
|
||||
[&] { \
|
||||
count++; \
|
||||
scalar_t tmp; \
|
||||
(void)tmp; \
|
||||
}, \
|
||||
__VA_ARGS__); \
|
||||
} catch (...) { \
|
||||
default_count++; /* counts mismatches */ \
|
||||
} \
|
||||
} \
|
||||
EXPECT_EQ(count, EXPECTEDCOUNT); \
|
||||
EXPECT_EQ(default_count + count, total_count); \
|
||||
}
|
||||
|
||||
TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
|
||||
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
|
||||
TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
|
||||
TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
|
||||
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
|
||||
TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
|
||||
TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
|
||||
TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
|
||||
TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
|
||||
|
||||
#undef DEFINE_ITEM
|
||||
@ -70,13 +70,6 @@ if(NOT MSVC)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
    target_compile_options_if_supported(test_api "-Wno-error=nonnull")
  endif()

  # Add -Wno-error=array-bounds for GCC 13+
  # See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113239
  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
    target_compile_options_if_supported(test_api "-Wno-error=array-bounds")
  endif()

endif()

if(INSTALL_TEST)
@ -64,7 +64,7 @@ def run(initializer):

def main():
    initializer_parameter_map = {}
    for initializer in INITIALIZERS:
    for initializer in INITIALIZERS.keys():
        sys.stderr.write(f"Evaluating {initializer} ...\n")
        initializer_parameter_map[initializer] = run(initializer)

@ -130,7 +130,7 @@ def main():
    options = parser.parse_args()

    optimizer_parameter_map = {}
    for optimizer in OPTIMIZERS:
    for optimizer in OPTIMIZERS.keys():
        sys.stderr.write(f"Evaluating {optimizer} ...\n")
        optimizer_parameter_map[optimizer] = run(
            optimizer, options.iterations, options.sample_every
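The two hunks above, like several later ones in the checkpoint, FSDP, and safetensors tests, simply drop a redundant `.keys()` call: iterating a Python dict directly yields its keys. A minimal, self-contained illustration of the equivalence (the dict below is hypothetical, not taken from the benchmark scripts):

# Iterating a dict visits the same keys, in the same order, as iterating dict.keys().
initializers = {"xavier_uniform": 0, "kaiming_normal": 1, "orthogonal": 2}
assert list(initializers) == list(initializers.keys())
for name in initializers:  # equivalent to: for name in initializers.keys()
    print(name, initializers[name])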
@ -11,7 +11,6 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class TestCustomBackend(TestCase):
    def setUp(self):
        super().setUp()
        # Load the library containing the custom backend.
        self.library_path = get_custom_backend_library_path()
        torch.ops.load_library(self.library_path)
@ -18,7 +18,6 @@ torch.ops.import_module("pointwise")

class TestCustomOperators(TestCase):
    def setUp(self):
        super().setUp()
        self.library_path = get_custom_op_library_path()
        ops.load_library(self.library_path)

@ -76,7 +76,7 @@ class ReplicateTest(MultiProcessTestCase):
            store=dist.FileStore(self.file_name, self.world_size),
        )

    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_replicate_transformer(self):
        """
        This tests that replicate works on a transformer model with fully_shard and replicate layers
@ -126,7 +126,7 @@ class ReplicateTest(MultiProcessTestCase):
            for parameter in layer.parameters():
                self.assertEqual(parameter.placements, (Shard(dim=0),))

    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_replicate_transformer_managed_modules(self):
        """
        This tests that replicate managed modules works properly. In this test we use a Transformer Module with 3 layers,
@ -178,7 +178,7 @@ class ReplicateTest(MultiProcessTestCase):
        replicate_model = replicate(replicate_model)
        self.assertEqual(len(_get_managed_modules((replicate_model,))), 21)

    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_replicate_tp_device_mesh(self):
        """
        This tests that a user can pass in a device mesh to replicate a module
@ -206,7 +206,7 @@ class ReplicateTest(MultiProcessTestCase):
            self.assertEqual(parameter.device_mesh.shape, (2,))
            self.assertEqual(parameter.placements, (Replicate(),))

    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_train_replicate_fsdp(self):
        """
        Tests that replicate_model has the same behavior as original model when training
@ -253,7 +253,7 @@ class ReplicateTest(MultiProcessTestCase):
        self.assertEqual(replicate_loss, loss)
        check_sharded_parity(self, model, replicate_model)

    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_train_parity_2d_mlp(self):
        """
        Verifies when a device mesh is passed in, the model has the same behavior as the original model when training
@ -22,7 +22,6 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class TestMakeCheckpointer(TestCase):
    def setUp(self) -> None:
        super().setUp()
        # Create a temporary directory for checkpoints
        self.temp_dir = tempfile.mkdtemp()

@ -161,7 +161,6 @@ class TestCheckpointProcessConfig(TestCase):

class TestCheckpointProcess(TestCase):
    def setUp(self) -> None:
        super().setUp()
        """Set up common test fixtures."""
        self.rank_info = RankInfo(
            global_world_size=1,
@ -14,7 +14,6 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class TestCheckpointReader(TestCase):
    def setUp(self):
        super().setUp()
        # Create a temporary directory for test checkpoints
        self.temp_dir = tempfile.mkdtemp()

@ -52,7 +52,6 @@ class TestCheckpointWriterConfig(TestCase):

class TestCheckpointWriter(TestCase):
    def setUp(self):
        super().setUp()
        # Create a temporary directory for test checkpoints
        self.temp_dir = tempfile.mkdtemp()

@ -52,7 +52,6 @@ class TestCheckpointer(TestCase):
    """Parameterized tests that work with both sync and async checkpointers."""

    def setUp(self):
        super().setUp()
        # Create a temporary directory for checkpoints
        self.temp_dir = tempfile.mkdtemp()

@ -398,7 +397,6 @@ class TestAsyncCheckpointerSpecific(TestCase):
    """Tests specific to AsyncCheckpointer functionality."""

    def setUp(self):
        super().setUp()
        # Create a temporary directory for checkpoints
        self.temp_dir = tempfile.mkdtemp()

@ -12,7 +12,6 @@ from torch.testing._internal.common_utils import requires_cuda, run_tests, TestC

class TestDefaultStager(TestCase):
    def setUp(self) -> None:
        super().setUp()
        # Create a test state dictionary with various data types
        self.state_dict = {
            "model": torch.nn.Linear(10, 5).state_dict(),
@ -299,7 +299,7 @@ class TestDTensorReshardMeshChange(DTensorTestBase):

    @with_comms
    @with_temp_dir
    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_dtensor_checkpoint_with_uneven_shards(self) -> None:
        """
        Saving a dtensor with uneven shards.
@ -436,7 +436,6 @@ class TestCheckpointableReshard(DTensorTestBase):

    @with_comms
    @with_temp_dir
    @skip_if_lt_x_gpu(4)
    def test_uneven_reshard_with_checkpointable_api(self) -> None:
        """
        Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
@ -499,7 +498,6 @@ class TestCheckpointableReshard(DTensorTestBase):

    @with_comms
    @with_temp_dir
    @skip_if_lt_x_gpu(4)
    def test_uneven_reshard_with_dtensor_shards_wrapper_api(self) -> None:
        """
        Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
@ -208,7 +208,7 @@ class TestSingleRankSaveLoad(TestCase):

        # Create model.safetensors.index.json with weight mapping
        weight_map = {}
        for key in quantized_checkpoint:
        for key in quantized_checkpoint.keys():
            weight_map[key] = "model.safetensors"

        index_data = {
@ -245,7 +245,7 @@ class TestSingleRankSaveLoad(TestCase):
            sorted(original_tensors.keys()), sorted(state_dict_to_load.keys())
        )

        for tensor_name in original_tensors:
        for tensor_name in original_tensors.keys():
            original = original_tensors[tensor_name]
            loaded = state_dict_to_load[tensor_name]

@ -15,7 +15,6 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class TestQuantizedHfStorage(TestCase):
    def setUp(self):
        super().setUp()
        """Set up common test fixtures."""
        self.temp_dir = tempfile.TemporaryDirectory()
        self.path = self.temp_dir.name

@ -886,7 +886,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
        self.assertEqual(cpu_model_value, meta_model_value)

    @with_comms
    @skip_if_lt_x_gpu(4)
    @skip_if_lt_x_gpu(2)
    def test_setting_meta_device_model_broadcasting_and_memory(self) -> None:
        # This test verifies that we can set model state dict by a meta device model
        # With the correlated changes in state_dict, meta device model should be accepted
@ -21,7 +21,6 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class SignalHandlingTest(TestCase):
    def setUp(self):
        super().setUp()
        # Save original environment variable if it exists
        self.original_signals_env = os.environ.get(
            "TORCHELASTIC_SIGNALS_TO_HANDLE", None
@ -479,7 +479,6 @@ class TestFSDPMiscMultiProcess(FSDPTest):
        for (n, p), (n_prev, p_prev) in zip(
            fsdp_overlap.named_parameters(), fsdp_overlap_prev_params
        ):
            self.assertEqual(n, n_prev)
            self.assertNotEqual(
                p,
                p_prev,
@ -498,7 +498,7 @@ class TestFSDPMixedPrecision(FSDPTest):
        for name, tensor in state_dict.items():
            # Parameters and buffers are checkpointed in their
            # original dtypes, which may be different.
            if name in named_buffers:
            if name in named_buffers.keys():
                self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE)
            else:
                self.assertEqual(
@ -16,7 +16,6 @@ from torch.testing._internal.common_utils import run_tests, TestCase

class LauncherApiTest(TestCase):
    def setUp(self):
        super().setUp()
        # Save original environment variable if it exists
        self.original_signals_env = os.environ.get(
            "TORCHELASTIC_SIGNALS_TO_HANDLE", None
@ -21,7 +21,6 @@ from torch.distributed.pipelining import (
from torch.distributed.pipelining._utils import generate_stage_to_rank_mapping
from torch.distributed.pipelining.schedules import (
    _Action,
    _add_reduce_grad,
    _add_send_recv,
    _add_unshard_reshard,
    _format_pipeline_order,
@ -575,45 +574,6 @@ class TestScheduleLowering(TestCase):
                ),
            )

    @parametrize(
        "test_info",
        [
            {
                "compute": ["0F0", "0F1", " ", "0B0", "0B1"],
                "comms": ["0F0", "0F1", "0B0", "0B1", "0REDUCE_GRAD"],
            },
            {
                "compute": ["0F0", "0F1", "1F0", "1F1", "1B0", "1B1", "0B0", "0B1"],
                "comms": [
                    "0F0",
                    "0F1",
                    "1F0",
                    "1F1",
                    "1B0",
                    "1B1",
                    "1REDUCE_GRAD",
                    "0B0",
                    "0B1",
                    "0REDUCE_GRAD",
                ],
            },
        ],
    )
    def test_reduce_grad(self, test_info):
        compute_sch = self._parse_actions(test_info["compute"])
        expected_comms_sch = self._parse_actions(test_info["comms"])

        comms_sch = _add_reduce_grad(compute_sch, 2)
        for expected, actual in zip(expected_comms_sch, comms_sch, strict=True):
            self.assertEqual(
                expected,
                actual,
                (
                    f"Mismatch: expected action {expected} but found {actual}."
                    f"\nWhole Schedule: {comms_sch}"
                ),
            )

    @parametrize(
        "test_info",
        [
@ -39,7 +39,6 @@ from torch.nn.modules.loss import MSELoss
from torch.testing._internal.common_distributed import (
    MultiProcContinuousTest,
    requires_accelerator_dist_backend,
    skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
    check_leaked_tensors,
@ -47,7 +46,6 @@ from torch.testing._internal.common_utils import (
    parametrize,
    run_tests,
    skip_but_pass_in_sandcastle_if,
    TEST_MULTIACCELERATOR,
)


@ -58,6 +56,7 @@ batch_size = 64
torch.manual_seed(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)
TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2


@dataclass
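Context for the decorator removals that follow: the hunk above replaces the TEST_MULTIACCELERATOR constant imported from common_utils with a value computed at runtime, so the later hunks can drop both that import and the @skip_if_lt_x_gpu(4) decorators. A standalone sketch of the detection pattern, assuming a PyTorch build that provides the torch.accelerator namespace used in this hunk:

import torch
import torch.distributed as dist

# Use the active accelerator's device type if one is present, otherwise fall back to CPU.
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
# Resolve the default distributed backend (e.g. nccl or gloo) for that device type.
backend = dist.get_default_backend_for_device(device_type)
# Gate multi-device tests on how many accelerators are actually visible at runtime.
TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2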
@ -232,7 +231,6 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [_ScheduleForwardOnly])
    @skip_if_lt_x_gpu(4)
    def test_forward_only(self, ScheduleClass):
        mod, mod_ref, x, _, _ = setup_models_and_data(self.config)
        x_clone = x.clone()
@ -276,7 +274,6 @@ class ScheduleTest(MultiProcContinuousTest):
            ScheduleInterleavedZeroBubble,
        ],
    )
    @skip_if_lt_x_gpu(4)
    def test_eval_inference_mode(self, ScheduleClass):
        num_microbatches = 4
        if ScheduleClass in [
@ -354,7 +351,6 @@ class ScheduleTest(MultiProcContinuousTest):
            ScheduleInterleavedZeroBubble,
        ],
    )
    @skip_if_lt_x_gpu(4)
    def test_return_output(self, ScheduleClass):
        num_microbatches = 4
        if ScheduleClass in [
@ -410,7 +406,6 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
    @skip_if_lt_x_gpu(4)
    def test_multi_iter(self, ScheduleClass):
        mod, _, x, target, loss_fn = setup_models_and_data(self.config)
        chunks = 4
@ -434,7 +429,6 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
    @skip_if_lt_x_gpu(4)
    def test_kwargs_with_tracer(self, ScheduleClass):
        mod = ModelWithKwargs(d_hid, splits=self.world_size)
        mod.to(self.device)
@ -487,7 +481,6 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
    @skip_if_lt_x_gpu(4)
    def test_grad_with_tracer(self, ScheduleClass):
        mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config)

@ -530,7 +523,6 @@ class ScheduleTest(MultiProcContinuousTest):
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
    @parametrize("shape_inference", [True, False])
    @skip_if_lt_x_gpu(4)
    def test_grad_with_manual(self, ScheduleClass, shape_inference):
        mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config)

@ -594,7 +586,6 @@ class ScheduleTest(MultiProcContinuousTest):
            ScheduleInterleavedZeroBubble,
        ],
    )
    @skip_if_lt_x_gpu(4)
    def test_grad_with_manual_interleaved(self, ScheduleClass):
        stages_per_rank = 2
        n_stages = stages_per_rank * self.world_size
@ -659,7 +650,6 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble])
    @skip_if_lt_x_gpu(4)
    def test_schedule_with_weight_update_mlp_e2e(self, ScheduleClass):
        stages_per_rank = 2
        n_stages = stages_per_rank * self.world_size
@ -746,7 +736,6 @@ class ScheduleTest(MultiProcContinuousTest):
        "schedule_class",
        [ScheduleZBVZeroBubble, ScheduleDualPipeV],
    )
    @skip_if_lt_x_gpu(4)
    def test_v_shape_schedules(self, schedule_class):
        n_stages = 8
        rank_stages = {0: [0, 7], 1: [1, 6], 2: [2, 5], 3: [3, 4]}
@ -791,7 +780,6 @@ class ScheduleTest(MultiProcContinuousTest):
    @skip_but_pass_in_sandcastle_if(
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @skip_if_lt_x_gpu(4)
    def test_custom_function_callback(self):
        """Test the custom function callback functionality with _PipelineScheduleRuntime."""
        n_stages = 8
@ -991,7 +979,6 @@ class ScheduleTest(MultiProcContinuousTest):
        "ScheduleClass",
        [ScheduleInterleavedZeroBubble, ScheduleInterleaved1F1B],
    )
    @skip_if_lt_x_gpu(4)
    def test_zero_bubble_with_model_kwargs(self, ScheduleClass):
        stages_per_rank = 2
        n_stages = stages_per_rank * self.world_size
@ -1085,7 +1072,6 @@ class CustomSchedulesTest(MultiProcContinuousTest):
        "schedule_class",
        [ScheduleVShaped, ScheduleUnbalanced],
    )
    @skip_if_lt_x_gpu(4)
    def test_non_symmetric_stage_ids(self, schedule_class):
        n_stages = schedule_class.n_stages
        rank_stages = schedule_class.rank_stages
@ -1135,7 +1121,6 @@ class CustomSchedulesTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleWithReorderedB])
    @skip_if_lt_x_gpu(4)
    def test_pipeline_schedule_runtime_custom_sched(self, ScheduleClass):
        n_stages = 2
        stages_per_rank = 1
@ -1196,7 +1181,6 @@ class CustomSchedulesTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleWithW])
    @skip_if_lt_x_gpu(4)
    def test_schedule_with_native_zero_bubble(self, ScheduleClass):
        n_stages = ScheduleClass.n_stages
        num_microbatches = ScheduleClass.num_microbatches

@ -24,7 +24,6 @@ from torch.testing._internal.common_utils import (
    parametrize,
    run_tests,
    skip_but_pass_in_sandcastle_if,
    TEST_MULTIACCELERATOR,
)
from torch.utils._pytree import tree_map_only

@ -35,6 +34,7 @@ chunks = 8

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)
TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2

torch.manual_seed(0)

@ -50,24 +50,22 @@ class TestDTensorDebugMode(TestCase):
        x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
        y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False)

        with DebugMode(
            record_torchfunction=True, record_ids=True, record_output=True
        ) as debug_mode:
        with DebugMode(record_torchfunction=True) as debug_mode:
            torch.mm(x_dtensor, y_dtensor).sum()

        self.assertExpectedInline(
            debug_mode.debug_string(),
            """\
torch.mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0)) -> dt$6: f32[8, 32]| S(0)
aten::mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0))
torch.mm(dt: f32[8, 8]| S(0), dt: f32[8, 32]| S(0))
aten::mm(dt: f32[8, 8]| S(0), dt: f32[8, 32]| S(0))
redistribute_input(1, S(0) -> R)
redistribute_input(t$2: f32[1, 32], trace: S(0)->R)
_c10d_functional::all_gather_into_tensor(t$2: f32[1, 32], 8, 0) -> t$3: f32[8, 32]
_c10d_functional::wait_tensor(t$3: f32[8, 32]) -> t$3: f32[8, 32]
aten::mm(t$4: f32[1, 8], t$3: f32[8, 32]) -> t$5: f32[1, 32]
<method 'sum' of 'torch._C.TensorBase' objects>(dt$6: f32[8, 32]| S(0)) -> dt$8: f32[]| P
aten::sum(dt$6: f32[8, 32]| S(0))
aten::sum(t$5: f32[1, 32]) -> t$7: f32[]""",
redistribute_input(t: f32[1, 32], trace: S(0)->R)
_c10d_functional::all_gather_into_tensor(t: f32[1, 32], 8, 0)
_c10d_functional::wait_tensor(t: f32[8, 32])
aten::mm(t: f32[1, 8], t: f32[8, 32])
<method 'sum' of 'torch._C.TensorBase' objects>(dt: f32[8, 32]| S(0))
aten::sum(dt: f32[8, 32]| S(0))
aten::sum(t: f32[1, 32])""",
        )

        self.assertTrue(isinstance(debug_mode.operators[0], _OpCall))

@ -3,8 +3,7 @@
import itertools
import random
import unittest
from collections.abc import Callable
from typing import Any, ClassVar, Optional
from typing import Any, Callable, ClassVar, Optional

import torch
import torch.distributed as dist

@ -26,7 +26,6 @@ from torch.distributed.tensor.parallel import (
    RowwiseParallel,
    SequenceParallel,
)
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
    create_local_tensor_test_class,
@ -765,7 +764,6 @@ class DistMathOpsTest(DTensorTestBase):
        self.assertEqual(grad1_norm.device_mesh, mesh_y)

    @with_comms
    @skip_if_lt_x_gpu(4)
    def test_foreach_add_different_mesh(self):
        mesh_shape = (2, self.world_size // 2)
        mesh_2d = init_device_mesh(
@ -577,7 +577,7 @@ class DistTensorReplicateStrategyRegistrationTest(DTensorTestBase):
        self.assertEqual(
            comm_mode.get_comm_counts(),
            {
                torch.ops.c10d_functional.all_gather_into_tensor: self.world_size,
                torch.ops.c10d_functional.all_gather_into_tensor: 4,
            },
        )
        expected_cost = [
@ -1,18 +1,11 @@
# Owner(s): ["oncall: distributed"]

import itertools
from contextlib import nullcontext
from typing import Any

import torch
import torch.distributed as dist
from torch.distributed._local_tensor import (
    local_tensor_mode,
    LocalTensor,
    LocalTensorMode,
)
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DeviceMesh, distribute_tensor, DTensor
from torch.distributed.tensor import distribute_tensor, DTensor
from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
from torch.distributed.tensor._utils import (
    _compute_local_shape_and_global_offset,
@ -21,7 +14,6 @@ from torch.distributed.tensor._utils import (
    compute_global_tensor_shape,
    compute_local_shape_and_global_offset,
    compute_local_tensor_info,
    ExplicitRedistributionContext,
)
from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.placement_types import (
@ -859,93 +851,5 @@ class Test2DStridedLocalShard(DTensorTestBase):
        self.assertEqual(global_tensor, dtensor_2d.full_tensor())


class LocalTensorTestBase(TestCase):
    def assertEqual(self, lhs, rhs, **kwargs):
        mode = local_tensor_mode()
        with nullcontext() if mode is None else mode.disable():
            if isinstance(lhs, LocalTensor) and isinstance(rhs, LocalTensor):
                assert isinstance(lhs, LocalTensor) and isinstance(rhs, LocalTensor)
                super().assertEqual(lhs._ranks, rhs._ranks)
                for r in lhs._ranks:
                    super().assertEqual(
                        lhs._local_tensors[r],
                        rhs._local_tensors[r],
                        lambda m: f"rank {r}: {m}",
                    )
            elif isinstance(lhs, LocalTensor) or isinstance(rhs, LocalTensor):
                lhs, rhs = (lhs, rhs) if isinstance(lhs, LocalTensor) else (rhs, lhs)
                for r in lhs._ranks:
                    super().assertEqual(
                        lhs._local_tensors[r], rhs, lambda m: f"rank {r}: {m}"
                    )
            else:
                return super().assertEqual(lhs, rhs, **kwargs)

    @property
    def world_size(self):
        raise NotImplementedError("override world-size in your subclass")

    def build_device_mesh(self) -> DeviceMesh:
        return init_device_mesh("cpu", (self.world_size,))

    def setUp(self):
        super().setUp()
        torch.distributed.init_process_group(
            # TODO: test other ranks too
            "fake",
            rank=0,
            world_size=self.world_size,
        )

    def tearDown(self):
        super().tearDown()
        try:
            dist.destroy_process_group()
        except AssertionError:
            pass


class TestExplicitRedistribute(LocalTensorTestBase):
    @property
    def world_size(self):
        return 4

    def test_explicit_matmul(self):
        with LocalTensorMode(self.world_size):
            device_mesh = self.build_device_mesh()
            dim = 128
            x = torch.randn(8, dim, requires_grad=True)
            A = torch.randn(dim, dim, requires_grad=True)

            # Prepare DTensors
            dx = distribute_tensor(x, device_mesh, [Shard(0)])
            dA = distribute_tensor(A, device_mesh, [Shard(0)])

            # implicit redistribute works as usual by default
            with CommDebugMode() as comm_mode:
                torch.matmul(dx, dA)
            self.assertEqual(comm_mode.get_total_counts(), 1)

            # explicit redistribute works too
            with ExplicitRedistributionContext():
                with self.assertRaisesRegex(RuntimeError, "Implicit redistribution"):
                    torch.matmul(dx, dA)

            # explicit redistribute allows manual redistribute
            with ExplicitRedistributionContext():
                dA_repl = dA.redistribute(device_mesh, [Replicate()])
                torch.matmul(dx, dA_repl)

            dx = distribute_tensor(x, device_mesh, [Shard(0)])
            dA = distribute_tensor(A, device_mesh, [Replicate()])
            with ExplicitRedistributionContext():
                dY = torch.matmul(dx, dA_repl)
                loss = dY.sum()

            # we now see the error during backwards
            with self.assertRaisesRegex(RuntimeError, "Implicit redistribution"):
                loss.backward()


if __name__ == "__main__":
    run_tests()