mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-14 06:07:55 +08:00
Compare commits
129 Commits
cpp-docs-d
...
trunk/7206
| Author | SHA1 | Date | |
|---|---|---|---|
| 7206668f7c | |||
| 7729de07d3 | |||
| 73078f305f | |||
| ea7add4837 | |||
| 0ed4119420 | |||
| 03fd2b796e | |||
| fd7bf9ce10 | |||
| 41c9eeecec | |||
| bfc0ba4af9 | |||
| 3fdc5dbf1d | |||
| cc477f6009 | |||
| 7b055a0103 | |||
| da2eb31b82 | |||
| 2005b5f548 | |||
| b2d72a4008 | |||
| 80ec2ab78e | |||
| c724f0097d | |||
| a51208c656 | |||
| ed4aa449b6 | |||
| 9eebda944d | |||
| 09d8953fb4 | |||
| 8b2365094d | |||
| 7b423c2d21 | |||
| 46b3f913b3 | |||
| f7b7f40a6f | |||
| 91337ae3ff | |||
| eea951758f | |||
| 3feea296a5 | |||
| c3c3653418 | |||
| f72772b184 | |||
| 981dd71893 | |||
| d31599f40b | |||
| 85fab6c9b0 | |||
| c08ce30d18 | |||
| e1a1aeaf5b | |||
| 943227f57b | |||
| 3a2d75a086 | |||
| 69af74972b | |||
| 7432676187 | |||
| fd5edda1ed | |||
| 872d1daec2 | |||
| 6cd57e6fc2 | |||
| d29efba8fa | |||
| a344069f2a | |||
| af829c0dad | |||
| 3869aa115b | |||
| 47eb34b7ac | |||
| 08200280ce | |||
| ad7a57262c | |||
| 711a775878 | |||
| e9a688f02e | |||
| e69aaaf45a | |||
| fd8f368d31 | |||
| 13d2cc7bd2 | |||
| c6c913d18e | |||
| ef3f953966 | |||
| ea44f12bce | |||
| a74fe75c45 | |||
| 6d30666bc1 | |||
| 8e8cbb85ee | |||
| fbd70fb84e | |||
| 6c5db82584 | |||
| 6052a01b71 | |||
| 14b153bcf2 | |||
| 641de23c96 | |||
| 89165c0a2b | |||
| dcc2ba4ca4 | |||
| ad5c7c20e0 | |||
| c86540f120 | |||
| c17aa0f113 | |||
| 4ff068c33a | |||
| 0c7a4a6b48 | |||
| f93ee16fb6 | |||
| 9c2c3dbc15 | |||
| d4dcd0354c | |||
| aba2fa3259 | |||
| d2d13bf62d | |||
| 7a6ff88196 | |||
| 59563dfe56 | |||
| 5c639466f7 | |||
| 0b4dd08e04 | |||
| edd8d356b6 | |||
| 658c5f879c | |||
| 59a6c83dfe | |||
| 431dfe8692 | |||
| c00696144d | |||
| 9ffc480c5a | |||
| 14956eaef4 | |||
| 066c5c57a9 | |||
| 08ef852a4b | |||
| 56fc99915b | |||
| 5863ba1b2e | |||
| a743f9eeb5 | |||
| 53b03f1a2b | |||
| cd5d810c3a | |||
| 01e6e35c7f | |||
| bcd159bcdd | |||
| 64ae31c5d3 | |||
| 45da6e1fe1 | |||
| 39160dba0c | |||
| f2fbc81c50 | |||
| 4271ffe918 | |||
| 7eefcfb1db | |||
| 4b12c0344d | |||
| 661b639663 | |||
| 0cd809f60c | |||
| a96728d188 | |||
| c1e91bd4c3 | |||
| d7e2d0ad30 | |||
| 81038fd326 | |||
| e020fb3431 | |||
| e8052f2f99 | |||
| a64c7d7404 | |||
| cdca63db8c | |||
| ed45c5f38d | |||
| 7f0e932136 | |||
| 2673f8b007 | |||
| 4e1bd16738 | |||
| 871d0cd196 | |||
| 2bba37309b | |||
| b4e4ee81d3 | |||
| 3283eaa5ba | |||
| 397d9fe2ae | |||
| d77c24caac | |||
| cef98ae5cb | |||
| 52ea135f77 | |||
| a5f3035aaf | |||
| 1d3f5e19da | |||
| 496277a8ff |
@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8
|
||||
ENV LANG en_US.UTF-8
|
||||
ENV LANGUAGE en_US.UTF-8
|
||||
|
||||
ARG DEVTOOLSET_VERSION=11
|
||||
ARG DEVTOOLSET_VERSION=13
|
||||
|
||||
RUN yum -y update
|
||||
RUN yum -y install epel-release
|
||||
# install glibc-langpack-en make sure en_US.UTF-8 locale is available
|
||||
RUN yum -y install glibc-langpack-en
|
||||
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
|
||||
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
|
||||
# Just add everything as a safe.directory for git since these will be used in multiple places with git
|
||||
RUN git config --global --add safe.directory '*'
|
||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||
@ -41,6 +41,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh
|
||||
# Install CUDA
|
||||
FROM base as cuda
|
||||
ARG CUDA_VERSION=12.6
|
||||
ARG DEVTOOLSET_VERSION=13
|
||||
RUN rm -rf /usr/local/cuda-*
|
||||
ADD ./common/install_cuda.sh install_cuda.sh
|
||||
COPY ./common/install_nccl.sh install_nccl.sh
|
||||
@ -50,7 +51,8 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
|
||||
# Preserve CUDA_VERSION for the builds
|
||||
ENV CUDA_VERSION=${CUDA_VERSION}
|
||||
# Make things in our path by default
|
||||
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
|
||||
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||
|
||||
|
||||
FROM cuda as cuda12.6
|
||||
RUN bash ./install_cuda.sh 12.6
|
||||
@ -68,8 +70,22 @@ FROM cuda as cuda13.0
|
||||
RUN bash ./install_cuda.sh 13.0
|
||||
ENV DESIRED_CUDA=13.0
|
||||
|
||||
FROM ${ROCM_IMAGE} as rocm
|
||||
FROM ${ROCM_IMAGE} as rocm_base
|
||||
ARG DEVTOOLSET_VERSION=13
|
||||
ENV LC_ALL en_US.UTF-8
|
||||
ENV LANG en_US.UTF-8
|
||||
ENV LANGUAGE en_US.UTF-8
|
||||
# Install devtoolset on ROCm base image
|
||||
RUN yum -y update && \
|
||||
yum -y install epel-release && \
|
||||
yum -y install glibc-langpack-en && \
|
||||
yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
|
||||
RUN git config --global --add safe.directory '*'
|
||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||
|
||||
FROM rocm_base as rocm
|
||||
ARG PYTORCH_ROCM_ARCH
|
||||
ARG DEVTOOLSET_VERSION=13
|
||||
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
||||
ADD ./common/install_mkl.sh install_mkl.sh
|
||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
||||
@ -88,6 +104,7 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
|
||||
|
||||
# Final step
|
||||
FROM ${BASE_TARGET} as final
|
||||
ARG DEVTOOLSET_VERSION=13
|
||||
COPY --from=openssl /opt/openssl /opt/openssl
|
||||
COPY --from=patchelf /patchelf /usr/local/bin/patchelf
|
||||
COPY --from=conda /opt/conda /opt/conda
|
||||
|
||||
@ -63,7 +63,7 @@ docker build \
|
||||
--target final \
|
||||
--progress plain \
|
||||
--build-arg "BASE_TARGET=${BASE_TARGET}" \
|
||||
--build-arg "DEVTOOLSET_VERSION=11" \
|
||||
--build-arg "DEVTOOLSET_VERSION=13" \
|
||||
${EXTRA_BUILD_ARGS} \
|
||||
-t ${tmp_tag} \
|
||||
$@ \
|
||||
|
||||
@ -261,9 +261,9 @@ case "$tag" in
|
||||
PYTHON_VERSION=3.10
|
||||
CUDA_VERSION=12.8.1
|
||||
;;
|
||||
pytorch-linux-jammy-aarch64-py3.10-gcc11)
|
||||
pytorch-linux-jammy-aarch64-py3.10-gcc13)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
GCC_VERSION=13
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
OPENBLAS=yes
|
||||
@ -271,9 +271,19 @@ case "$tag" in
|
||||
# from pytorch/llvm:9.0.1 is x86 specific
|
||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||
;;
|
||||
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
|
||||
pytorch-linux-jammy-aarch64-py3.10-clang21)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
CLANG_VERSION=21
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
OPENBLAS=yes
|
||||
# snadampal: skipping llvm src build install because the current version
|
||||
# from pytorch/llvm:9.0.1 is x86 specific
|
||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||
;;
|
||||
pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=13
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
OPENBLAS=yes
|
||||
|
||||
@ -1 +1 @@
|
||||
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
|
||||
bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7
|
||||
|
||||
@ -8,8 +8,8 @@ if [ -n "$CLANG_VERSION" ]; then
|
||||
# work around ubuntu apt-get conflicts
|
||||
sudo apt-get -y -f install
|
||||
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
|
||||
if [[ $CLANG_VERSION == 18 ]]; then
|
||||
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
|
||||
if [[ $CLANG_VERSION -ge 18 ]]; then
|
||||
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VERSION} main"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then
|
||||
# Need the official toolchain repo to get alternate packages
|
||||
add-apt-repository ppa:ubuntu-toolchain-r/test
|
||||
apt-get update
|
||||
apt-get install -y g++-$GCC_VERSION
|
||||
apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
|
||||
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
|
||||
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
|
||||
|
||||
update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50
|
||||
|
||||
# Cleanup package manager
|
||||
apt-get autoclean && apt-get clean
|
||||
|
||||
56
.ci/docker/common/install_libgomp.sh
Normal file
56
.ci/docker/common/install_libgomp.sh
Normal file
@ -0,0 +1,56 @@
|
||||
#!/bin/bash
|
||||
# Script used only in CD pipeline
|
||||
|
||||
set -ex
|
||||
|
||||
# install dependencies
|
||||
dnf -y install gmp-devel libmpc-devel texinfo flex bison
|
||||
|
||||
cd /usr/local/src
|
||||
# fetch source for gcc 13
|
||||
git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0
|
||||
|
||||
mkdir -p gcc-13.3.0/build-gomp
|
||||
cd gcc-13.3.0/build-gomp
|
||||
|
||||
# configure gcc build
|
||||
# I got these flags by:
|
||||
# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
|
||||
# dnf install -y dnf-plugins-core rpmdevtools
|
||||
# dnf download --source libgomp
|
||||
# 2. extracting the gcc.spec from the source.
|
||||
# rpmdev-extract gcc-xx.src.rpm
|
||||
# 3. extracting optflags and ld_flags from gcc.spec:
|
||||
# rpm --eval '%{optflags}'
|
||||
# rpm --eval '%{build_ldflags}'
|
||||
#
|
||||
# I had to remove the following flags because they didn't compile for this version of libgomp:
|
||||
# -Werror=format-security
|
||||
# -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
|
||||
# -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
|
||||
#
|
||||
# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.
|
||||
|
||||
OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
|
||||
' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
|
||||
' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
|
||||
' -fstack-protector-strong -fasynchronous-unwind-tables'\
|
||||
' -fstack-clash-protection'
|
||||
|
||||
LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'
|
||||
|
||||
CFLAGS="$OPT_FLAGS" \
|
||||
CXXFLAGS="$OPT_FLAGS" \
|
||||
LDFLAGS="$LDFLAGS" \
|
||||
../configure \
|
||||
--prefix=/usr \
|
||||
--libdir=/usr/lib64 \
|
||||
--enable-languages=c,c++ \
|
||||
--disable-multilib \
|
||||
--disable-bootstrap \
|
||||
--enable-libgomp
|
||||
|
||||
# only build libgomp
|
||||
make -j$(nproc) all-target-libgomp
|
||||
|
||||
make install-target-libgomp
|
||||
@ -10,6 +10,7 @@ git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" -
|
||||
|
||||
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
|
||||
OPENBLAS_BUILD_FLAGS="
|
||||
CC=gcc
|
||||
NUM_THREADS=128
|
||||
USE_OPENMP=1
|
||||
NO_SHARED=0
|
||||
|
||||
@ -50,6 +50,10 @@ RUN rm install_ninja.sh
|
||||
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
||||
|
||||
# Build a newer version of libgomp than that supported in in Almalinux 8.
|
||||
COPY ./common/install_libgomp.sh install_libgomp.sh
|
||||
RUN bash ./install_libgomp.sh && rm install_libgomp.sh
|
||||
|
||||
# git236+ would refuse to run git commands in repos owned by other users
|
||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
||||
# Override this behaviour by treating every folder as safe
|
||||
|
||||
@ -1,15 +1,11 @@
|
||||
sphinx==5.3.0
|
||||
sphinx==7.2.6
|
||||
#Description: This is used to generate PyTorch docs
|
||||
#Pinned versions: 5.3.0
|
||||
#Pinned versions: 7.2.6
|
||||
|
||||
standard-imghdr==3.13.0; python_version >= "3.13"
|
||||
#Description: This is needed by Sphinx, so it needs to be added here.
|
||||
# The reasons are as follows:
|
||||
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
|
||||
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
|
||||
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
|
||||
pytorch_sphinx_theme2==0.2.0
|
||||
#Description: This is needed to generate PyTorch docs
|
||||
#Pinned versions: 0.2.0
|
||||
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
|
||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
|
||||
# something related to Docker setup. We can investigate this later.
|
||||
@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
|
||||
#Description: This is used to generate PyTorch docs
|
||||
#Pinned versions: 2.13.0
|
||||
|
||||
breathe==4.34.0
|
||||
breathe==4.36.0
|
||||
#Description: This is used to generate PyTorch C++ docs
|
||||
#Pinned versions: 4.34.0
|
||||
#Pinned versions: 4.36.0
|
||||
|
||||
exhale==0.2.3
|
||||
exhale==0.3.7
|
||||
#Description: This is used to generate PyTorch C++ docs
|
||||
#Pinned versions: 0.2.3
|
||||
#Pinned versions: 0.3.7
|
||||
|
||||
docutils==0.16
|
||||
docutils==0.20
|
||||
#Description: This is used to generate PyTorch C++ docs
|
||||
#Pinned versions: 0.16
|
||||
#Pinned versions: 0.20
|
||||
|
||||
bs4==0.0.1
|
||||
#Description: This is used to generate PyTorch C++ docs
|
||||
@ -56,13 +52,13 @@ IPython==8.12.0
|
||||
#Description: This is used to generate PyTorch functorch docs
|
||||
#Pinned versions: 8.12.0
|
||||
|
||||
myst-nb==0.17.2
|
||||
myst-nb==1.3.0
|
||||
#Description: This is used to generate PyTorch functorch and torch.compile docs.
|
||||
#Pinned versions: 0.17.2
|
||||
#Pinned versions: 1.3.0
|
||||
|
||||
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
|
||||
python-etcd==0.4.5
|
||||
sphinx-copybutton==0.5.0
|
||||
sphinx-design==0.4.0
|
||||
sphinx-design==0.6.1
|
||||
sphinxcontrib-mermaid==1.0.0
|
||||
myst-parser==0.18.1
|
||||
myst-parser==4.0.1
|
||||
|
||||
@ -1 +1 @@
|
||||
3.5.0
|
||||
3.5.1
|
||||
|
||||
@ -6,8 +6,8 @@ set -eou pipefail
|
||||
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
|
||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
|
||||
# post merge of https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
|
||||
# https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||
|
||||
# Folders for the build
|
||||
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
|
||||
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
|
||||
|
||||
# Fetch magma sources and verify checksum
|
||||
pushd ${PACKAGE_DIR}
|
||||
git clone https://github.com/icl-utk-edu/magma
|
||||
git clone https://github.com/jeffdaily/magma
|
||||
pushd magma
|
||||
git checkout ${MAGMA_VERSION}
|
||||
popd
|
||||
|
||||
@ -89,23 +89,41 @@ if [ "$is_main_doc" = true ]; then
|
||||
|
||||
make coverage
|
||||
# Now we have the coverage report, we need to make sure it is empty.
|
||||
# Count the number of lines in the file and turn that number into a variable
|
||||
# $lines. The `cut -f1 ...` is to only parse the number, not the filename
|
||||
# Skip the report header by subtracting 2: the header will be output even if
|
||||
# there are no undocumented items.
|
||||
# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
|
||||
# showing the undocumented count in the third column.
|
||||
# Example: | TOTAL | 99.83% | 2 |
|
||||
#
|
||||
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
|
||||
# be documented then removed from there.
|
||||
lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
|
||||
undocumented=$((lines - 2))
|
||||
if [ $undocumented -lt 0 ]; then
|
||||
|
||||
# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
|
||||
# The table format is: | Module | Coverage | Undocumented |
|
||||
# Extract the third column (undocumented count) from the TOTAL row
|
||||
undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
|
||||
|
||||
if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
|
||||
echo coverage output not found
|
||||
exit 1
|
||||
elif [ $undocumented -gt 0 ]; then
|
||||
echo undocumented objects found:
|
||||
cat build/coverage/python.txt
|
||||
elif [ "$undocumented" -gt 0 ]; then
|
||||
set +x # Disable command echoing for cleaner output
|
||||
echo ""
|
||||
echo "====================="
|
||||
echo "UNDOCUMENTED OBJECTS:"
|
||||
echo "====================="
|
||||
echo ""
|
||||
# Find the line number of the TOTAL row and print only what comes after it
|
||||
total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
|
||||
if [ -n "$total_line" ]; then
|
||||
# Print only the detailed list (skip the statistics table)
|
||||
tail -n +$((total_line + 2)) build/coverage/python.txt
|
||||
else
|
||||
# Fallback to showing entire file if TOTAL line not found
|
||||
cat build/coverage/python.txt
|
||||
fi
|
||||
echo ""
|
||||
echo "Make sure you've updated relevant .rsts in docs/source!"
|
||||
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
|
||||
echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
|
||||
set -x # Re-enable command echoing
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
|
||||
@ -337,7 +337,7 @@ test_python() {
|
||||
|
||||
test_python_smoke() {
|
||||
# Smoke tests for H100/B200
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
name: 🚀 Release highlight for proposed Feature
|
||||
name: 🚀 New Feature for Release
|
||||
description: Submit a Release highlight for proposed Feature
|
||||
labels: ["release-feature-request"]
|
||||
|
||||
body:
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Release highlight for proposed Feature
|
||||
label: New Feature for Release
|
||||
description: >
|
||||
Example: “A torch.special module, analogous to SciPy's special module.”
|
||||
- type: input
|
||||
|
||||
12
.github/actions/pytest-cache-download/action.yml
vendored
12
.github/actions/pytest-cache-download/action.yml
vendored
@ -38,9 +38,9 @@ runs:
|
||||
run: |
|
||||
python3 .github/scripts/pytest_cache.py \
|
||||
--download \
|
||||
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
|
||||
--pr_identifier $GITHUB_REF \
|
||||
--job_identifier $JOB_IDENTIFIER \
|
||||
--temp_dir $RUNNER_TEMP \
|
||||
--repo $REPO \
|
||||
--bucket $BUCKET \
|
||||
--cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \
|
||||
--pr_identifier "$GITHUB_REF" \
|
||||
--job_identifier "$JOB_IDENTIFIER" \
|
||||
--temp_dir "$RUNNER_TEMP" \
|
||||
--repo "$REPO" \
|
||||
--bucket "$BUCKET" \
|
||||
|
||||
16
.github/actions/pytest-cache-upload/action.yml
vendored
16
.github/actions/pytest-cache-upload/action.yml
vendored
@ -47,11 +47,11 @@ runs:
|
||||
run: |
|
||||
python3 .github/scripts/pytest_cache.py \
|
||||
--upload \
|
||||
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
|
||||
--pr_identifier $GITHUB_REF \
|
||||
--job_identifier $JOB_IDENTIFIER \
|
||||
--sha $SHA \
|
||||
--test_config $TEST_CONFIG \
|
||||
--shard $SHARD \
|
||||
--repo $REPO \
|
||||
--temp_dir $RUNNER_TEMP \
|
||||
--cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \
|
||||
--pr_identifier "$GITHUB_REF" \
|
||||
--job_identifier "$JOB_IDENTIFIER" \
|
||||
--sha "$SHA" \
|
||||
--test_config "$TEST_CONFIG" \
|
||||
--shard "$SHARD" \
|
||||
--repo "$REPO" \
|
||||
--temp_dir "$RUNNER_TEMP" \
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
|
||||
ad5816f0eee1c873df1b7d371c69f1f811a89387
|
||||
|
||||
125
.github/copilot-instructions.md
vendored
Normal file
125
.github/copilot-instructions.md
vendored
Normal file
@ -0,0 +1,125 @@
|
||||
# PyTorch Copilot Instructions
|
||||
|
||||
This is the PyTorch machine learning framework codebase. These instructions help AI agents navigate and contribute effectively.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
### Core Components
|
||||
|
||||
- **c10/** - Core library (C++-10 compatible) for essential, binary-size-conscious functionality
|
||||
- **aten/** - ATen tensor library (C++), PyTorch's foundation without autograd
|
||||
- `aten/src/ATen/native/` - Modern operator implementations (CPU/CUDA/MPS/sparse)
|
||||
- `aten/src/ATen/native/native_functions.yaml` - **Critical**: Declarative operator registry
|
||||
- **torch/** - Python bindings and public API
|
||||
- `torch/csrc/` - C++ Python bindings (hand-written and generated)
|
||||
- `torch/csrc/autograd/` - Reverse-mode automatic differentiation
|
||||
- `torch/csrc/jit/` - TorchScript JIT compiler
|
||||
- **torchgen/** - Code generation tooling that reads `native_functions.yaml`
|
||||
- **tools/** - Build scripts, autograd derivatives, code generation
|
||||
|
||||
### The Code Generation Workflow
|
||||
|
||||
**Most operator changes require editing `native_functions.yaml`**, not direct C++ files. This YAML file:
|
||||
1. Declares operator signatures, variants (function/method), and dispatch behavior
|
||||
2. Gets processed by `torchgen/` to generate C++/Python bindings
|
||||
3. Produces headers in `build/aten/src/ATen/` during compilation
|
||||
|
||||
Example entry structure:
|
||||
```yaml
|
||||
- func: my_op(Tensor self, Scalar alpha=1) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU: my_op_cpu
|
||||
CUDA: my_op_cuda
|
||||
```
|
||||
|
||||
After editing `native_functions.yaml`, implement kernels in `aten/src/ATen/native/` (see `aten/src/ATen/native/README.md`).
|
||||
|
||||
## Development Workflows
|
||||
|
||||
### Building from Source
|
||||
|
||||
**Never run `setup.py` directly** - use pip with editable install:
|
||||
```bash
|
||||
python -m pip install --no-build-isolation -v -e .
|
||||
```
|
||||
|
||||
Speed up builds:
|
||||
- `DEBUG=1` - Debug symbols with `-g -O0`
|
||||
- `USE_CUDA=0` - Skip CUDA compilation
|
||||
- `BUILD_TEST=0` - Skip C++ test binaries
|
||||
- Install `ninja` (`pip install ninja`) for faster builds
|
||||
- Use `ccache` for incremental compilation caching
|
||||
|
||||
Rebuild specific targets: `(cd build && ninja <target>)`
|
||||
|
||||
### Testing
|
||||
|
||||
**Critical**: DO NOT run entire test suites. Run specific tests only:
|
||||
```bash
|
||||
python test/test_torch.py TestTorch.test_specific_case
|
||||
```
|
||||
|
||||
**Test structure**: All tests use `torch.testing._internal.common_utils`:
|
||||
```python
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class TestFeature(TestCase):
|
||||
def test_something(self):
|
||||
# Use self.assertEqual for tensor comparisons
|
||||
pass
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
```
|
||||
|
||||
**For bug fixes**: Create a standalone reproduction script first, verify it fails, then fix and add to appropriate test file.
|
||||
|
||||
### Linting
|
||||
|
||||
Run linter (not pre-commit): `lintrunner -a` (auto-applies fixes)
|
||||
|
||||
## Project-Specific Conventions
|
||||
|
||||
### Memory and Storage
|
||||
- **Storage is never nullptr** (but `StorageImpl.data` may be nullptr for unallocated outputs)
|
||||
- CUDA device info lives in storage objects
|
||||
|
||||
### Python-C++ Integration (`torch/csrc/`)
|
||||
- Always include `Python.h` **first** to avoid `_XOPEN_SOURCE` redefinition errors
|
||||
- Use `pybind11::gil_scoped_acquire` before calling Python API or using `THPObjectPtr`
|
||||
- Wrap entry points with `HANDLE_TH_ERRORS` / `END_HANDLE_TH_ERRORS` for exception conversion
|
||||
|
||||
### Dispatch System
|
||||
- PyTorch uses operator dispatch to route calls to backend-specific kernels
|
||||
- Prefer `CompositeExplicitAutograd` dispatch when writing device-agnostic compound ops
|
||||
- See `aten/src/ATen/native/README.md` for dispatch keyword guidance
|
||||
|
||||
## Git Workflow (AI Agent Specific)
|
||||
|
||||
When preparing PRs from this environment:
|
||||
```bash
|
||||
git stash -u
|
||||
git reset --hard $(cat /tmp/orig_work.txt) # Reset to LOCAL branch
|
||||
git stash pop
|
||||
# Resolve conflicts if necessary
|
||||
```
|
||||
|
||||
## Common Gotchas
|
||||
|
||||
1. **Editing generated files** - If it's in `build/`, don't edit it. Edit the source template or `native_functions.yaml`
|
||||
2. **NVCC template compilation** - NVCC is stricter about C++ than gcc/clang; code working on Linux may fail Windows CI
|
||||
3. **Windows symbol visibility** - Use `TORCH_API` macros for exported symbols (required on Windows, optional on Linux)
|
||||
4. **No internet access** - DO NOT attempt to install dependencies during development
|
||||
|
||||
## Key Files Reference
|
||||
|
||||
- `AGENTS.md` - Instructions specific to AI coding agents
|
||||
- `CONTRIBUTING.md` - Comprehensive human contributor guide
|
||||
- `GLOSSARY.md` - Terminology (ATen, kernels, operations, JIT, TorchScript)
|
||||
- `aten/src/ATen/native/README.md` - Operator implementation guide
|
||||
- `tools/autograd/derivatives.yaml` - Gradient definitions for autograd
|
||||
|
||||
## Performance Debugging
|
||||
|
||||
Use `TORCH_SHOW_CPP_STACKTRACES=1` for C++ traces in Python errors. For profiling, prefer `py-spy` over manual instrumentation.
|
||||
4
.github/workflows/_rocm-test.yml
vendored
4
.github/workflows/_rocm-test.yml
vendored
@ -97,8 +97,8 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
|
||||
if [[ $ngpu -lt 4 ]]; then
|
||||
echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs"
|
||||
if [[ $ngpu -lt 2 ]]; then #We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with less gpus.
|
||||
echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
16
.github/workflows/_xpu-test.yml
vendored
16
.github/workflows/_xpu-test.yml
vendored
@ -344,5 +344,21 @@ jobs:
|
||||
if-no-files-found: ignore
|
||||
path: ./**/core.[1-9]*
|
||||
|
||||
- name: Authenticate with AWS
|
||||
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
|
||||
# The max duration enforced by the server side
|
||||
role-duration-seconds: 18000
|
||||
aws-region: us-east-1
|
||||
|
||||
- name: Upload the benchmark results
|
||||
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
|
||||
with:
|
||||
benchmark-results-dir: test/test-reports
|
||||
dry-run: false
|
||||
schema-version: v3
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Teardown XPU
|
||||
uses: ./.github/actions/teardown-xpu
|
||||
|
||||
6
.github/workflows/docker-builds.yml
vendored
6
.github/workflows/docker-builds.yml
vendored
@ -77,9 +77,11 @@ jobs:
|
||||
pytorch-linux-noble-riscv64-py3.12-gcc14
|
||||
]
|
||||
include:
|
||||
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
|
||||
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
|
||||
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
timeout-minutes: 600
|
||||
# Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358
|
||||
|
||||
@ -72,7 +72,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
build-environment: linux-jammy-aarch64-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" },
|
||||
|
||||
2
.github/workflows/inductor-rocm.yml
vendored
2
.github/workflows/inductor-rocm.yml
vendored
@ -2,7 +2,7 @@ name: inductor-rocm
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 0 * * * *
|
||||
- cron: 0 */3 * * *
|
||||
push:
|
||||
branches:
|
||||
- release/*
|
||||
|
||||
2
.github/workflows/linux-aarch64.yml
vendored
2
.github/workflows/linux-aarch64.yml
vendored
@ -33,7 +33,7 @@ jobs:
|
||||
with:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-aarch64-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
|
||||
2
.github/workflows/operator_benchmark.yml
vendored
2
.github/workflows/operator_benchmark.yml
vendored
@ -60,7 +60,7 @@ jobs:
|
||||
with:
|
||||
build-environment: linux-jammy-aarch64-py3.10
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
|
||||
|
||||
3
.github/workflows/rocm.yml
vendored
3
.github/workflows/rocm.yml
vendored
@ -9,7 +9,8 @@ on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: 29 8 * * * # about 1:29am PDT
|
||||
- cron: 0 * * * *
|
||||
- cron: 0 */3 * * *
|
||||
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -127,7 +127,6 @@ torch/test/
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
|
||||
torch/version.py
|
||||
torch/_inductor/kernel/vendored_templates/*
|
||||
minifier_launcher.py
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
|
||||
|
||||
@ -234,7 +234,17 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
|
||||
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
|
||||
option(USE_LSAN "Use Leak Sanitizer" OFF)
|
||||
option(USE_TSAN "Use Thread Sanitizer" OFF)
|
||||
|
||||
# Track whether USE_CUDA was explicitly set by the user (before option() is called)
|
||||
# If USE_CUDA is already defined in cache, it means user explicitly set it
|
||||
if(DEFINED CACHE{USE_CUDA})
|
||||
set(_USE_CUDA_EXPLICITLY_SET TRUE)
|
||||
else()
|
||||
set(_USE_CUDA_EXPLICITLY_SET FALSE)
|
||||
endif()
|
||||
|
||||
option(USE_CUDA "Use CUDA" ON)
|
||||
|
||||
option(USE_XPU "Use XPU" ON)
|
||||
cmake_dependent_option(
|
||||
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
|
||||
|
||||
@ -18,7 +18,7 @@ aspects of contributing to PyTorch.
|
||||
- [Python Unit Testing](#python-unit-testing)
|
||||
- [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest)
|
||||
- [Local linting](#local-linting)
|
||||
- [Running `mypy`](#running-mypy)
|
||||
- [Running `pyrefly`](#running-pyrefly)
|
||||
- [C++ Unit Testing](#c-unit-testing)
|
||||
- [Run Specific CI Jobs](#run-specific-ci-jobs)
|
||||
- [Merging your Change](#merging-your-change)
|
||||
@ -281,7 +281,7 @@ dependencies as well as the nightly binaries into the repo directory.
|
||||
**Prerequisites**:
|
||||
The following packages should be installed with `pip`:
|
||||
- `expecttest` and `hypothesis` - required to run tests
|
||||
- `mypy` - recommended for linting
|
||||
- `pyrefly` - recommended for type checking. [Pyrefly](https://pyrefly.org/)
|
||||
- `pytest` - recommended to run tests more selectively
|
||||
Running
|
||||
```
|
||||
@ -350,15 +350,32 @@ make lint
|
||||
|
||||
Learn more about the linter on the [lintrunner wiki page](https://github.com/pytorch/pytorch/wiki/lintrunner)
|
||||
|
||||
#### Running `mypy`
|
||||
#### Running `pyrefly`
|
||||
|
||||
`mypy` is an optional static type checker for Python. We have multiple `mypy`
|
||||
configs for the PyTorch codebase that are automatically validated against whenever the linter is run.
|
||||
[Pyrefly](https://pyrefly.org/) is a high-performance static type checker for Python. It provides fast type checking along with IDE features like autocomplete and instant error feedback.
|
||||
|
||||
PyTorch uses Pyrefly for type checking across the codebase. The configuration is managed in `pyrefly.toml` at the root of the repository.
|
||||
|
||||
**Getting Started with Pyrefly:**
|
||||
|
||||
To run type checking on the PyTorch codebase:
|
||||
```bash
|
||||
pyrefly check
|
||||
```
|
||||
|
||||
For more detailed error information with summaries:
|
||||
```bash
|
||||
pyrefly check --summarize-errors
|
||||
```
|
||||
|
||||
**Learn More:**
|
||||
- [Pyrefly Configuration](https://pyrefly.org/en/docs/configuration/) - Detailed configuration options
|
||||
- [Pyrefly IDE Features](https://pyrefly.org/en/docs/IDE-features/) - Set up Pyrefly in your editor for real-time type checking
|
||||
- [Python Typing Tutorial](https://pyrefly.org/en/docs/typing-for-python-developers/) - Learn about Python type annotations
|
||||
|
||||
See [Guide for adding type annotations to
|
||||
PyTorch](https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch)
|
||||
for more information on how to set up `mypy` and tackle type annotation
|
||||
tasks.
|
||||
for PyTorch-specific guidance on how to set up `pyrefly` and tackle type annotation tasks in this codebase.
|
||||
|
||||
### C++ Unit Testing
|
||||
|
||||
|
||||
@ -191,7 +191,7 @@ class Vectorized<BFloat16> {
|
||||
auto vals = svreinterpret_u16_bf16(values);
|
||||
vals = sveor_u16_x(ptrue, vals, mask);
|
||||
return svreinterpret_bf16_u16(vals);
|
||||
};
|
||||
}
|
||||
Vectorized<BFloat16> round() const;
|
||||
Vectorized<BFloat16> tan() const;
|
||||
Vectorized<BFloat16> tanh() const;
|
||||
@ -349,47 +349,47 @@ Vectorized<BFloat16> inline Vectorized<BFloat16>::frac() const {
|
||||
return convert_float_bfloat16(v1, v2); \
|
||||
}
|
||||
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(isnan);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(angle);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(acos);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(acosh);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(asin);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(atan);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(atanh);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(erf);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(erfc);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(exp);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(exp2);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(expm1);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(i0);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(i0e);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(digamma);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log2);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log10);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log1p);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(sin);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(sinh);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(cos);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(cosh);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(ceil);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(floor);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(round);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(tan);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(tanh);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(trunc);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(lgamma);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(sqrt);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow);
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(isnan)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(angle)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(acos)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(acosh)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(asin)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(atan)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(atanh)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(erf)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(erfc)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(exp)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(exp2)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(expm1)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(i0)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(i0e)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(digamma)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log2)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log10)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(log1p)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(sin)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(sinh)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(cos)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(cosh)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(ceil)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(floor)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(round)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(tan)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(tanh)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(trunc)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(lgamma)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(sqrt)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt)
|
||||
DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow)
|
||||
|
||||
Vectorized<BFloat16> inline Vectorized<BFloat16>::operator==(
|
||||
const Vectorized<BFloat16>& other) const {
|
||||
|
||||
@ -388,6 +388,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
#ifndef USE_ROCM
|
||||
at::Half halpha;
|
||||
at::Half hbeta;
|
||||
uint32_t mask = -1;
|
||||
#endif
|
||||
void * alpha_ptr = α
|
||||
void * beta_ptr = β
|
||||
@ -427,7 +428,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
@ -444,7 +445,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
@ -511,17 +512,41 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
|
||||
cublasLtMatmulHeuristicResult_t heuristicResult = {};
|
||||
int returnedResult = 0;
|
||||
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
|
||||
ltHandle,
|
||||
computeDesc.descriptor(),
|
||||
Adesc.descriptor(),
|
||||
Bdesc.descriptor(),
|
||||
Cdesc.descriptor(),
|
||||
Cdesc.descriptor(),
|
||||
preference.descriptor(),
|
||||
1,
|
||||
&heuristicResult,
|
||||
&returnedResult));
|
||||
// on Blackwell+, we fake a n > 1 matmul when querying heuristics
|
||||
// to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance
|
||||
#ifndef USE_ROCM
|
||||
const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10;
|
||||
#else
|
||||
const bool lie_to_cublaslt = false;
|
||||
#endif
|
||||
if (lie_to_cublaslt) {
|
||||
CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T);
|
||||
CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc);
|
||||
|
||||
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
|
||||
ltHandle,
|
||||
computeDesc.descriptor(),
|
||||
Adesc.descriptor(),
|
||||
FakeBdesc.descriptor(),
|
||||
FakeCdesc.descriptor(),
|
||||
FakeCdesc.descriptor(),
|
||||
preference.descriptor(),
|
||||
1,
|
||||
&heuristicResult,
|
||||
&returnedResult));
|
||||
} else {
|
||||
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
|
||||
ltHandle,
|
||||
computeDesc.descriptor(),
|
||||
Adesc.descriptor(),
|
||||
Bdesc.descriptor(),
|
||||
Cdesc.descriptor(),
|
||||
Cdesc.descriptor(),
|
||||
preference.descriptor(),
|
||||
1,
|
||||
&heuristicResult,
|
||||
&returnedResult));
|
||||
}
|
||||
if (returnedResult == 0) {
|
||||
cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@ -55,6 +55,14 @@ struct numeric_limits<int8_t> {
|
||||
static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<uint16_t> {
|
||||
static inline __host__ __device__ uint16_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint16_t max() { return UINT16_MAX; }
|
||||
static inline __host__ __device__ uint16_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint16_t upper_bound() { return UINT16_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<int16_t> {
|
||||
static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
|
||||
@ -63,6 +71,14 @@ struct numeric_limits<int16_t> {
|
||||
static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<uint32_t> {
|
||||
static inline __host__ __device__ uint32_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint32_t max() { return UINT32_MAX; }
|
||||
static inline __host__ __device__ uint32_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint32_t upper_bound() { return UINT32_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<int32_t> {
|
||||
static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
|
||||
@ -71,6 +87,21 @@ struct numeric_limits<int32_t> {
|
||||
static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<uint64_t> {
|
||||
#ifdef _MSC_VER
|
||||
static inline __host__ __device__ uint64_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint64_t max() { return _UI64_MAX; }
|
||||
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint64_t upper_bound() { return _UI64_MAX; }
|
||||
#else
|
||||
static inline __host__ __device__ uint64_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint64_t max() { return UINT64_MAX; }
|
||||
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint64_t upper_bound() { return UINT64_MAX; }
|
||||
#endif
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<int64_t> {
|
||||
#ifdef _MSC_VER
|
||||
|
||||
@ -24,7 +24,13 @@ namespace detail {
|
||||
// radix_sort_pairs doesn't interact with value_t other than to copy
|
||||
// the data, so we can save template instantiations by reinterpreting
|
||||
// it as an opaque type.
|
||||
// We use native integer types for 1/2/4/8-byte values to reduce
|
||||
// register usage in CUDA kernels. For sizes > 8 fall back to char array.
|
||||
template <int N> struct alignas(N) OpaqueType { char data[N]; };
|
||||
template <> struct alignas(1) OpaqueType<1> { uint8_t data; };
|
||||
template <> struct alignas(2) OpaqueType<2> { uint16_t data; };
|
||||
template <> struct alignas(4) OpaqueType<4> { uint32_t data; };
|
||||
template <> struct alignas(8) OpaqueType<8> { uint64_t data; };
|
||||
|
||||
template<typename key_t, int value_size>
|
||||
void radix_sort_pairs_impl(
|
||||
|
||||
@ -1009,12 +1009,25 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) {
|
||||
}
|
||||
}
|
||||
|
||||
static Tensor send_to_meta(const Tensor& self, const Device& device) {
|
||||
Tensor out_meta;
|
||||
if (self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number()) {
|
||||
out_meta = at::_efficientzerotensor(self.sizes(), self.options().device(device));
|
||||
out_meta.unsafeGetTensorImpl()->set_wrapped_number(true);
|
||||
} else {
|
||||
out_meta = self.to(device);
|
||||
}
|
||||
return out_meta;
|
||||
}
|
||||
|
||||
Tensor mul_zerotensor(const Tensor& self, const Tensor& other) {
|
||||
auto out_device = correct_out_device(self, other);
|
||||
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
|
||||
auto device_ = Device(DeviceType::Meta);
|
||||
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
|
||||
auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));
|
||||
auto self_meta = send_to_meta(self, device_);
|
||||
auto other_meta = send_to_meta(other, device_);
|
||||
auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta);
|
||||
return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device));
|
||||
}
|
||||
|
||||
@ -1023,7 +1036,9 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
|
||||
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
|
||||
auto device_ = Device(DeviceType::Meta);
|
||||
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
|
||||
auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));
|
||||
auto self_meta = send_to_meta(self, device_);
|
||||
auto other_meta = send_to_meta(other, device_);
|
||||
auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta);
|
||||
|
||||
if (self._is_zerotensor()) {
|
||||
if (other._is_zerotensor()) {
|
||||
@ -1052,8 +1067,9 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const
|
||||
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
|
||||
auto device_ = Device(DeviceType::Meta);
|
||||
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
|
||||
auto meta_out = at::_ops::add_Tensor::redispatch(
|
||||
meta_dks, self.to(device_), other.to(device_), alpha);
|
||||
auto self_meta = send_to_meta(self, device_);
|
||||
auto other_meta = send_to_meta(other, device_);
|
||||
auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha);
|
||||
|
||||
auto get_out_like = [&] (const Tensor& tensor)
|
||||
{
|
||||
|
||||
@ -50,18 +50,35 @@ static inline bool parseLinearFlatten3d() {
|
||||
// `_flatten_nd_linear` flattens all but the last dimension of the input tensor
|
||||
// before passing it to linear operation
|
||||
static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
|
||||
const auto input_sizes = input.sym_sizes();
|
||||
// can't use -1 in reshape because it errors when a dimension is 0
|
||||
c10::SymInt flattened_dim = 1;
|
||||
for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) {
|
||||
flattened_dim = flattened_dim * input_sizes[i];
|
||||
const auto input_sizes = input.sym_sizes();
|
||||
|
||||
const auto result_flattened = [&]() -> Tensor {
|
||||
const auto input_ncols = input_sizes.back();
|
||||
const auto input_flattened_nrows = [&]() -> c10::SymInt {
|
||||
// can't use -1 in reshape because it errors when a dimension is 0
|
||||
auto flattened_nrows = c10::SymInt{1};
|
||||
for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) {
|
||||
flattened_nrows *= size;
|
||||
}
|
||||
return flattened_nrows;
|
||||
}();
|
||||
|
||||
const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols});
|
||||
if (weight.layout() == c10::kStrided) {
|
||||
return at::addmm(bias, input_flattened, weight.t());
|
||||
} else {
|
||||
// weight is sparse, and addmm for sparse expects matmul lhs to be sparse,
|
||||
// so we transpose the problem.
|
||||
// NOTE: at::matmul handles (dense @ sparse) similarly.
|
||||
const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1);
|
||||
return at::addmm(bias_t, weight, input_flattened.t()).t();
|
||||
}
|
||||
auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)});
|
||||
const auto result = at::addmm(bias, inp_reshape, weight.t());
|
||||
auto new_size = input_sizes.slice(0, input_sizes.size() - 1);
|
||||
c10::SymDimVector sizes_vec(new_size.begin(), new_size.end());
|
||||
sizes_vec.push_back(result.sym_size(1));
|
||||
return result.view_symint(sizes_vec);
|
||||
}();
|
||||
|
||||
// Unflatten flattened row dims
|
||||
auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()};
|
||||
result_sizes.back() = result_flattened.sym_size(1);
|
||||
return result_flattened.view_symint(result_sizes);
|
||||
}
|
||||
|
||||
|
||||
@ -90,15 +107,23 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optional<Ten
|
||||
// Fused op is marginally faster.
|
||||
return at::addmm(*bias, input, weight.t());
|
||||
}
|
||||
if (bias->defined() && !input.is_xla()) {
|
||||
// Also hit the fused path for contiguous 3D input, if not using xla
|
||||
|
||||
const auto is_bias_likely_fusable = (
|
||||
bias->defined() &&
|
||||
// cuBLASLt: will fuse in the epilogue without copies
|
||||
// when input/weight/bias are all strided.
|
||||
// When weight is not strided, bias will not be fused,
|
||||
// but we can still dispatch here to avoid at::matmul
|
||||
// path which will probably use a very similar
|
||||
// flattening optimization.
|
||||
((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false())
|
||||
);
|
||||
if (is_bias_likely_fusable && !input.is_xla()) {
|
||||
// Also hit the fused path for contiguous nD input, if not using xla
|
||||
// backend. Reshaping/flattening has some performance implications on xla.
|
||||
bool is_contiguous = input.is_contiguous_or_false();
|
||||
if (is_contiguous && input_dim == 3) {
|
||||
if (input.is_contiguous_or_false()) {
|
||||
return _flatten_nd_linear(input, weight, *bias);
|
||||
} else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) {
|
||||
return _flatten_nd_linear(input, weight, *bias);
|
||||
} else if (parseLinearFlatten3d() && input_dim == 3) {
|
||||
} else if (parseLinearFlatten3d()) {
|
||||
// If user forces flattening via env var
|
||||
const Tensor input_cont = input.contiguous();
|
||||
return _flatten_nd_linear(input_cont, weight, *bias);
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#include <ATen/core/ATen_fwd.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/core/SymInt.h>
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/AccumulateType.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
@ -1710,11 +1711,37 @@ Tensor narrow_symint(
|
||||
"], but got ",
|
||||
start,
|
||||
")")
|
||||
if (start < 0) {
|
||||
start = start + cur_size;
|
||||
|
||||
auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0));
|
||||
auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0));
|
||||
|
||||
if (cond1 || cond2) {
|
||||
if (cond1) {
|
||||
start = start + cur_size;
|
||||
}
|
||||
|
||||
TORCH_SYM_CHECK(
|
||||
start.sym_le(cur_size - length),
|
||||
"start (",
|
||||
start,
|
||||
") + length (",
|
||||
length,
|
||||
") exceeds dimension size (",
|
||||
cur_size,
|
||||
").");
|
||||
return at::slice_symint(self, dim, start, start + length, 1);
|
||||
}
|
||||
|
||||
// Unbacked start handling!
|
||||
|
||||
// Bounds check without converting start:
|
||||
// - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start +
|
||||
// length <= 0
|
||||
// - If start >= 0: need start + length <= cur_size
|
||||
auto end = start + length;
|
||||
TORCH_SYM_CHECK(
|
||||
start.sym_le(cur_size - length),
|
||||
(start.sym_lt(0).sym_and((end).sym_le(0)))
|
||||
.sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))),
|
||||
"start (",
|
||||
start,
|
||||
") + length (",
|
||||
@ -1722,7 +1749,28 @@ Tensor narrow_symint(
|
||||
") exceeds dimension size (",
|
||||
cur_size,
|
||||
").");
|
||||
return at::slice_symint(self, dim, start, start + length, 1);
|
||||
|
||||
if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) {
|
||||
return at::slice_symint(self, dim, start, end, 1);
|
||||
} else {
|
||||
// Cannot statically determine the condition due to unbacked.
|
||||
// This is an interesting situation; when start is negative and
|
||||
// start + length == 0, slice and narrow do different things.
|
||||
// i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to
|
||||
// pass curr_size instead of 0. Otherwise, they would do the same thing.
|
||||
// This says at runtime: if start < 0 and end == 0, then pass curr_size
|
||||
// instead of 0.
|
||||
|
||||
auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt();
|
||||
auto result =
|
||||
at::slice_symint(self, dim, start, end + use_different * cur_size, 1);
|
||||
|
||||
// Ensure slice allocated unbacked size is specialized to length.
|
||||
SymInt new_size = result.sym_size(dim);
|
||||
TORCH_SYM_CHECK(new_size.sym_eq(length), "")
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// This overload exists purely for XLA, because they wanted to pass in
|
||||
@ -1736,8 +1784,8 @@ Tensor narrow_tensor_symint(
|
||||
start.dim() == 0 &&
|
||||
isIntegralType(start.scalar_type(), /*includeBool=*/false),
|
||||
"start must be an 0-dim integral Tensor.");
|
||||
int64_t st = start.item<int64_t>();
|
||||
return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length));
|
||||
c10::SymInt st = start.item().toSymInt();
|
||||
return at::narrow_symint(self, dim, std::move(st), std::move(length));
|
||||
}
|
||||
|
||||
std::
|
||||
|
||||
@ -293,7 +293,7 @@ struct ComputeLocationBase<scalar_t, /*align_corners=*/false> {
|
||||
, empty(size <= 0) {}
|
||||
|
||||
inline Vec unnormalize(const Vec &in) const {
|
||||
return (in + Vec(1)) * Vec(scaling_factor) - Vec(0.5);
|
||||
return (in + Vec(static_cast<scalar_t>(1))) * Vec(scaling_factor) - Vec(static_cast<scalar_t>(0.5));
|
||||
}
|
||||
|
||||
inline Vec clip_coordinates(const Vec &in) const {
|
||||
@ -831,7 +831,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bicubic,
|
||||
|
||||
// constant used in cubic convolution
|
||||
// could be -0.5 or -0.75, use the same value in UpSampleBicubic2d.h
|
||||
const Vec A = Vec(-0.75);
|
||||
const Vec A = Vec(static_cast<scalar_t>(-0.75));
|
||||
|
||||
ApplyGridSample(const TensorAccessor<const scalar_t, 4>& input)
|
||||
: inp_H(input.size(2))
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include <ATen/native/ReduceOpsUtils.h>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
@ -78,12 +79,12 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) {
|
||||
reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
|
||||
[=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
|
||||
AT_DISPATCH_V2(input.scalar_type(), "min_all", AT_WRAP([&] {
|
||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||
reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
|
||||
[=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
|
||||
[=](Vec a, Vec b) -> Vec { return minimum(a, b); });
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
||||
}
|
||||
}
|
||||
|
||||
@ -103,12 +104,12 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) {
|
||||
reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
|
||||
[=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
|
||||
AT_DISPATCH_V2(input.scalar_type(), "max_all", AT_WRAP([&] {
|
||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||
reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
|
||||
[=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
|
||||
[=](Vec a, Vec b) -> Vec { return maximum(a, b); });
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
||||
}
|
||||
}
|
||||
|
||||
@ -199,7 +200,7 @@ void aminmax_allreduce_kernel(
|
||||
}
|
||||
);
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
|
||||
AT_DISPATCH_V2(input.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
|
||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||
using scalar_t_pair = std::pair<scalar_t, scalar_t>;
|
||||
reduce_all_impl_vec_two_outputs<scalar_t>(
|
||||
@ -214,7 +215,7 @@ void aminmax_allreduce_kernel(
|
||||
[=](Vec a, Vec b) -> Vec { return minimum(a, b); },
|
||||
[=](Vec a, Vec b) -> Vec { return maximum(a, b); }
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
@ -347,34 +348,35 @@ struct MinValuesOps: public at::native::MinOps<scalar_t> {
|
||||
};
|
||||
|
||||
void min_values_kernel_impl(TensorIterator& iter) {
|
||||
if (iter.dtype() == kLong) {
|
||||
// This case is special because of Vectorized<int64_t> does not
|
||||
// handle upper_bound<int64_t>().
|
||||
// See: https://github.com/pytorch/pytorch/issues/43254
|
||||
using scalar_t = int64_t;
|
||||
binary_kernel_reduce(
|
||||
iter,
|
||||
MinValuesOps<scalar_t>{},
|
||||
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
|
||||
// This case is special because of Vectorized<int64_t> does not
|
||||
// handle upper_bound<int64_t>().
|
||||
// See: https://github.com/pytorch/pytorch/issues/43254
|
||||
if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
|
||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
|
||||
binary_kernel_reduce(
|
||||
iter,
|
||||
MinValuesOps<scalar_t>{},
|
||||
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
|
||||
}), kLong, kUInt64);
|
||||
return;
|
||||
}
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
|
||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
|
||||
binary_kernel_reduce_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
|
||||
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
|
||||
static_cast<double>(upper_bound<scalar_t>()));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void max_values_kernel_impl(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
|
||||
AT_DISPATCH_V2(iter.dtype(), "max_values_cpu", AT_WRAP([&iter] {
|
||||
binary_kernel_reduce_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
|
||||
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
|
||||
lower_bound<scalar_t>());
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void argmax_kernel_impl(TensorIterator &iter) {
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
@ -106,7 +107,7 @@ void min_kernel_impl(
|
||||
bool keepdim) {
|
||||
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
|
||||
AT_DISPATCH_V2(self.scalar_type(), "min_cpu", AT_WRAP([&] {
|
||||
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
||||
scalar_t* result_data, int64_t* indice_data,
|
||||
const scalar_t* self_data, auto self_dim_stride) {
|
||||
@ -128,7 +129,7 @@ void min_kernel_impl(
|
||||
*indice_data = index;
|
||||
}
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
|
||||
}
|
||||
|
||||
void max_kernel_impl(
|
||||
@ -139,7 +140,7 @@ void max_kernel_impl(
|
||||
bool keepdim) {
|
||||
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
|
||||
AT_DISPATCH_V2(self.scalar_type(), "max_cpu", AT_WRAP([&] {
|
||||
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
||||
scalar_t* result_data, int64_t* indice_data,
|
||||
const scalar_t* self_data, auto self_dim_stride) {
|
||||
@ -161,7 +162,7 @@ void max_kernel_impl(
|
||||
*indice_data = index;
|
||||
}
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
|
||||
}
|
||||
|
||||
void aminmax_kernel(
|
||||
@ -186,7 +187,7 @@ void aminmax_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
|
||||
AT_DISPATCH_V2(self.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
|
||||
compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
|
||||
scalar_t* min_result_data, scalar_t* max_result_data,
|
||||
const scalar_t* self_data, auto self_dim_stride) {
|
||||
@ -209,7 +210,7 @@ void aminmax_kernel(
|
||||
*max_result_data = max_number;
|
||||
}
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half);
|
||||
}
|
||||
|
||||
void where_kernel_impl(TensorIterator &iter) {
|
||||
|
||||
@ -22,6 +22,9 @@
|
||||
#include <ATen/native/cuda/RowwiseScaledMM.h>
|
||||
#include <ATen/native/cuda/ScaledGroupMM.h>
|
||||
#include <ATen/native/cuda/GroupMM.h>
|
||||
#ifdef USE_ROCM
|
||||
#include <ATen/native/hip/ck_group_gemm.h>
|
||||
#endif
|
||||
#include <ATen/ceil_div.h>
|
||||
|
||||
#ifdef USE_FBGEMM_GENAI
|
||||
@ -666,12 +669,19 @@ std::optional<c10::ScalarType> out_dtype) {
|
||||
// _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
|
||||
// the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
|
||||
bool use_fast_path = false;
|
||||
if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
|
||||
use_fast_path = true;
|
||||
}
|
||||
#endif
|
||||
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
|
||||
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
|
||||
if (use_fast_path) {
|
||||
// fast path, no d2h sync needed
|
||||
#ifndef USE_ROCM
|
||||
at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
|
||||
#else
|
||||
at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
|
||||
#endif
|
||||
} else {
|
||||
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
#include <ATen/core/TensorBase.h>
|
||||
#include <ATen/ceil_div.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
@ -74,7 +73,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
|
||||
|
||||
char* const out_ptr = static_cast<char*>(iter.data_ptr(0));
|
||||
char* const in_ptr = static_cast<char*>(iter.data_ptr(1));
|
||||
|
||||
if (is_gather_like && num_indices==1) {
|
||||
const size_t element_size = iter.element_size(0);
|
||||
constexpr size_t alignment = 16;
|
||||
@ -84,16 +82,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
|
||||
auto ind_dim_size = index_size[0];
|
||||
auto inp_stride_bytes = index_stride[0];
|
||||
auto out_stride_bytes = iter.strides(0)[1];
|
||||
// avoid grid overflow in the fast kernel
|
||||
const int64_t vec_chunks = ceil_div(slice_size, alignment);
|
||||
const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd);
|
||||
const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
|
||||
// if it's an eligible grid we use the fast path, otherwise default to slower path
|
||||
if (blocks_per_slice_upper <= max_grid_y) {
|
||||
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
|
||||
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
|
||||
return;
|
||||
}
|
||||
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
|
||||
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -13,11 +13,12 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
|
||||
if (allow_neg_indices) {
|
||||
ind = (ind < 0) ? ind + ind_dim_size : ind;
|
||||
}
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
|
||||
int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
|
||||
if (off >= slice_size) return;
|
||||
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
|
||||
at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
|
||||
// off is guaranteed to be within int32 limits
|
||||
for (int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; off < slice_size; off += blockDim.x * gridDim.y * Alignment) {
|
||||
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
|
||||
at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -30,7 +31,9 @@ void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int
|
||||
auto num_threads = at::round_up(
|
||||
at::ceil_div(slice_size_in_bytes, Alignment),
|
||||
static_cast<int64_t>(C10_WARP_SIZE));
|
||||
dim3 grid = {static_cast<uint32_t>(num_ind), static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1};
|
||||
uint32_t grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
|
||||
grid_y = std::min(static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), grid_y);
|
||||
dim3 grid = {static_cast<uint32_t>(num_ind), grid_y, 1};
|
||||
auto block = std::min(max_num_threads, num_threads);
|
||||
vectorized_gather_kernel<Alignment, index_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(out, inp, idx, num_ind, slice_size_in_bytes,
|
||||
ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices);
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#define TORCH_ASSERT_NO_OPERATORS
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
#include <ATen/native/ReduceAllOps.h>
|
||||
@ -28,22 +29,22 @@ void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
|
||||
}
|
||||
|
||||
void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
|
||||
AT_DISPATCH_V2(
|
||||
iter.input_dtype(), "aminmax_all_cuda", AT_WRAP([&] {
|
||||
_min_max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void aminmax_launch_kernel(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
|
||||
AT_DISPATCH_V2(
|
||||
iter.input_dtype(), "aminmax_cuda", AT_WRAP([&]() {
|
||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||
iter,
|
||||
MinMaxOps<scalar_t, scalar_t, int32_t>{},
|
||||
thrust::pair<scalar_t, scalar_t>(
|
||||
at::numeric_limits<scalar_t>::upper_bound(),
|
||||
at::numeric_limits<scalar_t>::lower_bound()));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#define TORCH_ASSERT_NO_OPERATORS
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
#include <ATen/native/ReduceAllOps.h>
|
||||
@ -33,27 +34,27 @@ void max_values_kernel_cuda_impl(TensorIterator& iter) {
|
||||
}
|
||||
|
||||
void max_values_kernel_cuda(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
|
||||
AT_DISPATCH_V2(
|
||||
iter.dtype(), "max_values_cuda", AT_WRAP([&]() {
|
||||
max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void max_launch_kernel(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
|
||||
AT_DISPATCH_V2(
|
||||
iter.input_dtype(), "max_cuda", AT_WRAP([&]() {
|
||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||
iter,
|
||||
MaxOps<scalar_t>{},
|
||||
thrust::pair<scalar_t, int64_t>(
|
||||
at::numeric_limits<scalar_t>::lower_bound(), 0));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void max_all_launch_kernel(TensorIterator &iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
|
||||
AT_DISPATCH_V2(iter.input_dtype(), "max_all_cuda", AT_WRAP([&] {
|
||||
max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
#include <ATen/NumericUtils.h>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/cuda/NumericLimits.cuh>
|
||||
|
||||
@ -33,24 +34,24 @@ void min_values_kernel_cuda_impl(TensorIterator& iter) {
|
||||
}
|
||||
|
||||
void min_values_kernel_cuda(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
|
||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
|
||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void min_launch_kernel(TensorIterator &iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
|
||||
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
|
||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||
iter,
|
||||
MinOps<scalar_t>{},
|
||||
thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void min_all_launch_kernel(TensorIterator &iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
|
||||
AT_DISPATCH_V2(iter.input_dtype(), "min_all_cuda", AT_WRAP([&] {
|
||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)
|
||||
|
||||
19
aten/src/ATen/native/hip/ck_group_gemm.h
Normal file
19
aten/src/ATen/native/hip/ck_group_gemm.h
Normal file
@ -0,0 +1,19 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <optional>
|
||||
|
||||
namespace at {
|
||||
namespace hip {
|
||||
namespace detail {
|
||||
void group_gemm_ck(
|
||||
const at::Tensor& mat_a,
|
||||
const at::Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
at::Tensor& out);
|
||||
|
||||
} // namespace detail
|
||||
} // namespace hip
|
||||
} // namespace at
|
||||
462
aten/src/ATen/native/hip/ck_group_gemm.hip
Normal file
462
aten/src/ATen/native/hip/ck_group_gemm.hip
Normal file
@ -0,0 +1,462 @@
|
||||
#undef __HIP_NO_HALF_CONVERSIONS__
|
||||
#include <ATen/hip/HIPContext.h>
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/TensorAccessor.h>
|
||||
#include <c10/hip/HIPStream.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
|
||||
#include <ck/ck.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
|
||||
#include <ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp>
|
||||
#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
|
||||
#include <ck/utility/tuple.hpp>
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
namespace at {
|
||||
namespace hip {
|
||||
namespace detail {
|
||||
|
||||
namespace CkTypes {
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
}
|
||||
|
||||
template <typename ALayout, typename BLayout, typename DataType>
|
||||
using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage<
|
||||
ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor,
|
||||
DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType,
|
||||
CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough,
|
||||
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
|
||||
1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2,
|
||||
S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>,
|
||||
3, 8, 8, 1,
|
||||
S<1,4,64,1>, S<0,2,1,3>, S<0,2,1,3>,
|
||||
3, 8, 8, 1,
|
||||
1, 1,
|
||||
S<1,32,1,8>, 4
|
||||
>;
|
||||
|
||||
template <typename ALayout, typename BLayout, typename DataType>
|
||||
void launch_grouped_bgemm_ck_impl_dispatch(
|
||||
const at::Tensor& mat_a,
|
||||
const at::Tensor& mat_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
at::Tensor& out)
|
||||
{
|
||||
using DeviceOp = GroupedGemmKernel<ALayout, BLayout, DataType>;
|
||||
using PassThrough = CkTypes::PassThrough;
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
std::vector<const void*> p_a_ptrs, p_b_ptrs;
|
||||
std::vector<void*> p_e_ptrs;
|
||||
// Note: d_ptrs will be resized after we populate the other vectors
|
||||
|
||||
const int mat_a_dim = mat_a.dim();
|
||||
const int mat_b_dim = mat_b.dim();
|
||||
|
||||
const char* a_ptr_base = reinterpret_cast<const char*>(mat_a.data_ptr());
|
||||
const char* b_ptr_base = reinterpret_cast<const char*>(mat_b.data_ptr());
|
||||
char* out_ptr_base = reinterpret_cast<char*>(out.data_ptr());
|
||||
const size_t a_element_size = mat_a.element_size();
|
||||
const size_t b_element_size = mat_b.element_size();
|
||||
const size_t out_element_size = out.element_size();
|
||||
|
||||
// for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses.
|
||||
if (mat_a_dim == 2 && mat_b_dim == 2) {
|
||||
// 2D*2D case requires offset tensor
|
||||
auto offs_accessor = offs->accessor<int, 1>();
|
||||
int num_groups = offs_accessor.size(0);
|
||||
const int M = mat_a.size(0); // number of rows in A
|
||||
const int N = mat_b.size(1); // number of columns in B
|
||||
const int K = mat_a.size(1); // columns in A == rows in B
|
||||
// for 2d*2d input, output is 3d.
|
||||
// for each group, A columns (K) are sliced. M and N dimensions are not sliced.
|
||||
for (int i = 0; i < num_groups; ++i) {
|
||||
int start_k = (i == 0) ? 0 : offs_accessor[i-1];
|
||||
int end_k = offs_accessor[i];
|
||||
int k = end_k - start_k;
|
||||
|
||||
//K dimension are sliced, hence select stride(1) always.
|
||||
//K dimension is always dimension 1, regardless of memory layout (row/column major)
|
||||
const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size;
|
||||
const void* group_b_ptr;
|
||||
int ldb;
|
||||
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major B [K,N]: K values are horizontally adjacent, use stride(1) for K offset
|
||||
group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size;
|
||||
// Leading dimension = distance between rows = stride(0)
|
||||
ldb = mat_b.stride(0);
|
||||
} else {
|
||||
// Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset
|
||||
group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size;
|
||||
// Leading dimension = distance between columns = stride(1)
|
||||
ldb = mat_b.stride(1);
|
||||
}
|
||||
|
||||
// Calculate output pointer for group i in 3D tensor [num_groups, M, N]
|
||||
// stride(0) = M*N elements between groups, so skip i*stride(0) elements to reach group i
|
||||
void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
|
||||
int lda, ldc;
|
||||
if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major A [M,K]: leading dimension = distance between rows = stride(0)
|
||||
lda = mat_a.stride(0);
|
||||
} else {
|
||||
// Column-major A [M,K]: leading dimension = distance between columns = stride(1)
|
||||
lda = mat_a.stride(1);
|
||||
}
|
||||
// Output is always row-major in 3D tensor [num_groups, M, N]
|
||||
// Leading dimension for each group's [M,N] slice = stride(1) = N
|
||||
ldc = out.stride(1);
|
||||
size_t output_group_bytes = M * N * out_element_size;
|
||||
void* group_e_ptr_end = (char*)group_e_ptr + output_group_bytes;
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(M),
|
||||
static_cast<ck::index_t>(N),
|
||||
static_cast<ck::index_t>(k),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc),
|
||||
{} // --> stride_Ds_
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else if (mat_a_dim == 2 && mat_b_dim == 3) {
|
||||
// 2D*3D case requires offset tensor
|
||||
auto offs_accessor = offs->accessor<int, 1>();
|
||||
int num_groups = offs_accessor.size(0);
|
||||
|
||||
// 2d*3d input, output is 2d.
|
||||
// A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n]
|
||||
// Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B
|
||||
const int K = mat_a.size(1); // columns in A
|
||||
// For 2D-3D case: The output determines N (result width)
|
||||
const int N = out.size(1); // N is the width of the output tensor
|
||||
|
||||
for (int i = 0; i < num_groups; ++i) {
|
||||
int start_m = (i == 0) ? 0 : offs_accessor[i - 1];
|
||||
int end_m = offs_accessor[i];
|
||||
int m = end_m - start_m;
|
||||
|
||||
// Skip zero-sized groups but continue processing subsequent groups
|
||||
if (m <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Select A rows for group i: skip start_m rows
|
||||
const void* group_a_ptr;
|
||||
int lda;
|
||||
if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart
|
||||
group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
|
||||
lda = mat_a.stride(0); // distance between rows
|
||||
} else {
|
||||
// Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows)
|
||||
group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
|
||||
|
||||
// Detect stride pattern for A tensor to determine appropriate lda calculation
|
||||
bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0));
|
||||
|
||||
if (a_is_strided_tensor) {
|
||||
// For strided A tensors: stride(0) gives the actual leading dimension
|
||||
lda = mat_a.stride(0);
|
||||
} else {
|
||||
// For non-strided A tensors: use the M dimension (total rows)
|
||||
lda = mat_a.size(0); // Total M dimension for column-major layout
|
||||
}
|
||||
}
|
||||
|
||||
// Select B batch for group i: B[i, :, :]
|
||||
const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
|
||||
int ldb;
|
||||
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed
|
||||
ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N]
|
||||
} else {
|
||||
// Detect stride pattern to determine appropriate ldb calculation
|
||||
bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2));
|
||||
|
||||
if (is_strided_tensor) {
|
||||
// For strided tensors: stride(2) gives the actual leading dimension
|
||||
ldb = mat_b.stride(2);
|
||||
} else {
|
||||
// For non-strided tensors: use the N dimension
|
||||
ldb = mat_b.size(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N]
|
||||
void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size;
|
||||
int ldc = out.stride(0); // distance between rows in output (should be N for 2D case)
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(m),
|
||||
static_cast<ck::index_t>(N),
|
||||
static_cast<ck::index_t>(K),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc),
|
||||
{} // --> stride_Ds_
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else if (mat_a_dim == 3 && mat_b_dim == 3) {
|
||||
// 3d*3d input, output is 3d - batched matrix multiplication
|
||||
// A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n]
|
||||
// Each batch is processed as a separate GEMM operation
|
||||
const int batch_size = mat_a.size(0);
|
||||
const int M = mat_a.size(1); // rows in each A matrix
|
||||
const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed)
|
||||
|
||||
// Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout
|
||||
int N;
|
||||
if (mat_b.size(1) == K) {
|
||||
// B is [batch, k, n] - normal layout
|
||||
N = mat_b.size(2);
|
||||
} else if (mat_b.size(2) == K) {
|
||||
// B is [batch, n, k] - transposed layout
|
||||
N = mat_b.size(1);
|
||||
} else {
|
||||
TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. A=[",
|
||||
batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]");
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
// Select A batch for group i: A[i, :, :]
|
||||
const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
|
||||
|
||||
// Select B batch for group i: B[i, :, :]
|
||||
const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
|
||||
|
||||
// Select output batch for group i: Output[i, :, :]
|
||||
void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
|
||||
|
||||
int lda, ldb, ldc;
|
||||
|
||||
if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major A: leading dimension = distance between rows = stride(1)
|
||||
lda = mat_a.stride(1);
|
||||
} else {
|
||||
// Column-major A: leading dimension = distance between columns = stride(2)
|
||||
lda = mat_a.stride(2);
|
||||
}
|
||||
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major B: leading dimension = distance between rows
|
||||
if (mat_b.size(1) == K) {
|
||||
// B is [batch, k, n] - normal layout
|
||||
ldb = mat_b.stride(1); // stride between K rows
|
||||
} else {
|
||||
// B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM
|
||||
ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n])
|
||||
}
|
||||
} else {
|
||||
// Column-major B: leading dimension = distance between columns
|
||||
if (mat_b.size(1) == K) {
|
||||
// B is [batch, k, n] - normal layout
|
||||
ldb = mat_b.stride(2); // stride between N columns
|
||||
} else {
|
||||
// B is [batch, n, k] - transposed layout
|
||||
ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n])
|
||||
}
|
||||
}
|
||||
|
||||
// Output is typically row-major: leading dimension = distance between rows = stride(1)
|
||||
ldc = out.stride(1);
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(M),
|
||||
static_cast<ck::index_t>(N),
|
||||
static_cast<ck::index_t>(K),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc),
|
||||
{} // --> stride_Ds_
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else if (mat_a_dim == 3 && mat_b_dim == 2) {
|
||||
// 3D*2D case requires offset tensor
|
||||
auto offs_accessor = offs->accessor<int, 1>();
|
||||
int num_groups = offs_accessor.size(0);
|
||||
// 3d*2d input, output is 3d.
|
||||
// A: [n_groups, m, k], B: [k, total_n] (assuming row-major for both)
|
||||
// Offset divides N dimension of B, each group gets different slice of B and different batch of A
|
||||
const int batch_size = mat_a.size(0); // n_groups
|
||||
const int M = mat_a.size(1); // rows in each A matrix
|
||||
const int K = mat_a.size(2); // columns in A
|
||||
|
||||
// For row-major A and B case: B should be [K, total_N]
|
||||
const int total_N = mat_b.size(1); // B is [K, total_N] for row-major
|
||||
|
||||
for (int i = 0; i < num_groups; ++i) {
|
||||
int start_n = (i == 0) ? 0 : offs_accessor[i - 1];
|
||||
int end_n = offs_accessor[i];
|
||||
int n = end_n - start_n;
|
||||
|
||||
// Skip zero-sized groups but continue processing subsequent groups
|
||||
if (n <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Select A batch for group i: A[i, :, :]
|
||||
const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
|
||||
|
||||
// Select B slice for group i: B[:, start_n:end_n] (B[K, total_N])
|
||||
const void* group_b_ptr;
|
||||
int ldb;
|
||||
|
||||
// Check if B is row-major or column-major
|
||||
if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
|
||||
// Row-major B [K, total_N]: slice columns [start_n:end_n]
|
||||
group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
|
||||
ldb = mat_b.stride(0); // distance between rows (should be total_N)
|
||||
} else {
|
||||
// Column-major B [K, total_N]: slice columns [start_n:end_n]
|
||||
group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
|
||||
ldb = mat_b.stride(1); // distance between columns (should be K)
|
||||
}
|
||||
|
||||
// Select output slice for group i: Output[:, start_n:end_n]
|
||||
void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size;
|
||||
|
||||
int lda, ldc;
|
||||
|
||||
// Row-major A: leading dimension = distance between rows = stride(1)
|
||||
lda = mat_a.stride(1);
|
||||
// Output is row-major: leading dimension = distance between rows = stride(0)
|
||||
ldc = out.stride(0);
|
||||
|
||||
gemm_descs.push_back({
|
||||
static_cast<ck::index_t>(M),
|
||||
static_cast<ck::index_t>(n),
|
||||
static_cast<ck::index_t>(K),
|
||||
static_cast<ck::index_t>(lda),
|
||||
static_cast<ck::index_t>(ldb),
|
||||
static_cast<ck::index_t>(ldc),
|
||||
{} // --> stride_Ds_
|
||||
});
|
||||
p_a_ptrs.push_back(group_a_ptr);
|
||||
p_b_ptrs.push_back(group_b_ptr);
|
||||
p_e_ptrs.push_back(group_e_ptr);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim);
|
||||
}
|
||||
|
||||
TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups");
|
||||
|
||||
// Initialize d_ptrs with the correct size
|
||||
std::vector<std::array<const void*, 0>> d_ptrs(p_a_ptrs.size());
|
||||
|
||||
static DeviceOp gemm_instance;
|
||||
auto argument = gemm_instance.MakeArgument(
|
||||
p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs,
|
||||
gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}
|
||||
);
|
||||
TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument),
|
||||
"CK Group GEMM: argument unsupported (shape/strides/type config)");
|
||||
size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument);
|
||||
size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument);
|
||||
|
||||
void* gemm_arg_buf = nullptr;
|
||||
void* ws_buf = nullptr;
|
||||
|
||||
hipMalloc(&gemm_arg_buf, arg_buf_size);
|
||||
hipMalloc(&ws_buf, ws_size);
|
||||
|
||||
gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf);
|
||||
gemm_instance.SetWorkSpacePointer(&argument, ws_buf);
|
||||
|
||||
auto invoker = gemm_instance.MakeInvoker();
|
||||
hipStream_t stream = c10::hip::getCurrentHIPStream();
|
||||
invoker.Run(argument, {stream});
|
||||
hipFree(gemm_arg_buf);
|
||||
hipFree(ws_buf);
|
||||
}
|
||||
|
||||
void group_gemm_ck(
|
||||
const at::Tensor& input_a,
|
||||
const at::Tensor& input_b_colmajor,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
const std::optional<at::Tensor>& /*bias*/,
|
||||
at::Tensor& out)
|
||||
{
|
||||
// Detect if input_a is row-major based on stride pattern
|
||||
bool a_row_major = (input_a.dim() == 3) ? (input_a.stride(2) == 1) : (input_a.stride(1) == 1);
|
||||
bool b_col_major = (input_b_colmajor.dim() == 3) ? (input_b_colmajor.stride(1) == 1) : (input_b_colmajor.stride(0) == 1);
|
||||
// Ensure tensor A is row-major and contiguous if not already
|
||||
at::Tensor mat_a = input_a;
|
||||
if (!a_row_major) {
|
||||
// If A is not row-major, make it contiguous (row-major)
|
||||
mat_a = input_a.contiguous();
|
||||
}
|
||||
// Force tensor B to be column-major using double transpose trick
|
||||
// This guarantees stride(0) == 1 and stride(1) == K for [K, N] shape
|
||||
at::Tensor mat_b = input_b_colmajor;
|
||||
if (!b_col_major) {
|
||||
mat_b = input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1);
|
||||
}
|
||||
|
||||
// For 3D tensors, check the last dimension stride for row-major detection
|
||||
a_row_major = (mat_a.dim() == 3) ? (mat_a.stride(2) == 1) : (mat_a.stride(1) == 1);
|
||||
bool b_row_major = (mat_b.dim() == 3) ? (mat_b.stride(2) == 1) : (mat_b.stride(1) == 1);
|
||||
|
||||
if (mat_a.dtype() == at::kBFloat16) {
|
||||
// bf16 path
|
||||
if (a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
} else if (a_row_major && !b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
} else if (!a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
} else {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
|
||||
}
|
||||
} else if (mat_a.dtype() == at::kHalf) {
|
||||
// fp16 path
|
||||
if (a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
} else if (a_row_major && !b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
} else if (!a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
} else {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
|
||||
}
|
||||
} else if (mat_a.dtype() == at::kFloat) {
|
||||
// fp32 path
|
||||
if (a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
} else if (a_row_major && !b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
} else if (!a_row_major && b_row_major) {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
} else {
|
||||
launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
} // namespace hip
|
||||
} // namespace at
|
||||
@ -212,17 +212,12 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
|
||||
loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({}));
|
||||
TORCH_CHECK(loss.is_mps());
|
||||
|
||||
Tensor loss_squeezed = loss.squeeze();
|
||||
Tensor input_squeezed = input.squeeze();
|
||||
Tensor target_squeezed = target.squeeze();
|
||||
|
||||
@autoreleasepool {
|
||||
std::string key =
|
||||
op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight});
|
||||
std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight});
|
||||
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed);
|
||||
newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed);
|
||||
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
|
||||
newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
|
||||
|
||||
MPSGraphTensor* bceLossUnweighted = nil;
|
||||
// if grad_output is defined, then it's a backward pass
|
||||
@ -252,12 +247,12 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
|
||||
newCachedGraph->gradInputTensor = bceLoss;
|
||||
}
|
||||
} else {
|
||||
newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size());
|
||||
newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size());
|
||||
}
|
||||
});
|
||||
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed);
|
||||
Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed);
|
||||
Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed);
|
||||
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input);
|
||||
Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target);
|
||||
Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss);
|
||||
|
||||
NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
|
||||
load("//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test")
|
||||
load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
|
||||
load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "APPLETVOS", "CXX", "IOS", "MACOSX")
|
||||
load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX")
|
||||
|
||||
# Shared by internal and OSS BUCK
|
||||
def define_qnnpack(third_party, labels = []):
|
||||
@ -21,7 +21,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "requantization/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O2",
|
||||
"-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION",
|
||||
@ -82,7 +82,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "requantization/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O3",
|
||||
"-ffast-math",
|
||||
@ -129,7 +129,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "requantization/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O3",
|
||||
"-ffast-math",
|
||||
@ -184,7 +184,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "requantization/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O3",
|
||||
"-ffast-math",
|
||||
@ -236,7 +236,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
],
|
||||
),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION",
|
||||
],
|
||||
@ -291,7 +291,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "qnnpack/*.h"),
|
||||
("include", "*.h"),
|
||||
]),
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O2",
|
||||
"-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION",
|
||||
@ -398,7 +398,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "requantization/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O3",
|
||||
"-ffast-math",
|
||||
@ -465,7 +465,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "requantization/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION",
|
||||
"-Wno-unused-command-line-argument",
|
||||
@ -525,7 +525,7 @@ def define_qnnpack(third_party, labels = []):
|
||||
("src", "qnnpack/*.h"),
|
||||
]),
|
||||
header_namespace = "",
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = [
|
||||
"-O3",
|
||||
"-ffast-math",
|
||||
|
||||
@ -53,10 +53,8 @@ class AddmmBenchmark(op_bench.TorchBenchmarkBase):
|
||||
return torch.addmm(input_one, mat1, mat2)
|
||||
|
||||
|
||||
op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
addmm_long_configs + addmm_long_configs, AddmmBenchmark
|
||||
)
|
||||
op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark)
|
||||
op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark)
|
||||
|
||||
"""Mircobenchmark for addbmm operator."""
|
||||
|
||||
@ -107,9 +105,7 @@ addbmm_short_configs = op_bench.cross_product_configs(
|
||||
)
|
||||
|
||||
op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark)
|
||||
op_bench.generate_pt_gradient_test(
|
||||
addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark
|
||||
)
|
||||
op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark)
|
||||
|
||||
if __name__ == "__main__":
|
||||
op_bench.benchmark_runner.main()
|
||||
|
||||
@ -8,7 +8,7 @@ load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule")
|
||||
load("//tools/build_defs/windows:windows_flag_map.bzl", "windows_convert_gcc_clang_flags")
|
||||
load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode")
|
||||
load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
|
||||
load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX")
|
||||
load("//tools/build_defs:platform_defs.bzl", "IOS", "MACOSX")
|
||||
load("//tools/build_defs:type_defs.bzl", "is_list", "is_string")
|
||||
load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build")
|
||||
load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build")
|
||||
@ -1090,7 +1090,7 @@ def define_buck_targets(
|
||||
srcs = [
|
||||
"caffe2/core/common.cc",
|
||||
],
|
||||
apple_sdks = (IOS, MACOSX, APPLETVOS),
|
||||
apple_sdks = (IOS, MACOSX),
|
||||
compiler_flags = get_pt_compiler_flags(),
|
||||
labels = labels,
|
||||
# @lint-ignore BUCKLINT link_whole
|
||||
|
||||
@ -1025,6 +1025,7 @@ libtorch_python_core_sources = [
|
||||
libtorch_python_distributed_core_sources = [
|
||||
"torch/csrc/distributed/c10d/init.cpp",
|
||||
"torch/csrc/distributed/c10d/python_comm_hook.cpp",
|
||||
"torch/csrc/distributed/c10d/python_callback_work.cpp",
|
||||
]
|
||||
|
||||
libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [
|
||||
|
||||
@ -59,6 +59,9 @@ constexpr DispatchKeySet nested_dispatch_keyset =
|
||||
{DispatchKey::AutogradNestedTensor, DispatchKey::NestedTensor}) |
|
||||
DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
|
||||
|
||||
constexpr DispatchKeySet functorch_batched_dispatch_keyset =
|
||||
DispatchKeySet(DispatchKey::FuncTorchBatched);
|
||||
|
||||
DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) {
|
||||
TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
|
||||
switch (t) {
|
||||
@ -77,6 +80,8 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) {
|
||||
return backend_dispatch_keyset;
|
||||
case DispatchKey::CompositeExplicitAutogradNonFunctional:
|
||||
return non_functional_backend_dispatch_keyset;
|
||||
case DispatchKey::FuncTorchBatchedDecomposition:
|
||||
return functorch_batched_dispatch_keyset;
|
||||
default:
|
||||
return DispatchKeySet(t);
|
||||
}
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
#include <c10/core/SymBool.h>
|
||||
#include <c10/core/SymInt.h>
|
||||
#include <c10/core/SymNodeImpl.h>
|
||||
|
||||
namespace c10 {
|
||||
@ -111,4 +112,17 @@ bool SymBool::has_hint() const {
|
||||
return toSymNodeImpl()->has_hint();
|
||||
}
|
||||
|
||||
SymInt SymBool::toSymInt() const {
|
||||
// If concrete bool, return concrete SymInt
|
||||
if (auto ma = maybe_as_bool()) {
|
||||
return SymInt(*ma ? 1 : 0);
|
||||
}
|
||||
|
||||
// Symbolic case: use sym_ite to convert bool to int (0 or 1)
|
||||
auto node = toSymNodeImpl();
|
||||
auto one_node = node->wrap_int(1);
|
||||
auto zero_node = node->wrap_int(0);
|
||||
return SymInt(node->sym_ite(one_node, zero_node));
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
@ -12,6 +12,8 @@
|
||||
|
||||
namespace c10 {
|
||||
|
||||
class SymInt;
|
||||
|
||||
class C10_API SymBool {
|
||||
public:
|
||||
/*implicit*/ SymBool(bool b) : data_(b) {}
|
||||
@ -80,6 +82,10 @@ class C10_API SymBool {
|
||||
return toSymNodeImplUnowned()->constant_bool();
|
||||
}
|
||||
|
||||
// Convert SymBool to SymInt (0 or 1)
|
||||
// This is the C++ equivalent of Python's cast_symbool_to_symint_guardless
|
||||
SymInt toSymInt() const;
|
||||
|
||||
bool is_heap_allocated() const {
|
||||
return ptr_;
|
||||
}
|
||||
|
||||
@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
|
||||
} else if (key == "graph_capture_record_stream_reuse") {
|
||||
i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
|
||||
used_native_specific_option = true;
|
||||
} else if (key == "per_process_memory_fraction") {
|
||||
i = parsePerProcessMemoryFraction(tokenizer, i);
|
||||
used_native_specific_option = true;
|
||||
} else {
|
||||
const auto& keys =
|
||||
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
|
||||
@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
|
||||
return i;
|
||||
}
|
||||
|
||||
double CUDAAllocatorConfig::parsePerProcessMemoryFraction(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
size_t i) {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
double val_env = tokenizer.toDouble(++i);
|
||||
TORCH_CHECK_VALUE(
|
||||
val_env >= 0.0 && val_env <= 1.0,
|
||||
"per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
|
||||
m_per_process_memory_fraction = val_env;
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
size_t i) {
|
||||
|
||||
@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
return instance().m_graph_capture_record_stream_reuse;
|
||||
}
|
||||
|
||||
static double per_process_memory_fraction() {
|
||||
return instance().m_per_process_memory_fraction;
|
||||
}
|
||||
|
||||
/** Pinned memory allocator settings */
|
||||
static bool pinned_use_cuda_host_register() {
|
||||
return instance().m_pinned_use_cuda_host_register;
|
||||
@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
"pinned_use_hip_host_register",
|
||||
"graph_capture_record_stream_reuse",
|
||||
"pinned_reserve_segment_size_mb",
|
||||
"pinned_num_register_threads"};
|
||||
"pinned_num_register_threads",
|
||||
"per_process_memory_fraction"};
|
||||
return keys;
|
||||
}
|
||||
|
||||
@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
size_t parseGraphCaptureRecordStreamReuse(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
size_t i);
|
||||
double parsePerProcessMemoryFraction(
|
||||
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
|
||||
size_t i);
|
||||
|
||||
std::atomic<size_t> m_pinned_num_register_threads{1};
|
||||
std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
|
||||
@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
std::atomic<bool> m_release_lock_on_cudamalloc{false};
|
||||
std::atomic<bool> m_pinned_use_cuda_host_register{false};
|
||||
std::atomic<bool> m_graph_capture_record_stream_reuse{false};
|
||||
std::atomic<double> m_per_process_memory_fraction{1.0};
|
||||
};
|
||||
|
||||
// Keep this for backwards compatibility
|
||||
|
||||
@ -1100,7 +1100,7 @@ class RingBuffer {
|
||||
} // anonymous namespace
|
||||
} // namespace Native
|
||||
|
||||
static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
|
||||
static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
|
||||
#ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
|
||||
void* nvml_handle = DriverAPI::get_nvml_handle();
|
||||
if (!nvml_handle) {
|
||||
@ -1111,9 +1111,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
|
||||
return true;
|
||||
}();
|
||||
|
||||
cudaDeviceProp prop{};
|
||||
C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
||||
|
||||
// NOLINTNEXTLINE(*-c-arrays)
|
||||
char pci_id[80];
|
||||
snprintf(
|
||||
@ -1215,14 +1212,16 @@ class DeviceCachingAllocator {
|
||||
// record used memory.
|
||||
size_t total_allocated_memory = 0;
|
||||
|
||||
size_t allowed_memory_maximum = 0;
|
||||
cudaDeviceProp device_prop;
|
||||
|
||||
// maximum amount of memory that device is allowed to
|
||||
// allocate. This is set iff memory fraction is less than 1
|
||||
std::optional<size_t> allowed_memory_maximum{std::nullopt};
|
||||
|
||||
// all live expandable segments
|
||||
std::vector<ExpandableSegment*> expandable_segments_;
|
||||
std::vector<c10::DeviceIndex> devices_with_peer_access_;
|
||||
|
||||
bool set_fraction = false;
|
||||
|
||||
bool record_history = false;
|
||||
|
||||
std::atomic<CreateContextFn> context_recorder_;
|
||||
@ -1264,6 +1263,9 @@ class DeviceCachingAllocator {
|
||||
: device_id(id),
|
||||
large_blocks(/*small=*/false),
|
||||
small_blocks(/*small=*/true) {
|
||||
C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));
|
||||
|
||||
setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
|
||||
stats.max_split_size =
|
||||
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
|
||||
context_recorder_.store(nullptr);
|
||||
@ -1399,7 +1401,7 @@ class DeviceCachingAllocator {
|
||||
if (!block_found) {
|
||||
// Do garbage collection if the flag is set.
|
||||
if (C10_UNLIKELY(
|
||||
set_fraction &&
|
||||
allowed_memory_maximum.has_value() &&
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() >
|
||||
0.0)) {
|
||||
garbage_collect_cached_blocks(context);
|
||||
@ -1456,11 +1458,12 @@ class DeviceCachingAllocator {
|
||||
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
|
||||
std::string allowed_info;
|
||||
|
||||
if (set_fraction) {
|
||||
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
|
||||
if (allowed_memory_maximum.has_value()) {
|
||||
allowed_info =
|
||||
format_size(allowed_memory_maximum.value()) + " allowed; ";
|
||||
}
|
||||
|
||||
std::string proc_info = reportProcessMemoryInfo(device_id);
|
||||
std::string proc_info = reportProcessMemoryInfo(device_prop);
|
||||
|
||||
record_trace(
|
||||
TraceEntry::OOM,
|
||||
@ -1518,7 +1521,7 @@ class DeviceCachingAllocator {
|
||||
for (const auto& obs : observers_local) {
|
||||
obs(device_id,
|
||||
alloc_size,
|
||||
set_fraction ? allowed_memory_maximum : device_total,
|
||||
allowed_memory_maximum.value_or(device_total),
|
||||
device_free);
|
||||
}
|
||||
|
||||
@ -2015,25 +2018,26 @@ class DeviceCachingAllocator {
|
||||
|
||||
/** get memory fraction limiting maximum allocated memory **/
|
||||
double getMemoryFraction() {
|
||||
if (!set_fraction) {
|
||||
if (!allowed_memory_maximum.has_value()) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
|
||||
return static_cast<double>(allowed_memory_maximum) /
|
||||
static_cast<double>(device_total);
|
||||
return static_cast<double>(allowed_memory_maximum.value()) /
|
||||
static_cast<double>(device_prop.totalGlobalMem);
|
||||
}
|
||||
|
||||
/** set memory fraction to limit maximum allocated memory **/
|
||||
void setMemoryFraction(double fraction) {
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
|
||||
allowed_memory_maximum =
|
||||
static_cast<size_t>(fraction * static_cast<double>(device_total));
|
||||
set_fraction = true;
|
||||
TORCH_CHECK(
|
||||
0 <= fraction && fraction <= 1,
|
||||
"invalid fraction:",
|
||||
fraction,
|
||||
". Please set within [0, 1].");
|
||||
allowed_memory_maximum = std::nullopt;
|
||||
if (fraction < 1.0) {
|
||||
allowed_memory_maximum = static_cast<size_t>(
|
||||
fraction * static_cast<double>(device_prop.totalGlobalMem));
|
||||
}
|
||||
}
|
||||
|
||||
/** get expandable segment size for all the streams on device **/
|
||||
@ -3010,7 +3014,7 @@ class DeviceCachingAllocator {
|
||||
BlockPool& pool = *p.pool;
|
||||
|
||||
if (C10_UNLIKELY(
|
||||
set_fraction &&
|
||||
allowed_memory_maximum.has_value() &&
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
// Track block reuse interval only when garbage collection is enabled.
|
||||
++pool.get_free_blocks_call_count;
|
||||
@ -3083,7 +3087,7 @@ class DeviceCachingAllocator {
|
||||
|
||||
size_t gc_threshold = static_cast<size_t>(
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() *
|
||||
static_cast<double>(allowed_memory_maximum));
|
||||
static_cast<double>(allowed_memory_maximum.value()));
|
||||
// No need to trigger GC yet
|
||||
if (total_allocated_memory <= gc_threshold) {
|
||||
return;
|
||||
@ -3161,8 +3165,8 @@ class DeviceCachingAllocator {
|
||||
|
||||
bool active_pool =
|
||||
p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
|
||||
if (set_fraction &&
|
||||
total_allocated_memory + size > allowed_memory_maximum) {
|
||||
if (allowed_memory_maximum.has_value() &&
|
||||
total_allocated_memory + size > allowed_memory_maximum.value()) {
|
||||
p.err = cudaErrorMemoryAllocation;
|
||||
return false;
|
||||
// Temporarily disable checkpointing & cudagraphs internally
|
||||
@ -3859,7 +3863,6 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
"Allocator not initialized for device ",
|
||||
device,
|
||||
": did you call init?");
|
||||
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
|
||||
return device_allocator[device]->getMemoryFraction();
|
||||
}
|
||||
|
||||
@ -3869,12 +3872,6 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
"Allocator not initialized for device ",
|
||||
device,
|
||||
": did you call init?");
|
||||
TORCH_CHECK(
|
||||
0 <= fraction && fraction <= 1,
|
||||
"invalid fraction:",
|
||||
fraction,
|
||||
". Please set within [0, 1].");
|
||||
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
|
||||
device_allocator[device]->setMemoryFraction(fraction);
|
||||
}
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
#include <c10/core/AllocatorConfig.h>
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/cuda/CUDAAllocatorConfig.h>
|
||||
#include <c10/cuda/CUDAGraphsC10Utils.h>
|
||||
#include <c10/cuda/CUDAMacros.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
|
||||
// on the current device each later call sees.
|
||||
void init(int dev_count) override {
|
||||
static bool called = [](int dev_count) {
|
||||
;
|
||||
// Are there external guarantees init will be called before
|
||||
// any of the allocator's other functions?
|
||||
// std::lock_guard<std::mutex> lk(general_mutex);
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/SmallVector.h>
|
||||
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
@ -40,200 +41,99 @@ namespace c10 {
|
||||
///
|
||||
/// This is intended to be trivially copyable, so it should be passed by
|
||||
/// value.
|
||||
///
|
||||
/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct
|
||||
/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of
|
||||
/// the underlying constexpr calls, we rely on apparent-type dispatch for
|
||||
/// inheritance. This should be fine because their memory format is the same,
|
||||
/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods.
|
||||
/// However, you should prefer to use ArrayRef when possible, because its use
|
||||
/// of TORCH_CHECK will lead to better user-facing error messages.
|
||||
template <typename T>
|
||||
class ArrayRef final {
|
||||
class ArrayRef final : public HeaderOnlyArrayRef<T> {
|
||||
public:
|
||||
using iterator = const T*;
|
||||
using const_iterator = const T*;
|
||||
using size_type = size_t;
|
||||
using value_type = T;
|
||||
|
||||
using reverse_iterator = std::reverse_iterator<iterator>;
|
||||
|
||||
private:
|
||||
/// The start of the array, in an external buffer.
|
||||
const T* Data;
|
||||
|
||||
/// The number of elements.
|
||||
size_type Length;
|
||||
|
||||
void debugCheckNullptrInvariant() {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
Data != nullptr || Length == 0,
|
||||
"created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal");
|
||||
}
|
||||
|
||||
public:
|
||||
/// @name Constructors
|
||||
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
|
||||
/// SmallVector. As inherited constructors won't work with class template
|
||||
/// argument deduction (CTAD) until C++23, we add deduction guides after
|
||||
/// the class definition to enable CTAD.
|
||||
/// @{
|
||||
|
||||
/// Construct an empty ArrayRef.
|
||||
/* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {}
|
||||
|
||||
/// Construct an ArrayRef from a single element.
|
||||
// TODO Make this explicit
|
||||
constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {}
|
||||
|
||||
/// Construct an ArrayRef from a pointer and length.
|
||||
constexpr ArrayRef(const T* data, size_t length)
|
||||
: Data(data), Length(length) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a range.
|
||||
constexpr ArrayRef(const T* begin, const T* end)
|
||||
: Data(begin), Length(end - begin) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef;
|
||||
|
||||
/// Construct an ArrayRef from a SmallVector. This is templated in order to
|
||||
/// avoid instantiating SmallVectorTemplateCommon<T> whenever we
|
||||
/// copy-construct an ArrayRef.
|
||||
/// NOTE: this is the only constructor that is not inherited from
|
||||
/// HeaderOnlyArrayRef.
|
||||
template <typename U>
|
||||
/* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec)
|
||||
: Data(Vec.data()), Length(Vec.size()) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
template <
|
||||
typename Container,
|
||||
typename U = decltype(std::declval<Container>().data()),
|
||||
typename = std::enable_if_t<
|
||||
(std::is_same_v<U, T*> || std::is_same_v<U, T const*>)>>
|
||||
/* implicit */ ArrayRef(const Container& container)
|
||||
: Data(container.data()), Length(container.size()) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a std::vector.
|
||||
// The enable_if stuff here makes sure that this isn't used for
|
||||
// std::vector<bool>, because ArrayRef can't work on a std::vector<bool>
|
||||
// bitfield.
|
||||
template <typename A>
|
||||
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
|
||||
: Data(Vec.data()), Length(Vec.size()) {
|
||||
static_assert(
|
||||
!std::is_same_v<T, bool>,
|
||||
"ArrayRef<bool> cannot be constructed from a std::vector<bool> bitfield.");
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a std::array
|
||||
template <size_t N>
|
||||
/* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr)
|
||||
: Data(Arr.data()), Length(N) {}
|
||||
|
||||
/// Construct an ArrayRef from a C array.
|
||||
template <size_t N>
|
||||
// NOLINTNEXTLINE(*c-arrays*)
|
||||
/* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
|
||||
|
||||
/// Construct an ArrayRef from a std::initializer_list.
|
||||
/* implicit */ constexpr ArrayRef(const std::initializer_list<T>& Vec)
|
||||
: Data(
|
||||
std::begin(Vec) == std::end(Vec) ? static_cast<T*>(nullptr)
|
||||
: std::begin(Vec)),
|
||||
Length(Vec.size()) {}
|
||||
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
|
||||
|
||||
/// @}
|
||||
/// @name Simple Operations
|
||||
/// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef
|
||||
/// @{
|
||||
|
||||
constexpr iterator begin() const {
|
||||
return Data;
|
||||
}
|
||||
constexpr iterator end() const {
|
||||
return Data + Length;
|
||||
}
|
||||
|
||||
// These are actually the same as iterator, since ArrayRef only
|
||||
// gives you const iterators.
|
||||
constexpr const_iterator cbegin() const {
|
||||
return Data;
|
||||
}
|
||||
constexpr const_iterator cend() const {
|
||||
return Data + Length;
|
||||
}
|
||||
|
||||
constexpr reverse_iterator rbegin() const {
|
||||
return reverse_iterator(end());
|
||||
}
|
||||
constexpr reverse_iterator rend() const {
|
||||
return reverse_iterator(begin());
|
||||
}
|
||||
|
||||
/// Check if all elements in the array satisfy the given expression
|
||||
constexpr bool allMatch(const std::function<bool(const T&)>& pred) const {
|
||||
return std::all_of(cbegin(), cend(), pred);
|
||||
}
|
||||
|
||||
/// empty - Check if the array is empty.
|
||||
constexpr bool empty() const {
|
||||
return Length == 0;
|
||||
}
|
||||
|
||||
constexpr const T* data() const {
|
||||
return Data;
|
||||
}
|
||||
|
||||
/// size - Get the array size.
|
||||
constexpr size_t size() const {
|
||||
return Length;
|
||||
}
|
||||
|
||||
/// front - Get the first element.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& front() const {
|
||||
TORCH_CHECK(
|
||||
!empty(), "ArrayRef: attempted to access front() of empty list");
|
||||
return Data[0];
|
||||
!this->empty(), "ArrayRef: attempted to access front() of empty list");
|
||||
return this->Data[0];
|
||||
}
|
||||
|
||||
/// back - Get the last element.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& back() const {
|
||||
TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list");
|
||||
return Data[Length - 1];
|
||||
}
|
||||
|
||||
/// equals - Check for element-wise equality.
|
||||
constexpr bool equals(ArrayRef RHS) const {
|
||||
return Length == RHS.Length && std::equal(begin(), end(), RHS.begin());
|
||||
TORCH_CHECK(
|
||||
!this->empty(), "ArrayRef: attempted to access back() of empty list");
|
||||
return this->Data[this->Length - 1];
|
||||
}
|
||||
|
||||
/// slice(n, m) - Take M elements of the array starting at element N
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr ArrayRef<T> slice(size_t N, size_t M) const {
|
||||
TORCH_CHECK(
|
||||
N + M <= size(),
|
||||
N + M <= this->size(),
|
||||
"ArrayRef: invalid slice, N = ",
|
||||
N,
|
||||
"; M = ",
|
||||
M,
|
||||
"; size = ",
|
||||
size());
|
||||
return ArrayRef<T>(data() + N, M);
|
||||
this->size());
|
||||
return ArrayRef<T>(this->data() + N, M);
|
||||
}
|
||||
|
||||
/// slice(n) - Chop off the first N elements of the array.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr ArrayRef<T> slice(size_t N) const {
|
||||
TORCH_CHECK(
|
||||
N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size());
|
||||
return slice(N, size() - N);
|
||||
N <= this->size(),
|
||||
"ArrayRef: invalid slice, N = ",
|
||||
N,
|
||||
"; size = ",
|
||||
this->size());
|
||||
return slice(N, this->size() - N); // should this slice be this->slice?
|
||||
}
|
||||
|
||||
/// @}
|
||||
/// @name Operator Overloads
|
||||
/// @{
|
||||
constexpr const T& operator[](size_t Index) const {
|
||||
return Data[Index];
|
||||
}
|
||||
|
||||
/// Vector compatibility
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& at(size_t Index) const {
|
||||
TORCH_CHECK(
|
||||
Index < Length,
|
||||
Index < this->Length,
|
||||
"ArrayRef: invalid index Index = ",
|
||||
Index,
|
||||
"; Length = ",
|
||||
Length);
|
||||
return Data[Index];
|
||||
this->Length);
|
||||
return this->Data[Index];
|
||||
}
|
||||
|
||||
/// Disallow accidental assignment from a temporary.
|
||||
@ -253,16 +153,48 @@ class ArrayRef final {
|
||||
std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
|
||||
std::initializer_list<U>) = delete;
|
||||
|
||||
/// @}
|
||||
/// @name Expensive Operations
|
||||
/// @{
|
||||
std::vector<T> vec() const {
|
||||
return std::vector<T>(Data, Data + Length);
|
||||
}
|
||||
|
||||
/// @}
|
||||
};
|
||||
|
||||
/// Deduction guides for ArrayRef to support CTAD with inherited constructors
|
||||
/// These mirror the constructors inherited from HeaderOnlyArrayRef
|
||||
/// @{
|
||||
|
||||
// Single element constructor
|
||||
template <typename T>
|
||||
ArrayRef(const T&) -> ArrayRef<T>;
|
||||
|
||||
// Pointer and length constructor
|
||||
template <typename T>
|
||||
ArrayRef(const T*, size_t) -> ArrayRef<T>;
|
||||
|
||||
// Range constructor (begin, end)
|
||||
template <typename T>
|
||||
ArrayRef(const T*, const T*) -> ArrayRef<T>;
|
||||
|
||||
// Generic container constructor (anything with .data() and .size())
|
||||
template <typename Container>
|
||||
ArrayRef(const Container&) -> ArrayRef<
|
||||
std::remove_pointer_t<decltype(std::declval<Container>().data())>>;
|
||||
|
||||
// std::vector constructor
|
||||
template <typename T, typename A>
|
||||
ArrayRef(const std::vector<T, A>&) -> ArrayRef<T>;
|
||||
|
||||
// std::array constructor
|
||||
template <typename T, size_t N>
|
||||
ArrayRef(const std::array<T, N>&) -> ArrayRef<T>;
|
||||
|
||||
// C array constructor
|
||||
template <typename T, size_t N>
|
||||
ArrayRef(const T (&)[N]) -> ArrayRef<T>;
|
||||
|
||||
// std::initializer_list constructor
|
||||
template <typename T>
|
||||
ArrayRef(const std::initializer_list<T>&) -> ArrayRef<T>;
|
||||
|
||||
/// @}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& out, ArrayRef<T> list) {
|
||||
int i = 0;
|
||||
|
||||
@ -1307,7 +1307,7 @@ endif()
|
||||
|
||||
if(USE_MKLDNN_ACL)
|
||||
find_package(ACL REQUIRED)
|
||||
target_include_directories(torch_cpu PRIVATE ${ACL_INCLUDE_DIRS})
|
||||
target_include_directories(torch_cpu SYSTEM PRIVATE ${ACL_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
||||
target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE})
|
||||
|
||||
@ -26,7 +26,7 @@ find_library(Gloo_CUDA_LIBRARY
|
||||
# if Gloo + HIP is desired, Gloo_HIP_LIBRARY
|
||||
# needs to be linked to desired target
|
||||
find_library(Gloo_HIP_LIBRARY
|
||||
NAMES gloo_hiop
|
||||
NAMES gloo_hip
|
||||
DOC "Gloo's HIP support/code"
|
||||
)
|
||||
|
||||
|
||||
@ -28,6 +28,15 @@ endif()
|
||||
# Find CUDA.
|
||||
find_package(CUDA)
|
||||
if(NOT CUDA_FOUND)
|
||||
# If user explicitly set USE_CUDA=1, error out instead of falling back
|
||||
if(_USE_CUDA_EXPLICITLY_SET AND USE_CUDA)
|
||||
message(FATAL_ERROR
|
||||
"PyTorch: CUDA was explicitly requested (USE_CUDA=1) but cannot be found. "
|
||||
"Please check your CUDA installation, ensure CUDA toolkit is installed, "
|
||||
"and that CUDA_HOME or CMAKE_CUDA_COMPILER is set correctly. "
|
||||
"If you want to build without CUDA, please set USE_CUDA=0.")
|
||||
endif()
|
||||
|
||||
message(WARNING
|
||||
"PyTorch: CUDA cannot be found. Depending on whether you are building "
|
||||
"PyTorch or a PyTorch dependent library, the next warning / error will "
|
||||
|
||||
@ -45,7 +45,7 @@ supported for complex tensors.
|
||||
## Transition from the old representation
|
||||
|
||||
Users who currently worked around the lack of complex tensors with real tensors of shape {math}`(..., 2)`
|
||||
can easily to switch using the complex tensors in their code using {func}`torch.view_as_complex`
|
||||
can easily switch to using the complex tensors in their code using {func}`torch.view_as_complex`
|
||||
and {func}`torch.view_as_real`. Note that these functions don’t perform any copy and return a
|
||||
view of the input tensor.
|
||||
|
||||
@ -140,7 +140,7 @@ through the same optimizer on the {func}`torch.view_as_real` equivalent of the c
|
||||
|
||||
`real_optim` and `complex_optim` will compute the same updates on the parameters, though there may be slight numerical
|
||||
discrepancies between the two optimizers, similar to numerical discrepancies between foreach vs forloop optimizers
|
||||
and capturable vs default optimizers. For more details, see [numbercial accuracy](https://pytorch.org/docs/stable/notes/numerical_accuracy.html).
|
||||
and capturable vs default optimizers. For more details, see [numerical accuracy](https://pytorch.org/docs/stable/notes/numerical_accuracy.html).
|
||||
|
||||
Specifically, while you can think of our optimizer's handling of complex tensors as the same as optimizing over their
|
||||
`p.real` and `p.imag` pieces separately, the implementation details are not precisely that. Note that the
|
||||
|
||||
@ -206,6 +206,41 @@ templates_path = [
|
||||
os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
|
||||
]
|
||||
# TODO: document these and remove them from here.
|
||||
# Fixes the duplicated
|
||||
autosummary_filename_map = {
|
||||
"torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
|
||||
"torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
|
||||
"torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
|
||||
"torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
|
||||
"torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
|
||||
"torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
|
||||
"torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
|
||||
"torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
|
||||
"torch.optim.radam.radam": "torch.optim.radam.radam_function",
|
||||
"torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
|
||||
"torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
|
||||
"torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
|
||||
"torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
|
||||
"torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
|
||||
"torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
|
||||
"torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
|
||||
"torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
|
||||
"torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
|
||||
"torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
|
||||
"torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
|
||||
"torch.optim.adam.adam": "torch.optim.adam.adam_function",
|
||||
"torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
|
||||
"torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
|
||||
"torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
|
||||
"torch.mtia.stream": "torch.mtia.stream_function",
|
||||
"torch.mtia.Stream": "torch.mtia.Stream_class",
|
||||
"torch.cpu.stream": "torch.cpu.stream_function",
|
||||
"torch.cpu.Stream": "torch.cpu.Stream_class",
|
||||
"torch.cuda.stream": "torch.cuda.stream_function",
|
||||
"torch.cuda.Stream": "torch.cuda.Stream_class",
|
||||
"torch.xpu.stream": "torch.xpu.stream_function",
|
||||
"torch.xpu.Stream": "torch.xpu.Stream_class",
|
||||
}
|
||||
|
||||
coverage_ignore_functions = [
|
||||
# torch
|
||||
@ -3195,6 +3230,11 @@ autodoc_type_aliases = {
|
||||
# Enable overriding of function signatures in the first line of the docstring.
|
||||
autodoc_docstring_signature = True
|
||||
|
||||
# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
|
||||
autodoc_default_options = {
|
||||
"exclude-members": "from_bytes, to_bytes",
|
||||
}
|
||||
|
||||
# -- katex javascript in header
|
||||
#
|
||||
# def setup(app):
|
||||
|
||||
@ -394,6 +394,10 @@ an opaque group handle that can be given as a `group` argument to all collective
|
||||
.. autofunction:: new_group
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autofunction:: torch.distributed.distributed_c10d.shrink_group
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autofunction:: get_group_rank
|
||||
```
|
||||
|
||||
@ -619,6 +619,10 @@ Available options:
|
||||
and reallocate buffers across multiple streams, especially when the capture DAG frequently
|
||||
reaches joined frontiers.
|
||||
|
||||
* ``per_process_memory_fraction`` option limits the amount of memory that can be allocated
|
||||
on all the CUDA devices to a specified fraction of the available memory. This is a value
|
||||
between 0 and 1. Attempting to allocate more memory will raise an out of memory error.
|
||||
|
||||
.. note::
|
||||
|
||||
Some stats reported by the
|
||||
@ -1720,6 +1724,16 @@ and can be used to share memory across graphs as shown::
|
||||
g1.replay()
|
||||
g2.replay()
|
||||
|
||||
It's also safe to share a memory pool across separate graphs that do not depend
|
||||
on each other's outputs, provided they never run concurrently.
|
||||
Be aware that replaying one graph can clobber another graph's outputs when
|
||||
they share a pool, unless :meth:`~torch.Tensor.clone` is called on the outputs
|
||||
beforehand.
|
||||
This pattern is frequently used in inference servers that accept variable batch
|
||||
sizes at runtime.
|
||||
vLLM is a notable example; see `here <https://github.com/vllm-project/vllm/blob/938a81692ea318e59ead4750e7e7425bfd6a4896/vllm/platforms/interface.py#L508-L515>`__
|
||||
and `here <https://github.com/vllm-project/vllm/blob/938a81692ea318e59ead4750e7e7425bfd6a4896/vllm/compilation/cuda_graph.py#L86-L89>`__.
|
||||
|
||||
With :func:`torch.cuda.make_graphed_callables`, if you want to graph several
|
||||
callables and you know they'll always run in the same order (and never concurrently)
|
||||
pass them as a tuple in the same order they'll run in the live workload, and
|
||||
|
||||
@ -253,7 +253,6 @@ regular full-precision tensor.
|
||||
.. autosummary::
|
||||
:toctree: generated
|
||||
:nosignatures:
|
||||
:template: classtemplate.rst
|
||||
|
||||
view
|
||||
as_strided
|
||||
|
||||
34
setup.py
34
setup.py
@ -630,37 +630,6 @@ def mirror_files_into_torchgen() -> None:
|
||||
raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")
|
||||
|
||||
|
||||
def mirror_inductor_external_kernels() -> None:
|
||||
"""
|
||||
Copy external kernels into Inductor so they are importable.
|
||||
"""
|
||||
paths = [
|
||||
(
|
||||
CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
|
||||
CWD
|
||||
/ "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
|
||||
),
|
||||
]
|
||||
for new_path, orig_path in paths:
|
||||
# Create the dirs involved in new_path if they don't exist
|
||||
if not new_path.exists():
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy the files from the orig location to the new location
|
||||
if orig_path.is_file():
|
||||
shutil.copyfile(orig_path, new_path)
|
||||
continue
|
||||
if orig_path.is_dir():
|
||||
if new_path.exists():
|
||||
# copytree fails if the tree exists already, so remove it.
|
||||
shutil.rmtree(new_path)
|
||||
shutil.copytree(orig_path, new_path)
|
||||
continue
|
||||
raise RuntimeError(
|
||||
"Check the file paths in `mirror_inductor_external_kernels()`"
|
||||
)
|
||||
|
||||
|
||||
# ATTENTION: THIS IS AI SLOP
|
||||
def extract_variant_from_version(version: str) -> str:
|
||||
"""Extract variant from version string, defaulting to 'cpu'."""
|
||||
@ -1647,8 +1616,6 @@ def main() -> None:
|
||||
if RUN_BUILD_DEPS:
|
||||
build_deps()
|
||||
|
||||
mirror_inductor_external_kernels()
|
||||
|
||||
(
|
||||
ext_modules,
|
||||
cmdclass,
|
||||
@ -1682,7 +1649,6 @@ def main() -> None:
|
||||
"_inductor/codegen/aoti_runtime/*.cpp",
|
||||
"_inductor/script.ld",
|
||||
"_inductor/kernel/flex/templates/*.jinja",
|
||||
"_inductor/kernel/templates/*.jinja",
|
||||
"_export/serde/*.yaml",
|
||||
"_export/serde/*.thrift",
|
||||
"share/cmake/ATen/*.cmake",
|
||||
|
||||
@ -75,6 +75,7 @@ class TestScheduler(TestCase):
|
||||
|
||||
class TestCubicScheduler(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.model_sparse_config = [
|
||||
{"tensor_fqn": "0.weight", "sparsity_level": 0.8},
|
||||
{"tensor_fqn": "2.weight", "sparsity_level": 0.4},
|
||||
|
||||
@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import IS_LINUX, run_tests, TestCase
|
||||
@unittest.skipIf(not IS_LINUX, "Only works on linux")
|
||||
class TestTorchrun(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self._test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__)
|
||||
|
||||
def tearDown(self):
|
||||
|
||||
@ -12,6 +12,7 @@ set(AOTI_ABI_CHECK_TEST_SRCS
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp
|
||||
@ -44,6 +45,10 @@ endif()
|
||||
# Disable unused-variable warnings for variables that are only used to test compilation
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
|
||||
# Add -Wno-dangling-pointer for GCC 13
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
|
||||
target_compile_options_if_supported(test_aoti_abi_check -Wno-dangling-pointer)
|
||||
endif()
|
||||
|
||||
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
|
||||
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
|
||||
|
||||
52
test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp
Normal file
52
test/cpp/aoti_abi_check/test_headeronlyarrayref.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
using torch::headeronly::HeaderOnlyArrayRef;
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestEmpty) {
|
||||
HeaderOnlyArrayRef<float> arr;
|
||||
ASSERT_TRUE(arr.empty());
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestSingleton) {
|
||||
float val = 5.0f;
|
||||
HeaderOnlyArrayRef<float> arr(val);
|
||||
ASSERT_FALSE(arr.empty());
|
||||
EXPECT_EQ(arr.size(), 1);
|
||||
EXPECT_EQ(arr[0], val);
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestAPIs) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr(vec);
|
||||
ASSERT_FALSE(arr.empty());
|
||||
EXPECT_EQ(arr.size(), 7);
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
EXPECT_EQ(arr[i], i + 1);
|
||||
EXPECT_EQ(arr.at(i), i + 1);
|
||||
}
|
||||
EXPECT_EQ(arr.front(), 1);
|
||||
EXPECT_EQ(arr.back(), 7);
|
||||
ASSERT_TRUE(arr.slice(3, 4).equals(arr.slice(3)));
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestFromInitializerList) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr({1, 2, 3, 4, 5, 6, 7});
|
||||
auto res_vec = arr.vec();
|
||||
for (size_t i = 0; i < vec.size(); i++) {
|
||||
EXPECT_EQ(vec[i], res_vec[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestFromRange) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr(vec.data() + 3, vec.data() + 7);
|
||||
auto res_vec = arr.vec();
|
||||
for (size_t i = 0; i < res_vec.size(); i++) {
|
||||
EXPECT_EQ(vec[i + 3], res_vec[i]);
|
||||
}
|
||||
}
|
||||
@ -70,6 +70,13 @@ if(NOT MSVC)
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
|
||||
target_compile_options_if_supported(test_api "-Wno-error=nonnull")
|
||||
endif()
|
||||
|
||||
# Add -Wno-error=array-bounds for GCC 13+
|
||||
# See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113239
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
|
||||
target_compile_options_if_supported(test_api "-Wno-error=array-bounds")
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
if(INSTALL_TEST)
|
||||
|
||||
@ -64,7 +64,7 @@ def run(initializer):
|
||||
|
||||
def main():
|
||||
initializer_parameter_map = {}
|
||||
for initializer in INITIALIZERS.keys():
|
||||
for initializer in INITIALIZERS:
|
||||
sys.stderr.write(f"Evaluating {initializer} ...\n")
|
||||
initializer_parameter_map[initializer] = run(initializer)
|
||||
|
||||
|
||||
@ -130,7 +130,7 @@ def main():
|
||||
options = parser.parse_args()
|
||||
|
||||
optimizer_parameter_map = {}
|
||||
for optimizer in OPTIMIZERS.keys():
|
||||
for optimizer in OPTIMIZERS:
|
||||
sys.stderr.write(f"Evaluating {optimizer} ...\n")
|
||||
optimizer_parameter_map[optimizer] = run(
|
||||
optimizer, options.iterations, options.sample_every
|
||||
|
||||
@ -47,20 +47,10 @@ Tensor sgd_out_of_place(
|
||||
STD_TORCH_CHECK(param.get_device() == -1, "CPU device index = -1");
|
||||
STD_TORCH_CHECK(param.get_device_index() == -1, "CPU device index = -1");
|
||||
|
||||
int64_t *param_sizes;
|
||||
int64_t *param_strides;
|
||||
aoti_torch_get_sizes(param.get(), ¶m_sizes);
|
||||
aoti_torch_get_strides(param.get(), ¶m_strides);
|
||||
// testing Tensor strides + stride
|
||||
STD_TORCH_CHECK(param.strides()[0] == param.stride(0));
|
||||
|
||||
int32_t param_dtype;
|
||||
aoti_torch_get_dtype(param.get(), ¶m_dtype);
|
||||
|
||||
int32_t param_device_type;
|
||||
aoti_torch_get_device_type(param.get(), ¶m_device_type);
|
||||
|
||||
AtenTensorHandle out_ath;
|
||||
aoti_torch_empty_strided(param.dim(), param_sizes, param_strides, param_dtype, param_device_type, param.get_device(), &out_ath);
|
||||
auto out = Tensor(out_ath);
|
||||
auto out = new_empty(param, param.sizes());
|
||||
|
||||
sgd_math(
|
||||
reinterpret_cast<float*>(param.data_ptr()),
|
||||
@ -311,10 +301,9 @@ void boxed_fill_infinity(
|
||||
}
|
||||
|
||||
Tensor my_pad(Tensor t) {
|
||||
std::vector<int64_t> padding = {1, 2, 2, 1};
|
||||
std::string mode = "constant";
|
||||
double value = 0.0;
|
||||
return pad(t, padding, mode, value);
|
||||
return pad(t, {1, 2, 2, 1}, mode, value);
|
||||
}
|
||||
|
||||
void boxed_my_pad(
|
||||
@ -342,6 +331,11 @@ void boxed_my_narrow(
|
||||
}
|
||||
|
||||
Tensor my_new_empty_dtype_variant(Tensor t) {
|
||||
// Still using a std::vector below even though people can just pass in an
|
||||
// initializer list (which will be implicitly converted to an HeaderOnlyArrayRef)
|
||||
// directly.
|
||||
// This is to test that passing in a std::vector works for BC. (It gets
|
||||
// implicitly converted to HeaderOnlyArrayRef too!)
|
||||
std::vector<int64_t> sizes = {2, 5};
|
||||
auto dtype = std::make_optional(torch::headeronly::ScalarType::BFloat16);
|
||||
return new_empty(t, sizes, dtype);
|
||||
@ -353,9 +347,8 @@ void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, ui
|
||||
}
|
||||
|
||||
Tensor my_new_zeros_dtype_variant(Tensor t) {
|
||||
std::vector<int64_t> sizes = {2, 5};
|
||||
auto dtype = std::make_optional(at::ScalarType::Float);
|
||||
return new_zeros(t, sizes, dtype);
|
||||
return new_zeros(t, {2, 5}, dtype);
|
||||
}
|
||||
|
||||
void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
@ -429,8 +422,7 @@ void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs)
|
||||
}
|
||||
|
||||
Tensor my_amax_vec(Tensor t) {
|
||||
std::vector<int64_t> v = {0,1};
|
||||
return amax(t, v, false);
|
||||
return amax(t, {0,1}, false);
|
||||
}
|
||||
|
||||
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
|
||||
@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class TestCustomBackend(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Load the library containing the custom backend.
|
||||
self.library_path = get_custom_backend_library_path()
|
||||
torch.ops.load_library(self.library_path)
|
||||
|
||||
@ -18,6 +18,7 @@ torch.ops.import_module("pointwise")
|
||||
|
||||
class TestCustomOperators(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.library_path = get_custom_op_library_path()
|
||||
ops.load_library(self.library_path)
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class TestMakeCheckpointer(TestCase):
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
# Create a temporary directory for checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
|
||||
@ -161,6 +161,7 @@ class TestCheckpointProcessConfig(TestCase):
|
||||
|
||||
class TestCheckpointProcess(TestCase):
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
"""Set up common test fixtures."""
|
||||
self.rank_info = RankInfo(
|
||||
global_world_size=1,
|
||||
|
||||
@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class TestCheckpointReader(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Create a temporary directory for test checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
|
||||
@ -52,6 +52,7 @@ class TestCheckpointWriterConfig(TestCase):
|
||||
|
||||
class TestCheckpointWriter(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Create a temporary directory for test checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
|
||||
@ -52,6 +52,7 @@ class TestCheckpointer(TestCase):
|
||||
"""Parameterized tests that work with both sync and async checkpointers."""
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Create a temporary directory for checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@ -397,6 +398,7 @@ class TestAsyncCheckpointerSpecific(TestCase):
|
||||
"""Tests specific to AsyncCheckpointer functionality."""
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Create a temporary directory for checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ from torch.testing._internal.common_utils import requires_cuda, run_tests, TestC
|
||||
|
||||
class TestDefaultStager(TestCase):
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
# Create a test state dictionary with various data types
|
||||
self.state_dict = {
|
||||
"model": torch.nn.Linear(10, 5).state_dict(),
|
||||
|
||||
@ -208,7 +208,7 @@ class TestSingleRankSaveLoad(TestCase):
|
||||
|
||||
# Create model.safetensors.index.json with weight mapping
|
||||
weight_map = {}
|
||||
for key in quantized_checkpoint.keys():
|
||||
for key in quantized_checkpoint:
|
||||
weight_map[key] = "model.safetensors"
|
||||
|
||||
index_data = {
|
||||
@ -245,7 +245,7 @@ class TestSingleRankSaveLoad(TestCase):
|
||||
sorted(original_tensors.keys()), sorted(state_dict_to_load.keys())
|
||||
)
|
||||
|
||||
for tensor_name in original_tensors.keys():
|
||||
for tensor_name in original_tensors:
|
||||
original = original_tensors[tensor_name]
|
||||
loaded = state_dict_to_load[tensor_name]
|
||||
|
||||
|
||||
@ -15,6 +15,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class TestQuantizedHfStorage(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
"""Set up common test fixtures."""
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
self.path = self.temp_dir.name
|
||||
|
||||
@ -21,6 +21,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class SignalHandlingTest(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Save original environment variable if it exists
|
||||
self.original_signals_env = os.environ.get(
|
||||
"TORCHELASTIC_SIGNALS_TO_HANDLE", None
|
||||
|
||||
@ -498,7 +498,7 @@ class TestFSDPMixedPrecision(FSDPTest):
|
||||
for name, tensor in state_dict.items():
|
||||
# Parameters and buffers are checkpointed in their
|
||||
# original dtypes, which may be different.
|
||||
if name in named_buffers.keys():
|
||||
if name in named_buffers:
|
||||
self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE)
|
||||
else:
|
||||
self.assertEqual(
|
||||
|
||||
@ -16,6 +16,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
class LauncherApiTest(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# Save original environment variable if it exists
|
||||
self.original_signals_env = os.environ.get(
|
||||
"TORCHELASTIC_SIGNALS_TO_HANDLE", None
|
||||
|
||||
@ -5,8 +5,16 @@ import contextlib
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch._subclasses.fake_tensor import FakeTensorMode
|
||||
from torch.distributed.tensor import DeviceMesh, DTensor, Partial, Replicate, Shard
|
||||
from torch.distributed.tensor import (
|
||||
DeviceMesh,
|
||||
distribute_tensor,
|
||||
DTensor,
|
||||
Partial,
|
||||
Replicate,
|
||||
Shard,
|
||||
)
|
||||
from torch.distributed.tensor._dtensor_spec import ShardOrderEntry
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
@ -42,22 +50,24 @@ class TestDTensorDebugMode(TestCase):
|
||||
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
|
||||
y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False)
|
||||
|
||||
with DebugMode(record_torchfunction=True) as debug_mode:
|
||||
with DebugMode(
|
||||
record_torchfunction=True, record_ids=True, record_output=True
|
||||
) as debug_mode:
|
||||
torch.mm(x_dtensor, y_dtensor).sum()
|
||||
|
||||
self.assertExpectedInline(
|
||||
debug_mode.debug_string(),
|
||||
"""\
|
||||
torch.mm(dt: f32[8, 8]| S(0), dt: f32[8, 32]| S(0))
|
||||
aten::mm(dt: f32[8, 8]| S(0), dt: f32[8, 32]| S(0))
|
||||
torch.mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0)) -> dt$6: f32[8, 32]| S(0)
|
||||
aten::mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0))
|
||||
redistribute_input(1, S(0) -> R)
|
||||
redistribute_input(t: f32[1, 32], trace: S(0)->R)
|
||||
_c10d_functional::all_gather_into_tensor(t: f32[1, 32], 8, 0)
|
||||
_c10d_functional::wait_tensor(t: f32[8, 32])
|
||||
aten::mm(t: f32[1, 8], t: f32[8, 32])
|
||||
<method 'sum' of 'torch._C.TensorBase' objects>(dt: f32[8, 32]| S(0))
|
||||
aten::sum(dt: f32[8, 32]| S(0))
|
||||
aten::sum(t: f32[1, 32])""",
|
||||
redistribute_input(t$2: f32[1, 32], trace: S(0)->R)
|
||||
_c10d_functional::all_gather_into_tensor(t$2: f32[1, 32], 8, 0) -> t$3: f32[8, 32]
|
||||
_c10d_functional::wait_tensor(t$3: f32[8, 32]) -> t$3: f32[8, 32]
|
||||
aten::mm(t$4: f32[1, 8], t$3: f32[8, 32]) -> t$5: f32[1, 32]
|
||||
<method 'sum' of 'torch._C.TensorBase' objects>(dt$6: f32[8, 32]| S(0)) -> dt$8: f32[]| P
|
||||
aten::sum(dt$6: f32[8, 32]| S(0))
|
||||
aten::sum(t$5: f32[1, 32]) -> t$7: f32[]""",
|
||||
)
|
||||
|
||||
self.assertTrue(isinstance(debug_mode.operators[0], _OpCall))
|
||||
@ -424,6 +434,31 @@ class TestDTensorDebugMode(TestCase):
|
||||
][-1]
|
||||
self.assertTrue("self.l2(self.l1(x))" in sum_op.fwd_stack_trace)
|
||||
|
||||
def test_pretty_print_dtensor_make_fx(self):
|
||||
mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
|
||||
|
||||
A = torch.randn(8, 32)
|
||||
B = torch.randn(32, 32)
|
||||
dA = distribute_tensor(A, mesh, [Shard(0)]).requires_grad_()
|
||||
dB = distribute_tensor(B, mesh, [Replicate()]).requires_grad_()
|
||||
|
||||
def f(dA, dB):
|
||||
dy = dA @ dB
|
||||
loss = dy.sum()
|
||||
loss.backward()
|
||||
return dA.grad, dB.grad
|
||||
|
||||
# We actually need the tracing_mode='fake' here, or to trace under a FakeTensorMode.
|
||||
# make_fx has some logic to ensure we don't accidentally stash real tensors in the graph
|
||||
# so we won't stash our DTensors properly if they don't hold Fake inner tensors
|
||||
gm = make_fx(f, tracing_mode="fake")(dA, dB)
|
||||
# DCE isn't necessary here, there were just a lot of dead detach() nodes that spammed the graph
|
||||
gm.graph.eliminate_dead_code()
|
||||
gm.recompile()
|
||||
# Colored is nice for actual viewing, not using in this test though
|
||||
gm_str = gm.print_readable(colored=False, print_output=False)
|
||||
self.assertTrue('"DTensor(f32[8, 32], S(0))" = torch.ops.aten.mm' in gm_str)
|
||||
|
||||
|
||||
instantiate_parametrized_tests(TestDTensorDebugMode)
|
||||
|
||||
|
||||
@ -3,7 +3,8 @@
|
||||
import itertools
|
||||
import random
|
||||
import unittest
|
||||
from typing import Any, Callable, ClassVar, Optional
|
||||
from collections.abc import Callable
|
||||
from typing import Any, ClassVar, Optional
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
@ -1189,9 +1189,7 @@ class AbstractCommTest:
|
||||
self.assertEqual(len(set(rank_to_seq_num.values())), 2)
|
||||
self.assertEqual(rank_to_seq_num[0], rank_to_seq_num[2])
|
||||
expected_same = {
|
||||
rank_to_seq_num[i]
|
||||
for i in rank_to_seq_num.keys()
|
||||
if i not in [0, 2]
|
||||
rank_to_seq_num[i] for i in rank_to_seq_num if i not in [0, 2]
|
||||
}
|
||||
self.assertEqual(len(expected_same), 1)
|
||||
self.assertEqual(rank_to_seq_num[0] + 1, rank_to_seq_num[1])
|
||||
@ -1558,7 +1556,7 @@ class CommTest(AbstractCommTest, MultiProcessTestCase):
|
||||
}
|
||||
invalid_debug_modes = ["foo", 0, 1, -1]
|
||||
|
||||
for mode in mapping.keys():
|
||||
for mode in mapping:
|
||||
os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
|
||||
dist.set_debug_level_from_env()
|
||||
set_debug_mode = dist.get_debug_level()
|
||||
|
||||
@ -2357,6 +2357,7 @@ class ReducerModule(nn.Module):
|
||||
|
||||
class ReducerTest(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.file = tempfile.NamedTemporaryFile(delete=False)
|
||||
world_size = 1
|
||||
self.store = c10d.FileStore(self.file.name, world_size)
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
@ -21,6 +22,7 @@ from unittest import mock, SkipTest
|
||||
import torch
|
||||
import torch.distributed as c10d
|
||||
import torch.distributed._functional_collectives as _functional_collectives
|
||||
from torch.distributed.distributed_c10d import SHRINK_ABORT as NCCL_SHRINK_ABORT
|
||||
|
||||
|
||||
if not c10d.is_available() or not c10d.is_nccl_available():
|
||||
@ -47,12 +49,15 @@ from torch._C._distributed_c10d import ErrorType, OpType, WorkResult
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
from torch.testing._internal.common_cuda import _get_torch_rocm_version, TEST_MULTIGPU
|
||||
from torch.testing._internal.common_distributed import (
|
||||
get_required_world_size,
|
||||
get_timeout,
|
||||
init_multigpu_helper,
|
||||
MultiProcessTestCase,
|
||||
requires_multicast_support,
|
||||
requires_nccl,
|
||||
requires_nccl_shrink,
|
||||
requires_nccl_version,
|
||||
requires_world_size,
|
||||
skip_if_lt_x_gpu,
|
||||
skip_if_rocm_multiprocess,
|
||||
sm_is_or_higher_than,
|
||||
@ -88,6 +93,53 @@ BFLOAT16_AVAILABLE = torch.cuda.is_available() and (
|
||||
)
|
||||
|
||||
|
||||
_start_time = time.time()
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ts():
|
||||
return time.time() - _start_time
|
||||
|
||||
|
||||
def configure(level=logging.INFO, force=False):
|
||||
try:
|
||||
logging.basicConfig(
|
||||
level=level,
|
||||
format="%(asctime)s %(name)s %(levelname)s: %(message)s",
|
||||
force=force,
|
||||
)
|
||||
except TypeError:
|
||||
logging.basicConfig(
|
||||
level=level, format="%(asctime)s %(name)s %(levelname)s: %(message)s"
|
||||
)
|
||||
|
||||
|
||||
def log_test_info(rank, message):
|
||||
_logger.info("[%7.3fs][Rank %s] %s", _ts(), rank, message)
|
||||
|
||||
|
||||
def log_test_success(rank, message):
|
||||
_logger.info("[%7.3fs][Rank %s] ✅ %s", _ts(), rank, message)
|
||||
|
||||
|
||||
def log_test_validation(rank, message):
|
||||
_logger.info("[%7.3fs][Rank %s] ✓ %s", _ts(), rank, message)
|
||||
|
||||
|
||||
def log_test_warning(rank, message):
|
||||
_logger.warning("[%7.3fs][Rank %s] ⚠️ %s", _ts(), rank, message)
|
||||
|
||||
|
||||
def log_test_error(rank, message):
|
||||
_logger.error("[%7.3fs][Rank %s] ✗ %s", _ts(), rank, message)
|
||||
|
||||
|
||||
_log_configure = configure
|
||||
|
||||
|
||||
_log_configure(level=logging.INFO, force=True)
|
||||
|
||||
|
||||
class RendezvousEnvTest(TestCase):
|
||||
@retry_on_connect_failures
|
||||
@requires_nccl()
|
||||
@ -200,6 +252,7 @@ class ProcessGroupNCCLNoGPUTest(TestCase):
|
||||
MAIN_PROCESS_RANK = 0
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.rank = self.MAIN_PROCESS_RANK
|
||||
self.world_size = 1
|
||||
self.file = tempfile.NamedTemporaryFile(delete=False)
|
||||
@ -317,7 +370,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
|
||||
|
||||
@property
|
||||
def world_size(self):
|
||||
return 2
|
||||
return get_required_world_size(self, 2)
|
||||
|
||||
@property
|
||||
def rank_to_GPU(self):
|
||||
@ -1255,6 +1308,628 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
|
||||
pg_2 = c10d.new_group([0, 1])
|
||||
self.assertEqual(pg_2.group_desc, "undefined")
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_basic(self):
|
||||
"""Test basic shrink_group functionality."""
|
||||
self._perform_shrink_test([1], "Basic shrink test")
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_validation(self):
|
||||
"""Test input validation in shrink_group."""
|
||||
device, pg = self._setup_shrink_test("validation")
|
||||
|
||||
def _test_invalid_input(ranks, description, expected_exception):
|
||||
"""Helper to test invalid inputs."""
|
||||
try:
|
||||
c10d.shrink_group(ranks)
|
||||
self.fail(f"Expected {expected_exception.__name__} for {description}")
|
||||
except expected_exception:
|
||||
log_test_validation(self.rank, f"✓ {description}")
|
||||
except Exception:
|
||||
if expected_exception is Exception: # Accept any exception
|
||||
log_test_validation(self.rank, f"✓ {description}")
|
||||
else:
|
||||
raise
|
||||
|
||||
# Test cases
|
||||
_test_invalid_input([], "Empty exclusion list", ValueError)
|
||||
if self.world_size > 1:
|
||||
_test_invalid_input([0, 0, 1], "Duplicate ranks", Exception)
|
||||
_test_invalid_input([self.world_size + 1], "Out of bounds rank", Exception)
|
||||
|
||||
log_test_success(self.rank, "All validation tests passed")
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_backend_properties(self):
|
||||
"""Test that backend properties are preserved after shrinking."""
|
||||
|
||||
test_name = "Backend Properties Test"
|
||||
ranks_to_exclude = [0]
|
||||
|
||||
# Reuse _setup_shrink_test for complete setup (device, environment, and process group)
|
||||
device, pg = self._setup_shrink_test("backend_properties")
|
||||
|
||||
# Follow _perform_shrink_test pattern from here
|
||||
log_test_info(self.rank, f"{test_name} (world_size={self.world_size})")
|
||||
|
||||
is_excluded = self.rank in ranks_to_exclude
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}",
|
||||
)
|
||||
|
||||
# Store original backend property values (not references) before shrinking
|
||||
original_timeout = None
|
||||
original_high_priority = None
|
||||
if not is_excluded:
|
||||
original_backend = pg._get_backend(device)
|
||||
original_timeout = original_backend.options._timeout
|
||||
original_high_priority = original_backend.options.is_high_priority_stream
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Storing original backend properties: timeout={original_timeout}, high_priority={original_high_priority}",
|
||||
)
|
||||
|
||||
if is_excluded:
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Excluded rank {self.rank} - setup complete, skipping shrink operation",
|
||||
)
|
||||
dist.destroy_process_group() # hang without it
|
||||
return
|
||||
|
||||
# Only non-excluded ranks proceed with shrink (same as _perform_shrink_test)
|
||||
log_test_info(self.rank, "Non-excluded rank calling shrink_group")
|
||||
shrunk_pg = c10d.shrink_group(ranks_to_exclude)
|
||||
|
||||
# Reuse _validate_shrunk_group helper (same as _perform_shrink_test)
|
||||
expected_size = self.world_size - len(ranks_to_exclude)
|
||||
_ = self._validate_shrunk_group(shrunk_pg, expected_size, test_name)
|
||||
|
||||
# Add custom backend properties validation
|
||||
new_backend = shrunk_pg._get_backend(device)
|
||||
log_test_info(self.rank, "Validating backend properties are preserved")
|
||||
|
||||
new_timeout = new_backend.options._timeout
|
||||
new_high_priority = new_backend.options.is_high_priority_stream
|
||||
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Timeout comparison - original: {original_timeout}, new: {new_timeout}",
|
||||
)
|
||||
self.assertEqual(
|
||||
original_timeout, new_timeout, f"{test_name}: timeout not preserved"
|
||||
)
|
||||
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"High priority stream comparison - original: {original_high_priority}, new: {new_high_priority}",
|
||||
)
|
||||
self.assertEqual(
|
||||
original_high_priority,
|
||||
new_high_priority,
|
||||
f"{test_name}: high_priority_stream not preserved",
|
||||
)
|
||||
|
||||
log_test_validation(
|
||||
self.rank, f"{test_name}: Backend properties preserved successfully"
|
||||
)
|
||||
log_test_success(
|
||||
self.rank, f"{test_name} successful (shrink + backend validation)"
|
||||
)
|
||||
|
||||
# Cleanup (same as _perform_shrink_test)
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_multiple_comms(self):
|
||||
"""Test shrink_group with multiple communicators and subgroup invalidation."""
|
||||
|
||||
device, pg = self._setup_shrink_test("multiple_comms")
|
||||
|
||||
# Create subgroup [0, 1] and test shrinking it
|
||||
subgroup = c10d.new_group([0, 1])
|
||||
if self.rank <= 1:
|
||||
# Shrink subgroup: exclude rank 1
|
||||
if self.rank == 0: # Only rank 0 remains
|
||||
shrunk_subgroup = c10d.shrink_group([1], group=subgroup)
|
||||
self.assertEqual(shrunk_subgroup.size(), 1)
|
||||
# Test communication on shrunk subgroup
|
||||
tensor = torch.full((1,), self.rank).cuda(device)
|
||||
c10d.all_reduce(tensor, group=shrunk_subgroup)
|
||||
self.assertEqual(tensor.item(), 0) # Only rank 0
|
||||
log_test_success(self.rank, "Subgroup shrinking successful")
|
||||
|
||||
dist.barrier() # Sync before default group test
|
||||
|
||||
# Shrink default group: exclude last rank
|
||||
ranks_to_exclude = [self.world_size - 1]
|
||||
if self.rank not in ranks_to_exclude:
|
||||
shrunk_default = c10d.shrink_group(ranks_to_exclude)
|
||||
expected_size = self.world_size - 1
|
||||
self.assertEqual(shrunk_default.size(), expected_size)
|
||||
|
||||
# Test collective on shrunk default group
|
||||
tensor = torch.full((1,), self.rank).cuda(device)
|
||||
c10d.all_reduce(tensor, group=shrunk_default)
|
||||
expected_sum = sum(
|
||||
range(self.world_size - 1)
|
||||
) # 0 + 1 + ... + (world_size-2)
|
||||
self.assertEqual(tensor.item(), expected_sum)
|
||||
log_test_success(self.rank, "Default group shrinking successful")
|
||||
|
||||
# Note: After shrinking default group, the old subgroup is invalid
|
||||
# due to global rank reassignment
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
def _test_shrink_group_with_flag(self, shrink_flag, flag_name, rank_to_exclude):
|
||||
"""Helper method to test shrink_group with a specific flag."""
|
||||
if self.world_size < 2:
|
||||
log_test_info(self.rank, f"Skipping (needs ≥2 GPUs, got {self.world_size})")
|
||||
return
|
||||
ranks_to_exclude = [rank_to_exclude]
|
||||
log_test_info(self.rank, f"Using {flag_name} flag (value: {shrink_flag})")
|
||||
if flag_name == "NCCL_SHRINK_ABORT":
|
||||
log_test_info(
|
||||
self.rank,
|
||||
"ABORT flag will terminate ongoing operations before shrinking",
|
||||
)
|
||||
|
||||
self._perform_shrink_test(
|
||||
ranks_to_exclude, f"{flag_name} flag test", shrink_flags=shrink_flag
|
||||
)
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_flags(self):
|
||||
"""Test shrink_group with different shrink flags."""
|
||||
# Test ABORT flags
|
||||
log_test_info(self.rank, "Testing NCCL_SHRINK_ABORT flag")
|
||||
self._test_shrink_group_with_flag(NCCL_SHRINK_ABORT, "NCCL_SHRINK_ABORT", 1)
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_nccl_config(self):
|
||||
"""Verify that passing NCCL config via pg_options influences the shrunk group's backend options."""
|
||||
device, pg = self._setup_shrink_test("config")
|
||||
if self.rank == self.world_size - 1:
|
||||
# excluded rank should not call shrink_group
|
||||
dist.destroy_process_group()
|
||||
return
|
||||
|
||||
# Prepare pg_options with NCCL config overrides
|
||||
# Capture parent's current backend options to ensure we can prove override vs inherit
|
||||
parent_backend = pg._get_backend(torch.device("cuda"))
|
||||
parent_hp = parent_backend.options.is_high_priority_stream
|
||||
parent_blocking = parent_backend.options.config.blocking
|
||||
|
||||
# Choose overrides that differ from the parent (flip where possible)
|
||||
override_hp = not parent_hp
|
||||
if parent_blocking in (0, 1):
|
||||
override_blocking = 1 - parent_blocking
|
||||
else:
|
||||
# If undefined or unexpected, set to 1 which is a concrete value
|
||||
override_blocking = 1
|
||||
|
||||
opts = c10d.ProcessGroupNCCL.Options()
|
||||
opts.is_high_priority_stream = override_hp
|
||||
opts.config.blocking = override_blocking
|
||||
|
||||
shrunk_pg = c10d.shrink_group([self.world_size - 1], pg_options=opts)
|
||||
|
||||
# Validate backend options propagated
|
||||
backend = shrunk_pg._get_backend(torch.device("cuda"))
|
||||
# is_high_priority_stream should exactly match our override and differ from parent
|
||||
self.assertEqual(backend.options.is_high_priority_stream, override_hp)
|
||||
self.assertNotEqual(backend.options.is_high_priority_stream, parent_hp)
|
||||
# config is a struct; check representative field and difference from parent when meaningful
|
||||
self.assertEqual(backend.options.config.blocking, override_blocking)
|
||||
if parent_blocking in (0, 1):
|
||||
self.assertNotEqual(backend.options.config.blocking, parent_blocking)
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(2)
|
||||
def test_shrink_group_performance(self):
|
||||
"""Test shrink_group performance and regression detection."""
|
||||
import time
|
||||
|
||||
ranks_to_exclude = self._get_default_ranks_to_exclude()
|
||||
is_excluded = self.rank in ranks_to_exclude
|
||||
|
||||
if not ranks_to_exclude:
|
||||
log_test_info(self.rank, "Skipping performance test (world_size=1)")
|
||||
return
|
||||
|
||||
log_test_info(self.rank, f"Performance test with {self.world_size} processes")
|
||||
device, pg = self._setup_shrink_test("performance")
|
||||
|
||||
if not is_excluded:
|
||||
log_test_info(self.rank, "Measuring shrink_group performance")
|
||||
start_time = time.time()
|
||||
shrunk_pg = c10d.shrink_group(ranks_to_exclude)
|
||||
end_time = time.time()
|
||||
|
||||
elapsed_time = end_time - start_time
|
||||
log_test_info(self.rank, f"shrink_group: {elapsed_time:.3f}s")
|
||||
|
||||
# Regression check: should complete within reasonable time
|
||||
self.assertLess(
|
||||
elapsed_time,
|
||||
30.0,
|
||||
f"shrink_group took {elapsed_time:.3f}s, possible regression",
|
||||
)
|
||||
|
||||
# Test collective performance
|
||||
expected_size = self.world_size - len(ranks_to_exclude)
|
||||
self._validate_shrunk_group(shrunk_pg, expected_size, "performance")
|
||||
|
||||
collective_start = time.time()
|
||||
_ = self._test_collective_on_shrunk_group(
|
||||
shrunk_pg, device, ranks_to_exclude, "performance"
|
||||
)
|
||||
collective_time = time.time() - collective_start
|
||||
|
||||
log_test_info(self.rank, f"all_reduce: {collective_time:.3f}s")
|
||||
log_test_success(self.rank, "Performance test passed")
|
||||
else:
|
||||
log_test_info(self.rank, "Excluded rank - waiting")
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(4)
|
||||
def test_shrink_group_multiple_exclusions(self):
|
||||
"""Test shrink_group with multiple ranks excluded at once."""
|
||||
# Scale exclusions with world size
|
||||
ranks_to_exclude = list(range(2, self.world_size, 2)) # Every other rank from 2
|
||||
|
||||
self._perform_shrink_test(ranks_to_exclude, "Multiple exclusions test")
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(3)
|
||||
def test_shrink_group_multiple_iterations(self):
|
||||
"""Test multiple shrink operations in sequence."""
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Starting test_shrink_group_multiple_iterations with world_size={self.world_size}",
|
||||
)
|
||||
|
||||
store = c10d.FileStore(self.file_name, self.world_size)
|
||||
device = torch.device(f"cuda:{self.rank}")
|
||||
_ = self._create_process_group_nccl(store, self.opts(), device_id=device)
|
||||
|
||||
# Track current effective world size throughout shrinking operations
|
||||
current_world_size = self.world_size
|
||||
log_test_info(self.rank, f"Initial world_size: {current_world_size}")
|
||||
|
||||
# First shrinking: exclude the last rank(s)
|
||||
first_exclusion = [self.world_size - 1]
|
||||
if self.world_size >= 6:
|
||||
first_exclusion.append(
|
||||
self.world_size - 2
|
||||
) # Exclude last two ranks for larger sizes
|
||||
|
||||
log_test_info(self.rank, f"First shrinking: excluding ranks {first_exclusion}")
|
||||
|
||||
if self.rank not in first_exclusion:
|
||||
# Only non-excluded ranks should call shrink_group
|
||||
first_pg = c10d.shrink_group(first_exclusion)
|
||||
self.assertIsNotNone(first_pg)
|
||||
# IMPORTANT: Update world size after first shrinking
|
||||
current_world_size = first_pg.size()
|
||||
expected_first_size = self.world_size - len(first_exclusion)
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"After first shrinking: world_size {self.world_size} -> {current_world_size}",
|
||||
)
|
||||
self.assertEqual(first_pg.size(), expected_first_size)
|
||||
|
||||
# Second shrinking: exclude another rank from the remaining group
|
||||
# Choose a rank that's in the middle range
|
||||
if current_world_size >= 3:
|
||||
second_exclusion = [
|
||||
current_world_size - 1
|
||||
] # Exclude the new "last" rank
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Second shrinking from group of size {current_world_size}: excluding ranks {second_exclusion}",
|
||||
)
|
||||
|
||||
if self.rank not in second_exclusion:
|
||||
# Only non-excluded ranks should call shrink_group for second iteration
|
||||
second_pg = c10d.shrink_group(second_exclusion, group=first_pg)
|
||||
self.assertIsNotNone(second_pg)
|
||||
# IMPORTANT: Update world size after second shrinking
|
||||
final_world_size = second_pg.size()
|
||||
expected_final_size = current_world_size - len(second_exclusion)
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"After second shrinking: world_size {current_world_size} -> {final_world_size}",
|
||||
)
|
||||
self.assertEqual(second_pg.size(), expected_final_size)
|
||||
|
||||
# Test collective on final group
|
||||
tensor = torch.full((1,), self.rank).cuda(device)
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Performing all_reduce on final group (size {final_world_size}) with tensor: {tensor.item()}",
|
||||
)
|
||||
c10d.all_reduce(tensor, group=second_pg)
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Final all_reduce completed, result: {tensor.item()}",
|
||||
)
|
||||
|
||||
# Calculate expected sum of remaining ranks
|
||||
all_excluded = set(first_exclusion + second_exclusion)
|
||||
remaining_ranks = [
|
||||
r for r in range(self.world_size) if r not in all_excluded
|
||||
]
|
||||
expected_sum = sum(remaining_ranks)
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Remaining ranks: {remaining_ranks}, expected sum: {expected_sum}, actual: {tensor.item()}",
|
||||
)
|
||||
self.assertEqual(tensor.item(), expected_sum)
|
||||
log_test_info(self.rank, "Final verification passed")
|
||||
else:
|
||||
log_test_info(
|
||||
self.rank,
|
||||
"This rank excluded in second shrinking, not calling shrink_group",
|
||||
)
|
||||
else:
|
||||
log_test_info(
|
||||
self.rank, "Skipping second shrinking (remaining group too small)"
|
||||
)
|
||||
else:
|
||||
log_test_info(
|
||||
self.rank,
|
||||
"This rank excluded in first shrinking, not calling shrink_group",
|
||||
)
|
||||
|
||||
log_test_info(self.rank, "Destroying process group")
|
||||
dist.destroy_process_group()
|
||||
log_test_info(self.rank, "test_shrink_group_multiple_iterations completed")
|
||||
|
||||
# Helper methods for optimized shrink group tests
|
||||
def _setup_shrink_test(self, test_suffix, world_size=None, warmup=True):
|
||||
"""Common setup for shrink group tests."""
|
||||
os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1"
|
||||
world_size = world_size or self.world_size
|
||||
store = c10d.FileStore(self.file_name + f"_{test_suffix}", world_size)
|
||||
device = torch.device(f"cuda:{self.rank}")
|
||||
c10d.init_process_group(
|
||||
"nccl",
|
||||
world_size=world_size,
|
||||
rank=self.rank,
|
||||
store=store,
|
||||
pg_options=self.opts(),
|
||||
device_id=device,
|
||||
)
|
||||
pg = c10d.distributed_c10d._get_default_group()
|
||||
|
||||
if warmup:
|
||||
c10d.all_reduce(torch.ones(1).cuda(device), group=pg)
|
||||
|
||||
return device, pg
|
||||
|
||||
def _validate_shrunk_group(self, shrunk_pg, expected_size, test_name=""):
|
||||
"""Validate properties of a shrunk process group."""
|
||||
self.assertIsNotNone(shrunk_pg, f"{test_name}: shrunk_pg should not be None")
|
||||
actual_size = shrunk_pg.size()
|
||||
self.assertEqual(
|
||||
actual_size, expected_size, f"{test_name}: group size mismatch"
|
||||
)
|
||||
|
||||
new_rank = shrunk_pg.rank()
|
||||
self.assertTrue(
|
||||
0 <= new_rank < expected_size, f"{test_name}: invalid new rank {new_rank}"
|
||||
)
|
||||
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"{test_name}: world_size {self.world_size} -> {actual_size}, rank {self.rank} -> {new_rank}",
|
||||
)
|
||||
return new_rank
|
||||
|
||||
def _test_collective_on_shrunk_group(
|
||||
self, shrunk_pg, device, ranks_to_exclude, test_name=""
|
||||
):
|
||||
"""Test collective communication on shrunk group and verify correctness."""
|
||||
test_tensor = torch.full((1,), self.rank, device=device, dtype=torch.float32)
|
||||
c10d.all_reduce(test_tensor, group=shrunk_pg)
|
||||
|
||||
result = test_tensor.item()
|
||||
expected_sum = sum(
|
||||
r for r in range(self.world_size) if r not in ranks_to_exclude
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
result, expected_sum, f"{test_name}: collective result mismatch"
|
||||
)
|
||||
log_test_info(
|
||||
self.rank, f"{test_name}: collective passed ({result} == {expected_sum})"
|
||||
)
|
||||
return result
|
||||
|
||||
def _perform_shrink_test(
|
||||
self, ranks_to_exclude, test_name, shrink_flags=0, with_collective=True
|
||||
):
|
||||
"""Complete shrink test flow: setup, shrink, validate, test collective, cleanup.
|
||||
|
||||
Consistent API: All ranks perform setup to initialize distributed environment.
|
||||
ONLY non-excluded ranks call shrink_group() for both default and non-default groups.
|
||||
Excluded ranks perform setup, then exit without calling shrink_group() or waiting.
|
||||
"""
|
||||
log_test_info(self.rank, f"{test_name} (world_size={self.world_size})")
|
||||
|
||||
is_excluded = self.rank in ranks_to_exclude
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}",
|
||||
)
|
||||
|
||||
# All ranks (including excluded ones) perform setup to initialize distributed environment
|
||||
device, pg = self._setup_shrink_test(test_name.lower().replace(" ", "_"))
|
||||
is_default_group = pg == c10d.distributed_c10d._get_default_group()
|
||||
|
||||
if is_excluded:
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Excluded rank {self.rank} - setup complete, skipping shrink operation",
|
||||
)
|
||||
if shrink_flags & NCCL_SHRINK_ABORT:
|
||||
log_test_info(self.rank, f"Using abort for excluded rank {self.rank}")
|
||||
pg._get_backend(torch.device(device)).abort()
|
||||
log_test_info(
|
||||
self.rank, f"cleanup resources for excluded rank {self.rank}"
|
||||
)
|
||||
dist.destroy_process_group()
|
||||
log_test_info(self.rank, f"Excluded rank {self.rank} - exit")
|
||||
else:
|
||||
log_test_info(
|
||||
self.rank, f"Using regular destroy for excluded rank {self.rank}"
|
||||
)
|
||||
dist.destroy_process_group()
|
||||
return None
|
||||
|
||||
# Only non-excluded ranks proceed with shrink
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Non-excluded rank calling shrink_group (default_group={is_default_group})",
|
||||
)
|
||||
shrunk_pg = c10d.shrink_group(ranks_to_exclude, shrink_flags=shrink_flags)
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Non-excluded rank calling shrink_group (default_group={is_default_group}) done",
|
||||
)
|
||||
|
||||
# Non-excluded ranks: validate and test the new group
|
||||
expected_size = self.world_size - len(ranks_to_exclude)
|
||||
_ = self._validate_shrunk_group(shrunk_pg, expected_size, test_name)
|
||||
|
||||
if with_collective:
|
||||
_ = self._test_collective_on_shrunk_group(
|
||||
shrunk_pg, device, ranks_to_exclude, test_name
|
||||
)
|
||||
log_test_success(self.rank, f"{test_name} successful (shrink + collective)")
|
||||
else:
|
||||
log_test_success(self.rank, f"{test_name} successful (shrink only)")
|
||||
|
||||
dist.destroy_process_group()
|
||||
return shrunk_pg
|
||||
|
||||
def _get_default_ranks_to_exclude(self):
|
||||
"""Get default ranks to exclude based on world size."""
|
||||
if self.world_size <= 1:
|
||||
return []
|
||||
return [self.world_size - 1] # Exclude last rank by default
|
||||
|
||||
@requires_nccl_shrink()
|
||||
@requires_world_size(3)
|
||||
def test_shrink_group_vs_abort_reinit_performance(self):
|
||||
"""Compare performance of shrink_group vs traditional abort+reinit (simplified for reliability)."""
|
||||
log_test_info(self.rank, "=== TEST 1: abort+reinit ===")
|
||||
|
||||
device, pg1 = self._setup_shrink_test("_perf_reinit")
|
||||
torch.cuda.synchronize(device)
|
||||
|
||||
# Test 1: Traditional abort + reinit
|
||||
start_time = time.perf_counter()
|
||||
dist.destroy_process_group()
|
||||
|
||||
device, new_pg = self._setup_shrink_test("perf_shrink_test1")
|
||||
reinit_time = time.perf_counter() - start_time
|
||||
|
||||
# Test collective with original rank values for fair comparison (non-blocking mode)
|
||||
test_tensor = torch.full((1,), self.rank, device=device, dtype=torch.float32)
|
||||
work = c10d.all_reduce(test_tensor, group=new_pg, async_op=True)
|
||||
work.wait()
|
||||
|
||||
torch.cuda.synchronize(device)
|
||||
|
||||
# Verify correctness
|
||||
expected_sum = sum(r for r in range(self.world_size))
|
||||
self.assertEqual(test_tensor.item(), expected_sum, "Reinit collective failed")
|
||||
|
||||
log_test_info(self.rank, f"abort+reinit: {reinit_time:.4f}s")
|
||||
dist.destroy_process_group(new_pg)
|
||||
|
||||
# Test 2: shrink_group with NCCL_SHRINK_ABORT
|
||||
log_test_info(self.rank, "=== TEST 2: shrink_group ===")
|
||||
|
||||
ranks_to_exclude = [self.world_size - 1]
|
||||
is_excluded = self.rank in ranks_to_exclude
|
||||
log_test_info(
|
||||
self.rank,
|
||||
f"Excluding ranks: {ranks_to_exclude}, am_excluded: {is_excluded}",
|
||||
)
|
||||
|
||||
device, pg1 = self._setup_shrink_test("perf_shrink_test2") # Unique suffix
|
||||
|
||||
shrink_time = 0
|
||||
if not is_excluded:
|
||||
torch.cuda.synchronize(device) # Ensure accurate timing
|
||||
start_time = time.perf_counter()
|
||||
shrunk_pg = c10d.shrink_group(
|
||||
ranks_to_exclude, shrink_flags=NCCL_SHRINK_ABORT
|
||||
)
|
||||
c10d.all_reduce(torch.ones(1).cuda(device), group=shrunk_pg)
|
||||
shrink_time = time.perf_counter() - start_time
|
||||
|
||||
# Test collective communication on shrunk group (non-blocking mode)
|
||||
test_tensor = torch.full(
|
||||
(1,), self.rank, device=device, dtype=torch.float32
|
||||
)
|
||||
work = c10d.all_reduce(test_tensor, group=shrunk_pg, async_op=True)
|
||||
work.wait()
|
||||
|
||||
# Verify correctness
|
||||
expected_sum = sum(
|
||||
r for r in range(self.world_size) if r not in ranks_to_exclude
|
||||
)
|
||||
self.assertEqual(
|
||||
test_tensor.item(),
|
||||
expected_sum,
|
||||
"shrink_test: collective result mismatch",
|
||||
)
|
||||
|
||||
torch.cuda.synchronize(device) # Ensure operations complete
|
||||
log_test_info(self.rank, f"shrink_group: {shrink_time:.4f}s")
|
||||
dist.destroy_process_group()
|
||||
else:
|
||||
log_test_info(self.rank, "Excluded from shrink test - exiting immediately")
|
||||
dist.destroy_process_group()
|
||||
return
|
||||
|
||||
# Performance analysis (only for participating ranks)
|
||||
if shrink_time > 0 and reinit_time > 0:
|
||||
speedup = reinit_time / shrink_time
|
||||
time_saved = reinit_time - shrink_time
|
||||
|
||||
log_test_info(self.rank, "=== PERFORMANCE RESULTS ===")
|
||||
log_test_info(self.rank, f"shrink_group: {shrink_time:.4f}s")
|
||||
log_test_info(self.rank, f"abort+reinit: {reinit_time:.4f}s")
|
||||
log_test_info(self.rank, f"time_saved: {time_saved:+.4f}s")
|
||||
log_test_info(self.rank, f"speedup: {speedup:.2f}x")
|
||||
|
||||
if speedup > 1.1:
|
||||
log_test_success(self.rank, "shrink_group significantly faster")
|
||||
elif speedup > 0.9:
|
||||
log_test_info(self.rank, "≈ comparable performance")
|
||||
else:
|
||||
log_test_warning(self.rank, "abort+reinit faster")
|
||||
|
||||
log_test_info(self.rank, "Performance test completed")
|
||||
|
||||
@requires_nccl()
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
def test_deterministic_mode_no_break(self):
|
||||
@ -5115,6 +5790,229 @@ class NCCLTraceTest(NCCLTraceTestBase):
|
||||
else:
|
||||
self.assertTrue("duration_ms" not in t["entries"][0])
|
||||
|
||||
@requires_nccl()
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
@parametrize("timing_enabled", [True, False])
|
||||
def test_fr_record_reset_circular_buffer_full(self, timing_enabled):
|
||||
"""
|
||||
Test that when the circular buffer in entries_ is full and we call reset,
|
||||
then fill the buffer with new entries, dump_entries returns only the new
|
||||
entries and not the old ones.
|
||||
"""
|
||||
if self.rank == self.MAIN_PROCESS_RANK:
|
||||
return
|
||||
|
||||
# Override buffer size to 10 for faster testing
|
||||
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
|
||||
|
||||
pg = self._create_process_group_nccl()
|
||||
if timing_enabled:
|
||||
pg._enable_collectives_timing()
|
||||
device = self.local_device
|
||||
self.set_thread_name("fr_test_thread")
|
||||
a = torch.full((3, 4), float(self.rank), device=device)
|
||||
|
||||
# Fill the buffer completely with 10 entries
|
||||
for _ in range(10):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Verify buffer is full with 10 entries
|
||||
t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
|
||||
self.assertEqual(len(t["entries"]), 10)
|
||||
|
||||
# Now reset the flight recorder
|
||||
torch._C._distributed_c10d._reset_fr_recording_nccl()
|
||||
|
||||
# Add new entries after reset - fill the buffer completely again
|
||||
for _ in range(10):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Verify we get exactly 10 new entries, not 20
|
||||
t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
|
||||
self.assertEqual(len(t["entries"]), 10)
|
||||
|
||||
# Verify all entries have the expected properties (from after reset)
|
||||
# After reset, record IDs should start from 0 again
|
||||
for i, entry in enumerate(t["entries"]):
|
||||
self.assertIn("profiling_name", entry)
|
||||
self.assertEqual(entry["profiling_name"], "nccl:all_reduce")
|
||||
self.assertIn("record_id", entry)
|
||||
# Record IDs should be sequential starting from 0 after reset
|
||||
self.assertEqual(entry["record_id"], i)
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl()
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
@parametrize("timing_enabled", [True, False])
|
||||
def test_fr_record_reset_partial_overwrite(self, timing_enabled):
|
||||
"""
|
||||
Test that when the circular buffer is full, we reset, and then add fewer
|
||||
entries than the buffer size, we only get the new entries.
|
||||
This tests that old entries at the end of the circular buffer are properly
|
||||
filtered out based on reset_epoch.
|
||||
"""
|
||||
if self.rank == self.MAIN_PROCESS_RANK:
|
||||
return
|
||||
|
||||
# Override buffer size to 10 for faster testing
|
||||
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
|
||||
|
||||
pg = self._create_process_group_nccl()
|
||||
if timing_enabled:
|
||||
pg._enable_collectives_timing()
|
||||
device = self.local_device
|
||||
self.set_thread_name("fr_test_thread")
|
||||
a = torch.full((3, 4), float(self.rank), device=device)
|
||||
|
||||
# Fill the buffer completely
|
||||
for _ in range(10):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Reset the flight recorder
|
||||
torch._C._distributed_c10d._reset_fr_recording_nccl()
|
||||
|
||||
# Add only 3 new entries (much less than buffer size)
|
||||
for _ in range(3):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Verify we only get the 3 new entries, not 10
|
||||
t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
|
||||
self.assertEqual(len(t["entries"]), 3)
|
||||
|
||||
# Verify record IDs start from 0 after reset
|
||||
for i, entry in enumerate(t["entries"]):
|
||||
self.assertIn("record_id", entry)
|
||||
self.assertEqual(entry["record_id"], i)
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl()
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
@parametrize("timing_enabled", [True, False])
|
||||
def test_fr_record_reset_wraparound(self, timing_enabled):
|
||||
"""
|
||||
Test that when we reset in the middle of the circular buffer and then
|
||||
wrap around, dump_entries correctly returns only entries from the current
|
||||
epoch in the correct order.
|
||||
"""
|
||||
if self.rank == self.MAIN_PROCESS_RANK:
|
||||
return
|
||||
|
||||
# Override buffer size to 10 for faster testing
|
||||
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
|
||||
|
||||
pg = self._create_process_group_nccl()
|
||||
if timing_enabled:
|
||||
pg._enable_collectives_timing()
|
||||
device = self.local_device
|
||||
self.set_thread_name("fr_test_thread")
|
||||
a = torch.full((3, 4), float(self.rank), device=device)
|
||||
|
||||
# Fill half the buffer
|
||||
for _ in range(5):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Reset at this point (reset happens at index 5)
|
||||
torch._C._distributed_c10d._reset_fr_recording_nccl()
|
||||
|
||||
# Now add 8 entries, which will wrap around
|
||||
# (5->9 fills rest of buffer, then 0->2 wraps around)
|
||||
for _ in range(8):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Should get exactly 8 entries, properly ordered
|
||||
t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
|
||||
self.assertEqual(len(t["entries"]), 8)
|
||||
|
||||
# Entries should be in chronological order
|
||||
# The dump_entries() method returns entries from next_ to end, then 0 to next_
|
||||
# After filtering old entries, we should have 8 entries in order
|
||||
# Verify record IDs start from 0 after reset (id_ is reset in reset_all())
|
||||
for i, entry in enumerate(t["entries"]):
|
||||
self.assertIn("profiling_name", entry)
|
||||
self.assertIn("record_id", entry)
|
||||
self.assertEqual(entry["record_id"], i)
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
@requires_nccl()
|
||||
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
|
||||
@parametrize("timing_enabled", [True, False])
|
||||
def test_fr_record_multiple_resets(self, timing_enabled):
|
||||
"""
|
||||
Test multiple consecutive resets to ensure each reset properly increments
|
||||
the epoch and filters out entries from previous epochs.
|
||||
"""
|
||||
if self.rank == self.MAIN_PROCESS_RANK:
|
||||
return
|
||||
|
||||
# Override buffer size to 10 for faster testing
|
||||
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
|
||||
|
||||
pg = self._create_process_group_nccl()
|
||||
if timing_enabled:
|
||||
pg._enable_collectives_timing()
|
||||
device = self.local_device
|
||||
self.set_thread_name("fr_test_thread")
|
||||
a = torch.full((3, 4), float(self.rank), device=device)
|
||||
|
||||
# First batch: 2 entries
|
||||
for _ in range(2):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# First reset
|
||||
torch._C._distributed_c10d._reset_fr_recording_nccl()
|
||||
|
||||
# Second batch: 3 entries
|
||||
for _ in range(3):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Second reset
|
||||
torch._C._distributed_c10d._reset_fr_recording_nccl()
|
||||
|
||||
# Third batch: 4 entries
|
||||
for _ in range(4):
|
||||
f = pg.allreduce(a)
|
||||
f.wait()
|
||||
torch.cuda.synchronize(device=device)
|
||||
time.sleep(1)
|
||||
|
||||
# Should only see the last 4 entries
|
||||
t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
|
||||
self.assertEqual(len(t["entries"]), 4)
|
||||
|
||||
# Verify record IDs start from 0 after the last reset
|
||||
for i, entry in enumerate(t["entries"]):
|
||||
self.assertIn("record_id", entry)
|
||||
self.assertEqual(entry["record_id"], i)
|
||||
|
||||
dist.destroy_process_group()
|
||||
|
||||
|
||||
def check_if_test_is_skipped(fn):
|
||||
def wrapper(self, *args, **kwargs):
|
||||
@ -5446,6 +6344,14 @@ class ProcessGroupNCCLLargerScaleTest(MultiProcessTestCase):
|
||||
if self.rank == 6 or self.rank == 7:
|
||||
dist.broadcast(tensor2, 6, group=ng2)
|
||||
self.assertEqual(tensor2, torch.full((1,), 6))
|
||||
|
||||
# Test the case when the split changes the pg option of split group
|
||||
# while the parent pg option is not changed.
|
||||
new_pg = c10d.new_group([0, 1, 2, 3, 4, 5, 6, 7], device_id=device)
|
||||
backend_new_pg = new_pg._get_backend(torch.device(device))
|
||||
self.assertEqual(len(backend_new_pg.options.global_ranks_in_group), 8)
|
||||
c10d.split_group(new_pg, [[0, 2, 4, 6], [1, 3, 5, 7]])
|
||||
self.assertEqual(len(backend_new_pg.options.global_ranks_in_group), 8)
|
||||
# a barrier and a cuda sync before destroying all pgs.
|
||||
dist.barrier(pg)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
@ -1985,6 +1985,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
|
||||
"bucket_reduce_scatters_fx_bucket_size_determinator": lambda _: 2,
|
||||
"reorder_for_compute_comm_overlap": True,
|
||||
"reorder_for_compute_comm_overlap_passes": [
|
||||
_reorder_communication_preserving_peak_memory,
|
||||
sink_waits_iterative,
|
||||
_reorder_communication_preserving_peak_memory,
|
||||
],
|
||||
@ -2046,11 +2047,6 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
|
||||
assert node_stats is not None
|
||||
self.assertTrue(isinstance(node_stats, dict))
|
||||
self.assertEqual(len(node_stats), 4)
|
||||
it = iter(node_stats.values())
|
||||
node_stat0 = next(it)
|
||||
self.assertTrue(node_stat0.limiting_factor == "None")
|
||||
node_stat1 = next(it)
|
||||
self.assertTrue("collective ordering" in node_stat1.limiting_factor)
|
||||
|
||||
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
|
||||
@ -7,6 +7,8 @@ import torch
|
||||
import torch.distributed as dist
|
||||
from torch.distributed._local_tensor import (
|
||||
local_tensor_mode,
|
||||
LocalIntNode,
|
||||
LocalRunnerMode,
|
||||
LocalTensor,
|
||||
LocalTensorMode,
|
||||
)
|
||||
@ -17,8 +19,10 @@ from torch.distributed.tensor import (
|
||||
Partial,
|
||||
Replicate,
|
||||
Shard,
|
||||
zeros,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import reduce_local_int
|
||||
|
||||
|
||||
class LocalTensorTestBase(TestCase):
|
||||
@ -124,14 +128,14 @@ class TestLocalTensorWorld2(LocalTensorTestBase):
|
||||
self.assertEqual(len(result_add._local_tensors), 2)
|
||||
|
||||
# Verify the operation was applied to each local tensor
|
||||
for rank in identical_local_tensors.keys():
|
||||
for rank in identical_local_tensors:
|
||||
expected = identical_local_tensors[rank] + identical_local_tensors[rank]
|
||||
self.assertEqual(result_add._local_tensors[rank], expected)
|
||||
|
||||
# Test multiplication
|
||||
result_mul = lt1 * 2.0
|
||||
self.assertIsInstance(result_mul, LocalTensor)
|
||||
for rank in identical_local_tensors.keys():
|
||||
for rank in identical_local_tensors:
|
||||
expected = identical_local_tensors[rank] * 2.0
|
||||
self.assertEqual(result_mul._local_tensors[rank], expected)
|
||||
|
||||
@ -159,7 +163,7 @@ class TestLocalTensorWorld2(LocalTensorTestBase):
|
||||
result = lt + regular_tensor
|
||||
self.assertIsInstance(result, LocalTensor)
|
||||
|
||||
for rank in identical_local_tensors.keys():
|
||||
for rank in identical_local_tensors:
|
||||
expected = identical_local_tensors[rank] + regular_tensor
|
||||
self.assertEqual(result._local_tensors[rank], expected)
|
||||
|
||||
@ -208,14 +212,14 @@ class TestLocalTensorWorld2(LocalTensorTestBase):
|
||||
dist.all_reduce(lt_sum, group=fake_pg)
|
||||
|
||||
expected_sum = torch.tensor([[6.0, 8.0], [10.0, 12.0]])
|
||||
for rank in test_tensors.keys():
|
||||
for rank in test_tensors:
|
||||
self.assertEqual(lt_sum._local_tensors[rank], expected_sum)
|
||||
|
||||
# Test broadcast within mode
|
||||
lt_broadcast = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
|
||||
dist.broadcast(lt_broadcast, src=0, group=fake_pg)
|
||||
|
||||
for rank in test_tensors.keys():
|
||||
for rank in test_tensors:
|
||||
self.assertEqual(lt_broadcast._local_tensors[rank], test_tensors[0])
|
||||
|
||||
# Test that regular operations still work
|
||||
@ -289,21 +293,21 @@ class TestLocalTensorWorld3(LocalTensorTestBase):
|
||||
lt_sum = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
|
||||
dist.all_reduce(lt_sum, op=dist.ReduceOp.SUM, group=fake_pg)
|
||||
expected_sum = torch.tensor([[6.0, 7.0], [6.0, 15.0]]) # Sum of all tensors
|
||||
for rank in test_tensors.keys():
|
||||
for rank in test_tensors:
|
||||
self.assertEqual(lt_sum._local_tensors[rank], expected_sum)
|
||||
|
||||
# Test MAX reduction
|
||||
lt_max = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
|
||||
dist.all_reduce(lt_max, op=dist.ReduceOp.MAX, group=fake_pg)
|
||||
expected_max = torch.tensor([[3.0, 4.0], [3.0, 6.0]]) # Max across all tensors
|
||||
for rank in test_tensors.keys():
|
||||
for rank in test_tensors:
|
||||
self.assertEqual(lt_max._local_tensors[rank], expected_max)
|
||||
|
||||
# Test MIN reduction
|
||||
lt_min = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
|
||||
dist.all_reduce(lt_min, op=dist.ReduceOp.MIN, group=fake_pg)
|
||||
expected_min = torch.tensor([[1.0, 1.0], [1.0, 4.0]]) # Min across all tensors
|
||||
for rank in test_tensors.keys():
|
||||
for rank in test_tensors:
|
||||
self.assertEqual(lt_min._local_tensors[rank], expected_min)
|
||||
|
||||
def test_all_reduce_collective(self):
|
||||
@ -324,7 +328,7 @@ class TestLocalTensorWorld3(LocalTensorTestBase):
|
||||
|
||||
# Verify all ranks have the sum of all tensors (after adding 1 to each)
|
||||
expected_sum = torch.tensor([[114.0, 225.0, 336.0], [447.0, 558.0, 669.0]])
|
||||
for rank in different_tensors.keys():
|
||||
for rank in different_tensors:
|
||||
self.assertEqual(lt_sum._local_tensors[rank], expected_sum)
|
||||
|
||||
def test_broadcast_collective(self):
|
||||
@ -344,7 +348,7 @@ class TestLocalTensorWorld3(LocalTensorTestBase):
|
||||
|
||||
# Verify all ranks have rank 1's original tensor
|
||||
expected_broadcast = different_tensors[1]
|
||||
for rank in different_tensors.keys():
|
||||
for rank in different_tensors:
|
||||
self.assertEqual(lt_broadcast._local_tensors[rank], expected_broadcast)
|
||||
|
||||
def test_all_gather_collective(self):
|
||||
@ -411,5 +415,78 @@ class TestLocalTensorWorld8(LocalTensorTestBase):
|
||||
self.assertEqual(full_tensor, local_res)
|
||||
|
||||
|
||||
from torch.distributed._local_tensor._c10d import local_p2p_op, wait_all
|
||||
|
||||
|
||||
class TestLocalRunner(LocalTensorTestBase):
|
||||
world_size = 6
|
||||
|
||||
@staticmethod
|
||||
def _get_pp_peer(pp_index, mesh, dim, dir):
|
||||
pp_meshes = mesh._get_all_submeshes(dim)
|
||||
pp_ret = {}
|
||||
for pp_mesh in pp_meshes:
|
||||
global_rank = pp_mesh.mesh[pp_index].item()
|
||||
global_peer = pp_mesh.mesh[(pp_index + dir) % pp_mesh.size()].item()
|
||||
pp_ret[global_rank] = global_peer
|
||||
|
||||
return torch.SymInt(LocalIntNode(pp_ret))
|
||||
|
||||
def _run_dp_pp(
|
||||
self,
|
||||
mesh: DeviceMesh,
|
||||
pp_index: int,
|
||||
actual: list[torch.Tensor | None],
|
||||
expected: list[torch.Tensor | None],
|
||||
) -> None:
|
||||
ltm = LocalTensorMode(mesh.size())
|
||||
with ltm:
|
||||
dp_mesh = mesh["dp"]
|
||||
pp_mesh = mesh["pp"]
|
||||
|
||||
x = torch.rand(2, 4)
|
||||
xd = distribute_tensor(x, dp_mesh, [Shard(0)])
|
||||
xd = xd * 2
|
||||
x = x * 2
|
||||
|
||||
yd = zeros(*xd.shape, device_mesh=dp_mesh, placements=[Shard(0)])
|
||||
|
||||
if pp_index != pp_mesh.size(0) - 1:
|
||||
# Send to next pp rank
|
||||
pp_next_rank = TestLocalRunner._get_pp_peer(pp_index, mesh, "pp", +1)
|
||||
local_p2p_op(pp_next_rank, xd, dist.isend)
|
||||
expected[pp_index + 1] = ltm.tensor_map(
|
||||
x,
|
||||
lambda r, t: t
|
||||
if reduce_local_int(pp_next_rank, lambda vals: r in vals.values())
|
||||
else torch.zeros_like(t),
|
||||
)
|
||||
|
||||
if pp_index != 0:
|
||||
# Receive from prev pp rank
|
||||
pp_prev_rank = TestLocalRunner._get_pp_peer(pp_index, mesh, "pp", -1)
|
||||
rw = local_p2p_op(pp_prev_rank, yd, dist.irecv)
|
||||
wait_all(rw)
|
||||
|
||||
y = yd.full_tensor()
|
||||
actual[pp_index] = y
|
||||
|
||||
def test_dp_pp(self):
|
||||
pp_size = 3
|
||||
mesh = init_device_mesh(
|
||||
"cpu", (self.world_size // pp_size, pp_size), mesh_dim_names=("dp", "pp")
|
||||
)
|
||||
actual: list[torch.Tensor | None] = [None] * pp_size
|
||||
expected: list[torch.Tensor | None] = [None] * pp_size
|
||||
with LocalRunnerMode(
|
||||
self.world_size,
|
||||
pp_size,
|
||||
lambda pp_index: self._run_dp_pp(mesh, pp_index, actual, expected),
|
||||
):
|
||||
pass
|
||||
|
||||
self.assertEqual(actual, expected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user