[ghstack-poisoned]
This commit is contained in:
Pearu Peterson
2025-11-07 12:15:54 +02:00
520 changed files with 11411 additions and 3171 deletions

View File

@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ARG DEVTOOLSET_VERSION=11
ARG DEVTOOLSET_VERSION=13
RUN yum -y update
RUN yum -y install epel-release
# Install glibc-langpack-en to make sure the en_US.UTF-8 locale is available
RUN yum -y install glibc-langpack-en
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
# Just add everything as a safe.directory for git since these will be used in multiple places with git
RUN git config --global --add safe.directory '*'
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@ -41,6 +41,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh
# Install CUDA
FROM base as cuda
ARG CUDA_VERSION=12.6
ARG DEVTOOLSET_VERSION=13
RUN rm -rf /usr/local/cuda-*
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
@ -50,7 +51,8 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
# Preserve CUDA_VERSION for the builds
ENV CUDA_VERSION=${CUDA_VERSION}
# Make things in our path by default
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
FROM cuda as cuda12.6
RUN bash ./install_cuda.sh 12.6
@ -68,8 +70,22 @@ FROM cuda as cuda13.0
RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
FROM ${ROCM_IMAGE} as rocm_base
ARG DEVTOOLSET_VERSION=13
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
# Install devtoolset on ROCm base image
RUN yum -y update && \
yum -y install epel-release && \
yum -y install glibc-langpack-en && \
yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
RUN git config --global --add safe.directory '*'
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
FROM rocm_base as rocm
ARG PYTORCH_ROCM_ARCH
ARG DEVTOOLSET_VERSION=13
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
@ -88,6 +104,7 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
# Final step
FROM ${BASE_TARGET} as final
ARG DEVTOOLSET_VERSION=13
COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=patchelf /patchelf /usr/local/bin/patchelf
COPY --from=conda /opt/conda /opt/conda

View File

@ -63,7 +63,7 @@ docker build \
--target final \
--progress plain \
--build-arg "BASE_TARGET=${BASE_TARGET}" \
--build-arg "DEVTOOLSET_VERSION=11" \
--build-arg "DEVTOOLSET_VERSION=13" \
${EXTRA_BUILD_ARGS} \
-t ${tmp_tag} \
$@ \

View File

@ -168,6 +168,18 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=no
TRITON=no
;;
pytorch-linux-jammy-py3.12-clang12)
ANACONDA_PYTHON_VERSION=3.12
CLANG_VERSION=12
VISION=no
TRITON=no
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
@ -261,9 +273,9 @@ case "$tag" in
PYTHON_VERSION=3.10
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)
pytorch-linux-jammy-aarch64-py3.10-gcc13)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=13
ACL=yes
VISION=yes
OPENBLAS=yes
@ -281,9 +293,9 @@ case "$tag" in
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=13
ACL=yes
VISION=yes
OPENBLAS=yes

View File

@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then
# Need the official toolchain repo to get alternate packages
add-apt-repository ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-$GCC_VERSION
apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50
# Cleanup package manager
apt-get autoclean && apt-get clean

View File

@ -0,0 +1,56 @@
#!/bin/bash
# Script used only in CD pipeline
#
# Builds libgomp from the GCC 13.3.0 sources and installs it under /usr
# (libraries in /usr/lib64). Only the libgomp target is built and installed;
# the rest of GCC is configured but never compiled.

set -ex

# install dependencies
dnf -y install gmp-devel libmpc-devel texinfo flex bison

cd /usr/local/src
# fetch source for gcc 13
git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0

# out-of-tree build directory inside the checkout
mkdir -p gcc-13.3.0/build-gomp
cd gcc-13.3.0/build-gomp

# configure gcc build
# I got these flags by:
# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
#    dnf install -y dnf-plugins-core rpmdevtools
#    dnf download --source libgomp
# 2. extracting the gcc.spec from the source.
#    rpmdev-extract gcc-xx.src.rpm
# 3. extracting optflags and ld_flags from gcc.spec:
#    rpm --eval '%{optflags}'
#    rpm --eval '%{build_ldflags}'
#
# I had to remove the following flags because they didn't compile for this version of libgomp:
#   -Werror=format-security
#   -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
#   -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
#
# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.

OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
' -fstack-protector-strong -fasynchronous-unwind-tables'\
' -fstack-clash-protection'

LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'

# The compiler/linker flags are passed to configure through its environment.
CFLAGS="$OPT_FLAGS" \
CXXFLAGS="$OPT_FLAGS" \
LDFLAGS="$LDFLAGS" \
../configure \
--prefix=/usr \
--libdir=/usr/lib64 \
--enable-languages=c,c++ \
--disable-multilib \
--disable-bootstrap \
--enable-libgomp

# only build libgomp
make -j"$(nproc)" all-target-libgomp

make install-target-libgomp

# The checkout and build tree are not needed once libgomp is installed;
# remove them so they are not left behind on disk.
cd /usr/local/src
rm -rf gcc-13.3.0

View File

@ -50,6 +50,10 @@ RUN rm install_ninja.sh
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
# Build a newer version of libgomp than the one supported in AlmaLinux 8.
COPY ./common/install_libgomp.sh install_libgomp.sh
RUN bash ./install_libgomp.sh && rm install_libgomp.sh
# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe

View File

@ -1,15 +1,11 @@
sphinx==5.3.0
sphinx==7.2.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
#Pinned versions: 7.2.6
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
pytorch_sphinx_theme2==0.2.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.2.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering,
# but it doesn't seem to work and hangs around idly. The initial thought is that it is
# probably related to the Docker setup. We can investigate this later.
@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.34.0
breathe==4.36.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.34.0
#Pinned versions: 4.36.0
exhale==0.2.3
exhale==0.3.7
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.2.3
#Pinned versions: 0.3.7
docutils==0.16
docutils==0.20
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.16
#Pinned versions: 0.20
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@ -56,13 +52,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
myst-nb==0.17.2
myst-nb==1.3.0
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
#Pinned versions: 1.3.0
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinx-design==0.6.1
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-parser==4.0.1

View File

@ -89,23 +89,41 @@ if [ "$is_main_doc" = true ]; then
make coverage
# Now we have the coverage report, we need to make sure it is empty.
# Count the number of lines in the file and turn that number into a variable
# $lines. The `cut -f1 ...` is to only parse the number, not the filename
# Skip the report header by subtracting 2: the header will be output even if
# there are no undocumented items.
# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
# showing the undocumented count in the third column.
# Example: | TOTAL | 99.83% | 2 |
#
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
# be documented then removed from there.
lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
undocumented=$((lines - 2))
if [ $undocumented -lt 0 ]; then
# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
# The table format is: | Module | Coverage | Undocumented |
# Extract the third column (undocumented count) from the TOTAL row
undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
echo coverage output not found
exit 1
elif [ $undocumented -gt 0 ]; then
echo undocumented objects found:
cat build/coverage/python.txt
elif [ "$undocumented" -gt 0 ]; then
set +x # Disable command echoing for cleaner output
echo ""
echo "====================="
echo "UNDOCUMENTED OBJECTS:"
echo "====================="
echo ""
# Find the line number of the TOTAL row and print only what comes after it
total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
if [ -n "$total_line" ]; then
# Print only the detailed list (skip the statistics table)
tail -n +$((total_line + 2)) build/coverage/python.txt
else
# Fallback to showing entire file if TOTAL line not found
cat build/coverage/python.txt
fi
echo ""
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
# Print a copy-pasteable reproduction command. The inner \$(...) is a command
# substitution that yields the line number of the "| TOTAL" row; the outer
# \$((...)) adds 2 to it so that tail starts after the statistics table.
# Both are escaped so they are printed literally instead of being expanded here.
echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((\$(grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
set -x # Re-enable command echoing
exit 1
fi
else

View File

@ -337,7 +337,7 @@ test_python() {
test_python_smoke() {
# Smoke tests for H100/B200
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}

View File

@ -70,7 +70,7 @@ sccache --zero-stats
sccache --show-stats
# Build the wheel
python -m build --wheel --no-build-isolation
python -m build --wheel --no-isolation
if ($LASTEXITCODE -ne 0) { exit 1 }
# Install the wheel locally

View File

@ -1,11 +1,11 @@
name: 🚀 Release highlight for proposed Feature
name: 🚀 New Feature for Release
description: Submit a Release highlight for proposed Feature
labels: ["release-feature-request"]
body:
- type: textarea
attributes:
label: Release highlight for proposed Feature
label: New Feature for Release
description: >
Example: “A torch.special module, analogous to SciPy's special module.”
- type: input

View File

@ -1 +1 @@
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
ad5816f0eee1c873df1b7d371c69f1f811a89387

View File

@ -1 +1 @@
cfbc5c2f1c798991715a6b06bb3ce46478c4487c
ca2212438fdd8ce29b66999ed70ed54b0f9372d1

View File

@ -2,8 +2,8 @@ tracking_issue: 24422
ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/b200-symm-mem
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
@ -22,6 +22,8 @@ ciflow_push_tags:
- ciflow/inductor-perf-test-nightly-xpu
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/inductor-rocm-mi200
- ciflow/inductor-rocm-mi300
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
@ -33,11 +35,13 @@ ciflow_push_tags:
- ciflow/quantization-periodic
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi200
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/slow-rocm-mi200
- ciflow/torchbench
- ciflow/triton_binaries
- ciflow/trunk

View File

@ -56,6 +56,8 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-py3.14-clang12,
pytorch-linux-jammy-rocm-n-py3,
@ -77,11 +79,11 @@ jobs:
pytorch-linux-noble-riscv64-py3.12-gcc14
]
include:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13
runner: linux.arm64.m7g.4xlarge
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21
runner: linux.arm64.m7g.4xlarge
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600
# Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358

View File

@ -72,7 +72,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.arm64.m7g.4xlarge
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" },

View File

@ -2,12 +2,12 @@ name: inductor-rocm
on:
schedule:
- cron: 0 * * * *
- cron: 0 */3 * * *
push:
branches:
- release/*
tags:
- ciflow/inductor-rocm/*
- ciflow/inductor-rocm-mi200/*
workflow_dispatch:
concurrency:

View File

@ -7,6 +7,7 @@ on:
- release/*
tags:
- ciflow/inductor-rocm/*
- ciflow/inductor-rocm-mi300/*
workflow_dispatch:
concurrency:

View File

@ -33,7 +33,7 @@ jobs:
with:
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
runner: linux.arm64.m7g.4xlarge
test-matrix: |
{ include: [

View File

@ -60,7 +60,7 @@ jobs:
with:
build-environment: linux-jammy-aarch64-py3.10
runner: linux.arm64.m7g.4xlarge
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },

View File

@ -11,7 +11,6 @@ on:
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi200/*
branches:
- release/*

View File

@ -11,6 +11,7 @@ on:
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi300/*
branches:
- release/*

View File

@ -5,11 +5,12 @@ on:
branches:
- release/*
tags:
- ciflow/rocm/*
- ciflow/rocm-mi200/*
workflow_dispatch:
schedule:
- cron: 29 8 * * * # about 1:29am PDT
- cron: 0 * * * *
- cron: 0 */3 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

View File

@ -6,6 +6,7 @@ on:
- main
- release/*
tags:
- ciflow/rocm/*
- ciflow/rocm-mi300/*
workflow_dispatch:
schedule:

81
.github/workflows/slow-rocm-mi200.yml vendored Normal file
View File

@ -0,0 +1,81 @@
# This workflow is dedicated to hosting slow jobs that are run only periodically because
# they are too slow to run on every commit. The list of slow tests can be found in
# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
name: slow-rocm-mi200
on:
push:
branches:
- release/*
tags:
- ciflow/slow/*
- ciflow/slow-rocm-mi200/*
schedule:
- cron: 0 */3 * * *
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read
target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -105,36 +105,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang18-asan-build:
name: linux-jammy-py3.10-clang18-asan
uses: ./.github/workflows/_linux-build.yml

View File

@ -11,15 +11,16 @@ on:
- inductor
- unstable
- slow
- slow-rocm-mi200
- unstable-periodic
- inductor-periodic
- rocm
- rocm-mi200
- rocm-mi300
- rocm-mi355
- inductor-micro-benchmark
- inductor-micro-benchmark-x86
- inductor-cu124
- inductor-rocm
- inductor-rocm-mi200
- inductor-rocm-mi300
- mac-mps
- linux-aarch64

1
.gitignore vendored
View File

@ -127,6 +127,7 @@ torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
torch/_inductor/kernel/vendored_templates/*
minifier_launcher.py
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*

View File

@ -143,7 +143,8 @@ init_command = [
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"',
'numpy==2.3.4 ; python_version >= "3.14"',
'expecttest==0.3.0',
'pyrefly==0.36.2',
'sympy==1.13.3',

View File

@ -174,6 +174,12 @@ class TORCH_API Context {
static long versionCuDNN() {
return detail::getCUDAHooks().versionCuDNN();
}
static long versionRuntimeCuDNN() {
return detail::getCUDAHooks().versionRuntimeCuDNN();
}
static long versionCuDNNFrontend() {
return detail::getCUDAHooks().versionCuDNNFrontend();
}
static bool hasCuSOLVER() {
return detail::getCUDAHooks().hasCuSOLVER();
}

View File

@ -6,6 +6,7 @@
#include <c10/util/Half.h>
#include <c10/util/Metaprogramming.h>
#include <c10/util/complex.h>
#include <torch/headeronly/core/Dispatch.h>
#ifdef __CUDACC__
#include <cuda.h> // For CUDA_VERSION
@ -61,12 +62,9 @@ TORCH_API void record_kernel_function_dtype(std::string name);
} \
} while (0)
#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
case enum_type: { \
AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
return __VA_ARGS__(); \
}
#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \
AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__)
#define AT_DISPATCH_CASE(enum_type, ...) \
AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
@ -95,14 +93,6 @@ TORCH_API void record_kernel_function_dtype(std::string name);
return __VA_ARGS__(); \
}
namespace detail {
inline at::ScalarType scalar_type(at::ScalarType s) {
return s;
}
} // namespace detail
// The AT_DISPATCH_* family of macros provides the ability to
// conveniently generate specializations of a kernel over all of the
// dtypes we care about in PyTorch. We call it "dispatch" because
@ -190,27 +180,13 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
// but we're just being safe (and it doesn't hurt.) Note we must
// use it to shut up warnings about unused store.
#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
[&] { \
const auto& the_type = TYPE; \
constexpr const char* at_dispatch_name = NAME; \
/* don't use TYPE again in case it is an expensive or side-effect op */ \
at::ScalarType _st = ::detail::scalar_type(the_type); \
RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \
switch (_st) { \
__VA_ARGS__ \
default: \
TORCH_CHECK_NOT_IMPLEMENTED( \
false, \
'"', \
at_dispatch_name, \
"\" not implemented for '", \
toString(_st), \
"'"); \
} \
C10_DIAGNOSTIC_POP() \
}()
#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
THO_DISPATCH_SWITCH_TMPL( \
RECORD_KERNEL_FUNCTION_DTYPE, \
TORCH_CHECK_NOT_IMPLEMENTED, \
TYPE, \
NAME, \
__VA_ARGS__)
#define AT_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \

View File

@ -1,3 +1,8 @@
#pragma once
#include <torch/headeronly/core/Dispatch_v2.h>
// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
#include <ATen/Dispatch.h>
// This is a new implementation of the AT_DISPATCH macro family from
@ -74,41 +79,19 @@
// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly
// relied on GPT4 to help me get it right.
// Public API macros
// See documentation above
#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
// This macro lets you pass an arbitrary expression that may contain internal
// commas to another macro without having the commas causing the expression
// to be interpreted as being multiple arguments
#define AT_WRAP(...) __VA_ARGS__
#define AT_FLOAT8_TYPES \
c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
#define AT_INTEGRAL_TYPES \
c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
#define AT_INTEGRAL_TYPES_V2 \
AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
// NB: not *actually* all types
#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
#define AT_ALL_TYPES_AND_COMPLEX \
AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
// Helper macros
THO_DISPATCH_V2_TMPL( \
AT_DISPATCH_SWITCH, \
AT_DISPATCH_CASE, \
TYPE, \
NAME, \
AT_WRAP(BODY), \
__VA_ARGS__)
// Unused helper macros, kept for BC:
#define AT_AP_VAR(N, T, ...) \
AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
#define AT_CONCAT_AUX(a, b) a##b
#define AT_EXPAND(X) X
// Ensure we never have too many scalar types for the expansion here to
// support. To bump this, you must regenerate the macros below.
@ -119,12 +102,6 @@ static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 60);
num_args = 60
nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
args = ', '.join(f'_{i}' for i in range(1, num_args+1))
print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
for i in range(1, num_args+1):
args = ', '.join(f'_{i}' for i in range(1, i+1))
cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
@ -135,8 +112,6 @@ for i in range(1, num_args+1):
// Begin generated code
// clang-format off
#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N
#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)

View File

@ -388,6 +388,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
#ifndef USE_ROCM
at::Half halpha;
at::Half hbeta;
uint32_t mask = -1;
#endif
void * alpha_ptr = &alpha;
void * beta_ptr = &beta;
@ -427,7 +428,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
if (fp16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
mask =
fp16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
@ -444,7 +445,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
if (bf16_reduction !=
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
uint32_t mask =
mask =
bf16_reduction ==
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
@ -511,17 +512,41 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Cdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
// on Blackwell+, we fake a n > 1 matmul when querying heuristics
// to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance
#ifndef USE_ROCM
const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10;
#else
const bool lie_to_cublaslt = false;
#endif
if (lie_to_cublaslt) {
CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T);
CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc);
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
FakeBdesc.descriptor(),
FakeCdesc.descriptor(),
FakeCdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
} else {
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Cdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
}
if (returnedResult == 0) {
cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED;
}
@ -1572,7 +1597,7 @@ bool gemm_and_bias(
}
using opmath_t = at::opmath_type<Dtype>;
opmath_t beta_val = 0; // bias is added in epilogue
opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr
cudaDataType_t abType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
@ -1661,15 +1686,22 @@ bool gemm_and_bias(
_syncCurrentWithCarveoutStream(stream, true);
}
#endif
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
}
const auto epilogue = [&]() -> cublasLtEpilogue_t {
// The cuBLAS documentation indicates that
// *_<ACTIVATION>_BIAS = *_<ACTIVATION>,
// but we keep it verbose here for clarity.
switch (activation) {
case GEMMAndBiasActivationEpilogue::RELU:
return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU;
case GEMMAndBiasActivationEpilogue::GELU:
return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU;
default:
return bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT;
}
}();
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
if (bias != nullptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
if (bias) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
}

View File

@ -24,7 +24,13 @@ namespace detail {
// radix_sort_pairs doesn't interact with value_t other than to copy
// the data, so we can save template instantiations by reinterpreting
// it as an opaque type.
// We use native integer types for 1/2/4/8-byte values to reduce
// register usage in CUDA kernels. For sizes > 8 fall back to char array.
// Opaque N-byte value holder, aligned to N bytes.
// Primary template: generic fallback that stores the bytes as a char array.
template <int N> struct alignas(N) OpaqueType { char data[N]; };
// Specializations for 1/2/4/8 bytes store a single unsigned integer of the
// matching width instead of a char array.
template <> struct alignas(1) OpaqueType<1> { uint8_t data; };
template <> struct alignas(2) OpaqueType<2> { uint16_t data; };
template <> struct alignas(4) OpaqueType<4> { uint32_t data; };
template <> struct alignas(8) OpaqueType<8> { uint64_t data; };
template<typename key_t, int value_size>
void radix_sort_pairs_impl(

View File

@ -21,6 +21,7 @@
#if AT_CUDNN_ENABLED()
#include <ATen/cudnn/cudnn-wrapper.h>
#include <cudnn_frontend.h>
#endif
#if AT_MAGMA_ENABLED()
@ -351,6 +352,26 @@ long CUDAHooks::versionCuDNN() const {
#endif
}
// Returns the cuDNN version number.
// - Built with cuDNN and USE_STATIC_CUDNN not defined: the value is obtained
//   by calling cudnnGetVersion().
// - Built with cuDNN and USE_STATIC_CUDNN defined: the CUDNN_VERSION macro
//   is returned.
// - Built without cuDNN: fails unconditionally through TORCH_CHECK.
long CUDAHooks::versionRuntimeCuDNN() const {
#if AT_CUDNN_ENABLED()
#ifndef USE_STATIC_CUDNN
return cudnnGetVersion();
#else
return CUDNN_VERSION;
#endif
#else
TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
#endif
}
// Returns the CUDNN_FRONTEND_VERSION macro value when built with cuDNN;
// otherwise fails unconditionally through TORCH_CHECK.
long CUDAHooks::versionCuDNNFrontend() const {
#if AT_CUDNN_ENABLED()
return CUDNN_FRONTEND_VERSION;
#else
TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
#endif
}
long CUDAHooks::versionMIOpen() const {
#if AT_ROCM_ENABLED()
return MIOPEN_VERSION_MAJOR * 10000 +

View File

@ -49,6 +49,8 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool hasCUDART() const override;
long versionCUDART() const override;
long versionCuDNN() const override;
long versionRuntimeCuDNN() const override;
long versionCuDNNFrontend() const override;
long versionMIOpen() const override;
std::string showConfig() const override;
double batchnormMinEpsilonCuDNN() const override;

View File

@ -174,6 +174,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}
// Default implementation: always fails through TORCH_CHECK with a message
// that the ATen_cuda library is required. Virtual, so a subclass can
// override it with a real query.
virtual long versionRuntimeCuDNN() const {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}
// Default implementation: always fails through TORCH_CHECK with a message
// that the ATen_cuda library is required. Virtual, so a subclass can
// override it with a real query.
virtual long versionCuDNNFrontend() const {
TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
}
virtual long versionMIOpen() const {
TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
}

View File

@ -1009,12 +1009,25 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) {
}
}
// Produces a counterpart of `self` on `device`.
// A tensor that is both a zero-tensor and a wrapped number is rebuilt with
// at::_efficientzerotensor on the target device and re-flagged as a wrapped
// number; every other tensor is simply moved with Tensor::to.
static Tensor send_to_meta(const Tensor& self, const Device& device) {
  const bool is_wrapped_zero =
      self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number();
  if (!is_wrapped_zero) {
    return self.to(device);
  }
  // Rebuild the zero-tensor on the target device, then restore the
  // wrapped-number flag on the new tensor.
  Tensor zero_on_device =
      at::_efficientzerotensor(self.sizes(), self.options().device(device));
  zero_on_device.unsafeGetTensorImpl()->set_wrapped_number(true);
  return zero_on_device;
}
Tensor mul_zerotensor(const Tensor& self, const Tensor& other) {
auto out_device = correct_out_device(self, other);
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));
auto self_meta = send_to_meta(self, device_);
auto other_meta = send_to_meta(other, device_);
auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta);
return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device));
}
@ -1023,7 +1036,9 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));
auto self_meta = send_to_meta(self, device_);
auto other_meta = send_to_meta(other, device_);
auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta);
if (self._is_zerotensor()) {
if (other._is_zerotensor()) {
@ -1052,8 +1067,9 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);
constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
auto meta_out = at::_ops::add_Tensor::redispatch(
meta_dks, self.to(device_), other.to(device_), alpha);
auto self_meta = send_to_meta(self, device_);
auto other_meta = send_to_meta(other, device_);
auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha);
auto get_out_like = [&] (const Tensor& tensor)
{

View File

@ -409,7 +409,7 @@ struct ConvParams {
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
return false;
}
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
// broken on cuDNN 9.8 - 9.14
if (cudnn_version >= 90800 && cudnn_version < 91500) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@ -453,7 +453,7 @@ struct ConvParams {
}
// native kernel doesn't support 64-bit non-splittable case
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
if (cudnn_version < 0 || cudnn_version > 91000) {

View File

@ -50,18 +50,35 @@ static inline bool parseLinearFlatten3d() {
// `_flatten_nd_linear` flattens all but the last dimension of the input tensor
// before passing it to linear operation
static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
const auto input_sizes = input.sym_sizes();
// can't use -1 in reshape because it errors when a dimension is 0
c10::SymInt flattened_dim = 1;
for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) {
flattened_dim = flattened_dim * input_sizes[i];
const auto input_sizes = input.sym_sizes();
const auto result_flattened = [&]() -> Tensor {
const auto input_ncols = input_sizes.back();
const auto input_flattened_nrows = [&]() -> c10::SymInt {
// can't use -1 in reshape because it errors when a dimension is 0
auto flattened_nrows = c10::SymInt{1};
for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) {
flattened_nrows *= size;
}
return flattened_nrows;
}();
const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols});
if (weight.layout() == c10::kStrided) {
return at::addmm(bias, input_flattened, weight.t());
} else {
// weight is sparse, and addmm for sparse expects matmul lhs to be sparse,
// so we transpose the problem.
// NOTE: at::matmul handles (dense @ sparse) similarly.
const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1);
return at::addmm(bias_t, weight, input_flattened.t()).t();
}
auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)});
const auto result = at::addmm(bias, inp_reshape, weight.t());
auto new_size = input_sizes.slice(0, input_sizes.size() - 1);
c10::SymDimVector sizes_vec(new_size.begin(), new_size.end());
sizes_vec.push_back(result.sym_size(1));
return result.view_symint(sizes_vec);
}();
// Unflatten flattened row dims
auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()};
result_sizes.back() = result_flattened.sym_size(1);
return result_flattened.view_symint(result_sizes);
}
@ -90,15 +107,23 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optional<Ten
// Fused op is marginally faster.
return at::addmm(*bias, input, weight.t());
}
if (bias->defined() && !input.is_xla()) {
// Also hit the fused path for contiguous 3D input, if not using xla
const auto is_bias_likely_fusable = (
bias->defined() &&
// cuBLASLt: will fuse in the epilogue without copies
// when input/weight/bias are all strided.
// When weight is not strided, bias will not be fused,
// but we can still dispatch here to avoid at::matmul
// path which will probably use a very similar
// flattening optimization.
((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false())
);
if (is_bias_likely_fusable && !input.is_xla()) {
// Also hit the fused path for contiguous nD input, if not using xla
// backend. Reshaping/flattening has some performance implications on xla.
bool is_contiguous = input.is_contiguous_or_false();
if (is_contiguous && input_dim == 3) {
if (input.is_contiguous_or_false()) {
return _flatten_nd_linear(input, weight, *bias);
} else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) {
return _flatten_nd_linear(input, weight, *bias);
} else if (parseLinearFlatten3d() && input_dim == 3) {
} else if (parseLinearFlatten3d()) {
// If user forces flattening via env var
const Tensor input_cont = input.contiguous();
return _flatten_nd_linear(input_cont, weight, *bias);

View File

@ -1,5 +1,6 @@
#include <ATen/core/ATen_fwd.h>
#include <c10/core/ScalarType.h>
#include <c10/core/SymInt.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
@ -1710,11 +1711,37 @@ Tensor narrow_symint(
"], but got ",
start,
")")
if (start < 0) {
start = start + cur_size;
auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0));
auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0));
if (cond1 || cond2) {
if (cond1) {
start = start + cur_size;
}
TORCH_SYM_CHECK(
start.sym_le(cur_size - length),
"start (",
start,
") + length (",
length,
") exceeds dimension size (",
cur_size,
").");
return at::slice_symint(self, dim, start, start + length, 1);
}
// Unbacked start handling!
// Bounds check without converting start:
// - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start +
// length <= 0
// - If start >= 0: need start + length <= cur_size
auto end = start + length;
TORCH_SYM_CHECK(
start.sym_le(cur_size - length),
(start.sym_lt(0).sym_and((end).sym_le(0)))
.sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))),
"start (",
start,
") + length (",
@ -1722,7 +1749,28 @@ Tensor narrow_symint(
") exceeds dimension size (",
cur_size,
").");
return at::slice_symint(self, dim, start, start + length, 1);
if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) {
return at::slice_symint(self, dim, start, end, 1);
} else {
// Cannot statically determine the condition due to unbacked.
// This is an interesting situation; when start is negative and
// start + length == 0, slice and narrow do different things.
// i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to
// pass cur_size instead of 0. Otherwise, they would do the same thing.
// This says at runtime: if start < 0 and end == 0, then pass cur_size
// instead of 0.
auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt();
auto result =
at::slice_symint(self, dim, start, end + use_different * cur_size, 1);
// Ensure slice allocated unbacked size is specialized to length.
SymInt new_size = result.sym_size(dim);
TORCH_SYM_CHECK(new_size.sym_eq(length), "")
return result;
}
}
// This overload exists purely for XLA, because they wanted to pass in
@ -1736,8 +1784,8 @@ Tensor narrow_tensor_symint(
start.dim() == 0 &&
isIntegralType(start.scalar_type(), /*includeBool=*/false),
"start must be an 0-dim integral Tensor.");
int64_t st = start.item<int64_t>();
return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length));
c10::SymInt st = start.item().toSymInt();
return at::narrow_symint(self, dim, std::move(st), std::move(length));
}
std::

View File

@ -247,8 +247,8 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
});
}
template <typename func_t, typename vec_func_t, typename ident_t = double>
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, ident_t ident = static_cast<ident_t>(0)) {
template <typename func_t, typename vec_func_t>
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
using traits = binary_function_traits<func_t>;
static_assert(
all_same<

View File

@ -339,13 +339,33 @@ void or_kernel_impl(TensorIterator& iter) {
}
}
// Reduction ops for the min-values kernel: inherits everything from
// at::native::MinOps<scalar_t> and replaces only the final projection.
template<typename scalar_t>
struct MinValuesOps: public at::native::MinOps<scalar_t> {
// Accumulator type, taken from the base MinOps.
using arg_t = typename MinOps<scalar_t>::arg_t;
// Projects the accumulator to the output by returning its `first` member
// (presumably the value half of a (value, index) pair -- TODO confirm
// against MinOps).
static scalar_t project(arg_t arg) {
return arg.first;
}
};
void min_values_kernel_impl(TensorIterator& iter) {
// This case is special because Vectorized<int64_t> does not
// handle upper_bound<int64_t>().
// See: https://github.com/pytorch/pytorch/issues/43254
if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
binary_kernel_reduce(
iter,
MinValuesOps<scalar_t>{},
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
}), kLong, kUInt64);
return;
}
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
binary_kernel_reduce_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
upper_bound<scalar_t>());
static_cast<double>(upper_bound<scalar_t>()));
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
}

View File

@ -147,14 +147,24 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) {
/*
* Check whether for the given input we want to enable the Lt interface
*/
static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) {
static bool isInputCompliesAddmmCudaLt(
Tensor& result,
const Tensor& self,
const Tensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
Activation activation
) {
#ifdef USE_ROCM
// Implies a 2D bias, which we currently do not send through Lt.
// TODO: this check is done before col-major input preparation,
// so this condition can be relaxed in cases when a col-major
// copy of result is needed.
if (result.is_same(self)) {
if (self.is_same(result) || self.dim() == 2) {
return false;
}
#endif
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -169,13 +179,33 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
#if defined(CUDA_VERSION) || defined(USE_ROCM)
const auto scalar_type = mat1.scalar_type();
return (beta.toComplexDouble() == 1.0
// NOTE: row-major result is important when bias is 1D.
// This is because Lt broadcasts 1D bias over the columns
// while the aten::addmm API broadcasts it over the rows,
// and this is in conjunction with the data preparation
// procedure that does not transpose arguments with
// col-major result. For col-major result we need
// to explicitly transpose the problem so that bias is
// correctly applied.
// TODO: enable col-major result if needed.
// TODO: no need to check result's layout when
// !result.is_same(self) and self.dim() == 2, because
// self needs to be copied into result and the bias ptr
// will be ignored.
&& result.dim() == 2 && result.is_contiguous()
// Conditions for bias to be fusable
&& (
self.is_contiguous() &&
// NOTE: fine to have 1-len dims to the left from the right-most one
(self.dim() == 1 || self.squeeze().dim() == 1) &&
self.sizes().back() == mat2_sizes[1]
( // Conditions for bias to be fusable -- implies direct Lt path without copies.
self.is_contiguous() &&
// NOTE: fine to have 1-len dims to the left from the right-most one
(self.dim() == 1 || self.squeeze().dim() == 1) &&
self.sizes().back() == mat2_sizes[1]
)
|| ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self),
// and we need to copy self into result otherwise, so the self's layout becomes irrelevant.
// See also TODO from above.
activation != Activation::None && // Lt is faster when activation is fused
(self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]}))
)
)
&& ( // some dtype restrictions
#ifndef USE_ROCM
@ -270,7 +300,16 @@ bool launchGemmAndBiasCublasLt(
const Scalar& alpha,
Activation activation = Activation::None
) {
const auto* self_ptr = self.const_data_ptr<scalar_t>();
// We apply bias in the epilogue only when it is 1D,
// or when it can be squeezed to 1D.
// self_ptr == nullptr implies ignore bias epilogue
// and use standard gemm-like API.
const auto* self_ptr = [&]() -> auto {
if (self.dim() == 1 || self.squeeze().dim() == 1) {
return self.const_data_ptr<scalar_t>();
}
return static_cast<const scalar_t*>(nullptr);
}();
const auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
@ -356,7 +395,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
#endif
// Condition on the input
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt;
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
// }
at::ScalarType scalar_type = mat1.scalar_type();
@ -366,19 +405,20 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
if (!result.is_same(self)) {
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});
// We use bias ptr in the Lt path only when bias is 1D
const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
if (disable_addmm_cuda_lt) {
// When in non-Lt path we do expand self even before
if (!use_bias_ptr_lt) {
// We do expand self even before
// check for beta != 0.0 to make sure that
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
// runs green.
return expand_size(self, result.sizes(), "addmm");
}
// copy next, should broadcast
return c10::MaybeOwned<Tensor>::borrowed(self);
}();
// We copy bias when in the non-Lt path
if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) {
// We do not copy bias only when we need the bias ptr
if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
// NOTE: self should broadcast over result
at::native::copy_(result, *self_maybe_expanded);
}

View File

@ -884,6 +884,69 @@ struct type_specialized_kernel_launcher {
}
};
// Launcher for one entry (`arg_index`) of rt_binary_specializations on the
// offset-calculator (broadcast) path of a binary elementwise op.
// apply() does nothing unless the runtime dtypes match that entry exactly.
template <int arg_index>
struct type_specialized_broadcast_kernel_launcher {
// numel:       number of elements passed to the kernel launcher.
// f:           binary functor; its argument/result types come from
//              function_traits<func_t>.
// data:        base pointers, indexed 0 = output, 1 = first input,
//              2 = second input (as used below).
// dtypes:      runtime dtypes, indexed the same way as `data`.
// offset_calc: maps a linear index to per-operand offsets via get(idx).
template <
typename func_t,
typename array_t,
typename dtypes_t,
typename calc_t>
static void apply(
int64_t numel,
func_t f,
array_t data,
dtypes_t dtypes,
calc_t offset_calc) {
using traits = function_traits<func_t>;
using ret_t = typename traits::result_type;
using arg0_t = typename traits::template arg<0>::type;
using arg1_t = typename traits::template arg<1>::type;
// Only handle the dtype triple (output, input0, input1) of this entry.
if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
dtypes[1] == rt_binary_specializations[arg_index][1] &&
dtypes[2] == rt_binary_specializations[arg_index][2]) {
// Storage types of the operands, derived from the specialization entry.
using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
// Unrolled path: handle four elements at idx, idx + grp_sz,
// idx + 2 * grp_sz and idx + 3 * grp_sz.
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
// For each element: load in the storage type, convert to the functor's
// argument types, and apply f.
auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
// All four stores are issued after all four results are computed;
// each result is converted to the output storage type first.
*(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
*(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
*(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
*(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
} else {
// Non-unrolled path: a single element at idx.
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
*(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
}
});
}
}
};
} // namespace
#endif
@ -1002,6 +1065,32 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
if (check_binary_rt_types_for_specialization(iter)) {
// constexpr to reduce the number of kernels generated for
// broadcast elementwise with mixed dtypes and to limit which functors are actually
// applied to the load and store at compile time.
using func_tuple = typename traits::ArgsTuple;
if constexpr (
std::is_same_v<float, arg0_t> && traits::arity == 2 &&
check_binary_functor_types_for_specialization<
func_tuple,
float,
float,
traits::arity,
/*arg_num=*/0>::check()) {
memory::detail::static_unroll<
type_specialized_broadcast_kernel_launcher,
rt_binary_specializations.size()>::with_args(
numel,
f,
data,
dtypes,
offset_calc
);
return;
}
}
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {

View File

@ -212,17 +212,12 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({}));
TORCH_CHECK(loss.is_mps());
Tensor loss_squeezed = loss.squeeze();
Tensor input_squeezed = input.squeeze();
Tensor target_squeezed = target.squeeze();
@autoreleasepool {
std::string key =
op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight});
std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight});
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed);
newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed);
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
MPSGraphTensor* bceLossUnweighted = nil;
// if grad_output is defined, then it's a backward pass
@ -252,12 +247,12 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
newCachedGraph->gradInputTensor = bceLoss;
}
} else {
newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size());
newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size());
}
});
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed);
Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed);
Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed);
Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input);
Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target);
Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss);
NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];

View File

@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
const auto s_k = params.key.sym_size(2);
const auto d_qk = params.query.sym_size(3);
const auto d_v = params.value.sym_size(3);
long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
if (cudnn_version < 8903) {
if (debug) {
TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
return false;
#endif
#if defined(CUDNN_VERSION)
static auto cudnn_version = cudnnGetVersion();
static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
if (debug) {
TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");

View File

@ -53,10 +53,8 @@ class AddmmBenchmark(op_bench.TorchBenchmarkBase):
return torch.addmm(input_one, mat1, mat2)
op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(
addmm_long_configs + addmm_long_configs, AddmmBenchmark
)
op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark)
"""Microbenchmark for addbmm operator."""
@ -107,9 +105,7 @@ addbmm_short_configs = op_bench.cross_product_configs(
)
op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark)
op_bench.generate_pt_gradient_test(
addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark
)
op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark)
if __name__ == "__main__":
op_bench.benchmark_runner.main()

View File

@ -1,4 +1,5 @@
#include <c10/core/SymBool.h>
#include <c10/core/SymInt.h>
#include <c10/core/SymNodeImpl.h>
namespace c10 {
@ -111,4 +112,17 @@ bool SymBool::has_hint() const {
return toSymNodeImpl()->has_hint();
}
// Converts this SymBool into a SymInt holding 1 (true) or 0 (false).
SymInt SymBool::toSymInt() const {
  const auto known = maybe_as_bool();
  if (!known) {
    // Value is not known concretely: build the int through sym_ite on the
    // underlying node, selecting between wrapped 1 and wrapped 0.
    auto cond_node = toSymNodeImpl();
    auto when_true = cond_node->wrap_int(1);
    auto when_false = cond_node->wrap_int(0);
    return SymInt(cond_node->sym_ite(when_true, when_false));
  }
  // Concrete bool: return a concrete SymInt.
  return SymInt(*known ? 1 : 0);
}
} // namespace c10

View File

@ -12,6 +12,8 @@
namespace c10 {
class SymInt;
class C10_API SymBool {
public:
/*implicit*/ SymBool(bool b) : data_(b) {}
@ -80,6 +82,10 @@ class C10_API SymBool {
return toSymNodeImplUnowned()->constant_bool();
}
// Convert SymBool to SymInt (0 or 1)
// This is the C++ equivalent of Python's cast_symbool_to_symint_guardless
SymInt toSymInt() const;
bool is_heap_allocated() const {
return ptr_;
}

View File

@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
} else if (key == "graph_capture_record_stream_reuse") {
i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
used_native_specific_option = true;
} else if (key == "per_process_memory_fraction") {
i = parsePerProcessMemoryFraction(tokenizer, i);
used_native_specific_option = true;
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
return i;
}
// Parses the `per_process_memory_fraction:<value>` option.
// `i` is the index of the key token; the next token must be ":" and the one
// after it is read as a double. The value must lie in [0.0, 1.0], otherwise
// TORCH_CHECK_VALUE fails. On success the value is stored in
// m_per_process_memory_fraction and the index of the last consumed token is
// returned.
// NOTE(review): the return type is `double` although the returned value is
// the token index `i` (a size_t) and the caller assigns it back to its index
// variable; sibling parse* functions in this file return size_t. Changing it
// requires updating the declaration in the header at the same time.
double CUDAAllocatorConfig::parsePerProcessMemoryFraction(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK_VALUE(
val_env >= 0.0 && val_env <= 1.0,
"per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
m_per_process_memory_fraction = val_env;
return i;
}
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i) {

View File

@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_graph_capture_record_stream_reuse;
}
// Returns the current value of m_per_process_memory_fraction from the
// singleton config instance.
static double per_process_memory_fraction() {
return instance().m_per_process_memory_fraction;
}
/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
"pinned_use_hip_host_register",
"graph_capture_record_stream_reuse",
"pinned_reserve_segment_size_mb",
"pinned_num_register_threads"};
"pinned_num_register_threads",
"per_process_memory_fraction"};
return keys;
}
@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parseGraphCaptureRecordStreamReuse(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
double parsePerProcessMemoryFraction(
const c10::CachingAllocator::ConfigTokenizer& tokenizer,
size_t i);
std::atomic<size_t> m_pinned_num_register_threads{1};
std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
std::atomic<bool> m_release_lock_on_cudamalloc{false};
std::atomic<bool> m_pinned_use_cuda_host_register{false};
std::atomic<bool> m_graph_capture_record_stream_reuse{false};
std::atomic<double> m_per_process_memory_fraction{1.0};
};
// Keep this for backwards compatibility

View File

@ -1100,7 +1100,7 @@ class RingBuffer {
} // anonymous namespace
} // namespace Native
static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
#ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
void* nvml_handle = DriverAPI::get_nvml_handle();
if (!nvml_handle) {
@ -1111,9 +1111,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
return true;
}();
cudaDeviceProp prop{};
C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
// NOLINTNEXTLINE(*-c-arrays)
char pci_id[80];
snprintf(
@ -1215,14 +1212,16 @@ class DeviceCachingAllocator {
// record used memory.
size_t total_allocated_memory = 0;
size_t allowed_memory_maximum = 0;
cudaDeviceProp device_prop;
// maximum amount of memory that device is allowed to
// allocate. This is set iff memory fraction is less than 1
std::optional<size_t> allowed_memory_maximum{std::nullopt};
// all live expandable segments
std::vector<ExpandableSegment*> expandable_segments_;
std::vector<c10::DeviceIndex> devices_with_peer_access_;
bool set_fraction = false;
bool record_history = false;
std::atomic<CreateContextFn> context_recorder_;
@ -1264,6 +1263,9 @@ class DeviceCachingAllocator {
: device_id(id),
large_blocks(/*small=*/false),
small_blocks(/*small=*/true) {
C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));
setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
stats.max_split_size =
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
@ -1399,7 +1401,7 @@ class DeviceCachingAllocator {
if (!block_found) {
// Do garbage collection if the flag is set.
if (C10_UNLIKELY(
set_fraction &&
allowed_memory_maximum.has_value() &&
AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
garbage_collect_cached_blocks(context);
@ -1456,11 +1458,12 @@ class DeviceCachingAllocator {
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
std::string allowed_info;
if (set_fraction) {
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
if (allowed_memory_maximum.has_value()) {
allowed_info =
format_size(allowed_memory_maximum.value()) + " allowed; ";
}
std::string proc_info = reportProcessMemoryInfo(device_id);
std::string proc_info = reportProcessMemoryInfo(device_prop);
record_trace(
TraceEntry::OOM,
@ -1518,7 +1521,7 @@ class DeviceCachingAllocator {
for (const auto& obs : observers_local) {
obs(device_id,
alloc_size,
set_fraction ? allowed_memory_maximum : device_total,
allowed_memory_maximum.value_or(device_total),
device_free);
}
@ -2015,25 +2018,26 @@ class DeviceCachingAllocator {
/** get memory fraction limiting maximum allocated memory **/
double getMemoryFraction() {
if (!set_fraction) {
if (!allowed_memory_maximum.has_value()) {
return 1.0;
}
size_t device_free = 0;
size_t device_total = 0;
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
return static_cast<double>(allowed_memory_maximum) /
static_cast<double>(device_total);
return static_cast<double>(allowed_memory_maximum.value()) /
static_cast<double>(device_prop.totalGlobalMem);
}
/** set memory fraction to limit maximum allocated memory **/
void setMemoryFraction(double fraction) {
size_t device_free = 0;
size_t device_total = 0;
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
allowed_memory_maximum =
static_cast<size_t>(fraction * static_cast<double>(device_total));
set_fraction = true;
TORCH_CHECK(
0 <= fraction && fraction <= 1,
"invalid fraction:",
fraction,
". Please set within [0, 1].");
allowed_memory_maximum = std::nullopt;
if (fraction < 1.0) {
allowed_memory_maximum = static_cast<size_t>(
fraction * static_cast<double>(device_prop.totalGlobalMem));
}
}
/** get expandable segment size for all the streams on device **/
@ -3010,7 +3014,7 @@ class DeviceCachingAllocator {
BlockPool& pool = *p.pool;
if (C10_UNLIKELY(
set_fraction &&
allowed_memory_maximum.has_value() &&
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count;
@ -3083,7 +3087,7 @@ class DeviceCachingAllocator {
size_t gc_threshold = static_cast<size_t>(
AcceleratorAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum));
static_cast<double>(allowed_memory_maximum.value()));
// No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) {
return;
@ -3161,8 +3165,8 @@ class DeviceCachingAllocator {
bool active_pool =
p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
if (set_fraction &&
total_allocated_memory + size > allowed_memory_maximum) {
if (allowed_memory_maximum.has_value() &&
total_allocated_memory + size > allowed_memory_maximum.value()) {
p.err = cudaErrorMemoryAllocation;
return false;
// Temporarily disable checkpointing & cudagraphs internally
@ -3859,7 +3863,6 @@ class NativeCachingAllocator : public CUDAAllocator {
"Allocator not initialized for device ",
device,
": did you call init?");
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
return device_allocator[device]->getMemoryFraction();
}
@ -3869,12 +3872,6 @@ class NativeCachingAllocator : public CUDAAllocator {
"Allocator not initialized for device ",
device,
": did you call init?");
TORCH_CHECK(
0 <= fraction && fraction <= 1,
"invalid fraction:",
fraction,
". Please set within [0, 1].");
C10_CUDA_CHECK(c10::cuda::SetDevice(device));
device_allocator[device]->setMemoryFraction(fraction);
}

View File

@ -2,6 +2,7 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>

View File

@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
// on the current device each later call sees.
void init(int dev_count) override {
static bool called = [](int dev_count) {
;
// Are there external guarantees init will be called before
// any of the allocator's other functions?
// std::lock_guard<std::mutex> lk(general_mutex);

View File

@ -50,13 +50,7 @@ namespace c10 {
/// However, you should prefer to use ArrayRef when possible, because its use
/// of TORCH_CHECK will lead to better user-facing error messages.
template <typename T>
class ArrayRef
#if !(defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER))
// workaround a ICX bug: ICX does not recognize c10::ArrayRef as a
// template when ArrayRef is specified as `final`:
final
#endif
: public HeaderOnlyArrayRef<T> {
class ArrayRef /*final*/ : public HeaderOnlyArrayRef<T> {
public:
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
/// SmallVector. As inherited constructors won't work with class template

View File

@ -73,6 +73,19 @@ void box_cox_zero_lambda(
}
}
// Per-vector kernel used by box_cox_nonzero_lambda for both the full-width
// loop and the partial tail. It computes
//   pow(max(data + lambda2, k_eps), lambda1) * r - r
// where r is the reciprocal of lambda1 as produced by
// at::vec::fast_recieprocal; the final multiply-subtract is a single fmsub.
template <typename T>
at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
    at::vec::Vectorized<T> data,
    at::vec::Vectorized<T> lambda1,
    at::vec::Vectorized<T> lambda2,
    at::vec::Vectorized<T> k_eps) {
  // Clamp the shifted input from below by k_eps before raising it to a power.
  auto clamped = at::vec::max(data + lambda2, k_eps);
  auto inv_lambda1 = at::vec::fast_recieprocal(lambda1);
  auto powered = clamped.pow(lambda1);
  return at::vec::fmsub(powered, inv_lambda1, inv_lambda1);
}
template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
@ -88,21 +101,18 @@ void box_cox_nonzero_lambda(
auto k_eps_vec = Vec(k_eps);
for(; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(data_ptr + j);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j);
}
for ( ;j < D; ++j) {
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
auto pow = std::pow(max, lambda1_ptr[j]);
out[j] = pow * lambda_over_1 - lambda_over_1;
if (j < D) {
auto remaining = D - j;
auto data = Vec::loadu(data_ptr + j, remaining);
auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j, remaining);
}
}
#else

View File

@ -206,6 +206,41 @@ templates_path = [
os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
]
# TODO: document these and remove them from here.
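# Fixes the duplicated generated file names for entries whose names differ
# only by letter case (e.g. ``adam`` vs. ``Adam``).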
autosummary_filename_map = {
"torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
"torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
"torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
"torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
"torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
"torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
"torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
"torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
"torch.optim.radam.radam": "torch.optim.radam.radam_function",
"torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
"torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
"torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
"torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
"torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
"torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
"torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
"torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
"torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
"torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
"torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
"torch.optim.adam.adam": "torch.optim.adam.adam_function",
"torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
"torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
"torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
"torch.mtia.stream": "torch.mtia.stream_function",
"torch.mtia.Stream": "torch.mtia.Stream_class",
"torch.cpu.stream": "torch.cpu.stream_function",
"torch.cpu.Stream": "torch.cpu.Stream_class",
"torch.cuda.stream": "torch.cuda.stream_function",
"torch.cuda.Stream": "torch.cuda.Stream_class",
"torch.xpu.stream": "torch.xpu.stream_function",
"torch.xpu.Stream": "torch.xpu.Stream_class",
}
coverage_ignore_functions = [
# torch
@ -3195,6 +3230,11 @@ autodoc_type_aliases = {
# Enable overriding of function signatures in the first line of the docstring.
autodoc_docstring_signature = True
# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
autodoc_default_options = {
"exclude-members": "from_bytes, to_bytes",
}
# -- katex javascript in header
#
# def setup(app):

View File

@ -619,6 +619,10 @@ Available options:
and reallocate buffers across multiple streams, especially when the capture DAG frequently
reaches joined frontiers.
* ``per_process_memory_fraction`` option limits the amount of memory that can be allocated
on each CUDA device to the specified fraction of that device's total memory. The value must be
between 0 and 1; a value of 1 (the default) disables the limit. Attempting to allocate more
memory will raise an out of memory error.
.. note::
Some stats reported by the

View File

@ -46,6 +46,108 @@ These headers are promised to be ABI stable across releases and adhere to a stro
Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
which will handle all the rough edges of the C API for the user.
## Migrating your kernel to the LibTorch stable ABI
If you'd like your kernel to be ABI stable with LibTorch, meaning you'd like the ability to build for one version and run on another, your kernel must only use the limited stable ABI. The following section goes through some steps of migrating an existing kernel and the APIs we imagine you would need to swap over.
Firstly, instead of registering kernels through `TORCH_LIBRARY`, LibTorch ABI stable kernels must be registered via `STABLE_TORCH_LIBRARY`. Note that, for the time being, implementations registered via `STABLE_TORCH_LIBRARY` must be boxed unlike `TORCH_LIBRARY`. See the simple example below or our docs on [Stack-based APIs](stack-based-apis) for more details. For kernels that are registered via `pybind`, before using the stable ABI, it would be useful to migrate to register them via `TORCH_LIBRARY`.
While previously your kernels might have included APIs from `<torch/*.h>` (for example, `<torch/all.h>`), they are now limited to including from the 3 categories of headers mentioned above (`torch/csrc/stable/*.h`, `torch/headeronly/*.h` and the stable C headers). This means that your extension should no longer use any utilities from the `at::` or `c10::` namespaces but instead use their replacements in `torch::stable` and `torch::headeronly`. To provide a couple examples of the necessary migrations:
- all uses of `at::Tensor` must be replaced with `torch::stable::Tensor`
- all uses of `TORCH_CHECK` must be replaced with `STD_TORCH_CHECK`
- all uses of `at::kCUDA` must be replaced with `torch::headeronly::kCUDA` etc.
- native functions such as `at::pad` must be replaced with `torch::stable::pad`
- native functions that are called as Tensor methods (e.g., `Tensor.pad`) must be replaced with the ATen variant through `torch::stable::pad`.
As mentioned above, the LibTorch stable ABI is still under development. If there is any API or feature you would like to see added to the stable ABI/`torch::headeronly`/`torch::stable`, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).
Below is a simple example of migrating an existing kernel that uses `TORCH_LIBRARY` to the stable ABI (`STABLE_TORCH_LIBRARY`). For a larger end-to-end example, you can take a look at the FA3 repository, specifically the diff between [`flash_api.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api.cpp#L1) and the stable variant [`flash_api_stable.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api_stable.cpp#L1).
### Original Version with `TORCH_LIBRARY`
```cpp
// original_kernel.cpp - Using TORCH_LIBRARY (not stable ABI)
#include <torch/torch.h>
#include <ATen/ATen.h>
namespace myops {
// Simple kernel that adds a scalar value to each element of a tensor
at::Tensor add_scalar(const at::Tensor& input, double scalar) {
TORCH_CHECK(input.scalar_type() == at::kFloat, "Input must be float32");
return input.add(scalar);
}
// Register the operator
TORCH_LIBRARY(myops, m) {
m.def("add_scalar(Tensor input, float scalar) -> Tensor", &add_scalar);
}
// Register the implementation
TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
m.impl("add_scalar", &add_scalar);
}
} // namespace myops
```
### Migrated Version with `STABLE_TORCH_LIBRARY`
```cpp
// stable_kernel.cpp - Using STABLE_TORCH_LIBRARY (stable ABI)
// (1) Don't include <torch/torch.h> <ATen/ATen.h>
// only include APIs from torch/csrc/stable, torch/headeronly and C-shims
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor_struct.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/stableivalue_conversions.h>
#include <torch/headeronly/core/ScalarType.h>
#include <torch/headeronly/macros/Macros.h>
namespace myops {
// Simple kernel that adds a scalar value to each element of a tensor
torch::stable::Tensor add_scalar(const torch::stable::Tensor& input, double scalar) {
// (2) use STD_TORCH_CHECK instead of TORCH_CHECK
STD_TORCH_CHECK(
// (3) use torch::headeronly::kFloat instead of at::kFloat
input.scalar_type() == torch::headeronly::kFloat,
"Input must be float32");
// (4) Use stable ops namespace instead of input.add
return torch::stable::add(input, scalar);
}
// (5) Add Boxed wrapper required for STABLE_TORCH_LIBRARY
void boxed_add_scalar(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
// Extract arguments from stack using `to<T>`
auto input = to<torch::stable::Tensor>(stack[0]);
auto scalar = to<double>(stack[1]);
// Call the actual kernel
auto result = add_scalar(input, scalar);
// Put result back on stack using `from()`
// Stack slot 0 now holds the return value
stack[0] = from(result);
}
// (6) Register the operator using STABLE_TORCH_LIBRARY
STABLE_TORCH_LIBRARY(myops, m) {
m.def("add_scalar(Tensor input, float scalar) -> Tensor", &boxed_add_scalar);
}
// (7) Register the implementation using STABLE_TORCH_LIBRARY_IMPL
STABLE_TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
m.impl("add_scalar", &boxed_add_scalar);
}
} // namespace myops
```
## How are objects passed across the ABI boundary when interacting with the dispatcher?
@ -109,6 +211,7 @@ There are two invariants for the stack:
a. When calling a stack-based API, you must give owning references to the calling stack and steal references from the returned stack.
b. When registering your function to be called with a stack, you must steal references from your argument stack and push onto the stack new references.
(stack-based-apis)=
### Stack-based APIs
The above is relevant in two places:

View File

@ -253,7 +253,6 @@ regular full-precision tensor.
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
view
as_strided

View File

@ -630,6 +630,37 @@ def mirror_files_into_torchgen() -> None:
raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")
def mirror_inductor_external_kernels() -> None:
    """
    Mirror external kernel sources into the Inductor tree so they can be imported.

    Each (destination, origin) pair is processed in order: a file origin is
    copied over the destination, and a directory origin replaces the
    destination tree.

    Raises:
        RuntimeError: if an origin path is neither a file nor a directory.
    """
    mirrored = [
        (
            CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
            CWD
            / "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
        ),
    ]
    for destination, origin in mirrored:
        # Make sure the destination's parent directories are in place.
        if not destination.exists():
            destination.parent.mkdir(parents=True, exist_ok=True)

        if origin.is_file():
            shutil.copyfile(origin, destination)
        elif origin.is_dir():
            # copytree refuses to write into an existing tree, so clear it first.
            if destination.exists():
                shutil.rmtree(destination)
            shutil.copytree(origin, destination)
        else:
            raise RuntimeError(
                "Check the file paths in `mirror_inductor_external_kernels()`"
            )
# ATTENTION: THIS IS AI SLOP
def extract_variant_from_version(version: str) -> str:
"""Extract variant from version string, defaulting to 'cpu'."""
@ -1616,6 +1647,8 @@ def main() -> None:
if RUN_BUILD_DEPS:
build_deps()
mirror_inductor_external_kernels()
(
ext_modules,
cmdclass,
@ -1649,6 +1682,7 @@ def main() -> None:
"_inductor/codegen/aoti_runtime/*.cpp",
"_inductor/script.ld",
"_inductor/kernel/flex/templates/*.jinja",
"_inductor/kernel/templates/*.jinja",
"_export/serde/*.yaml",
"_export/serde/*.thrift",
"share/cmake/ATen/*.cmake",

View File

@ -75,6 +75,7 @@ class TestScheduler(TestCase):
class TestCubicScheduler(TestCase):
def setUp(self):
super().setUp()
self.model_sparse_config = [
{"tensor_fqn": "0.weight", "sparsity_level": 0.8},
{"tensor_fqn": "2.weight", "sparsity_level": 0.4},

View File

@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import IS_LINUX, run_tests, TestCase
@unittest.skipIf(not IS_LINUX, "Only works on linux")
class TestTorchrun(TestCase):
def setUp(self):
super().setUp()
self._test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__)
def tearDown(self):

View File

@ -11,6 +11,8 @@ set(AOTI_ABI_CHECK_TEST_SRCS
${AOTI_ABI_CHECK_TEST_ROOT}/test_accessor.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_cast.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch_v2.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
@ -46,6 +48,10 @@ endif()
# Disable unused-variable warnings for variables that are only used to test compilation
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
# Add -Wno-dangling-pointer for GCC 13
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
target_compile_options_if_supported(test_aoti_abi_check -Wno-dangling-pointer)
endif()
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})

View File

@ -0,0 +1,82 @@
#include <gtest/gtest.h>
#include <torch/headeronly/core/Dispatch.h>
#include <torch/headeronly/core/Dispatch_v2.h>
// MY_PRIVATE_CHECK_SELECTIVE_BUILD is a prelude to case block. For
// testing, we do nothing:
#define MY_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type) /* empty */
#define MY_PRIVATE_CASE_TYPE_USING_HINT(...) \
THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \
MY_PRIVATE_CHECK_SELECTIVE_BUILD, __VA_ARGS__)
#define MY_DISPATCH_CASE(...) \
THO_DISPATCH_CASE_TMPL(MY_PRIVATE_CASE_TYPE_USING_HINT, __VA_ARGS__)
// MY_RECORD_KERNEL_FUNCTION_DTYPE is a prelude to switch
// statement. For testing, we just avoid unused variable warning:
#define MY_RECORD_KERNEL_FUNCTION_DTYPE(DISPATCHNAME, ENUMTYPE) \
(void)DISPATCHNAME
// MY_CHECK_NOT_IMPLEMENTED is called in switch default block. For
// testing, we count case mismatches:
#define MY_CHECK_NOT_IMPLEMENTED(...) default_count++
#define MY_DISPATCH_SWITCH(...) \
THO_DISPATCH_SWITCH_TMPL( \
MY_RECORD_KERNEL_FUNCTION_DTYPE, MY_CHECK_NOT_IMPLEMENTED, __VA_ARGS__)
// MY_CASE_FUNCTION is called in a case block. For testing, we count
// case matches and ensure that scalar_t/index_t type is defined:
#define MY_CASE_FUNCTION \
[&] { \
count++; \
scalar_t tmp; \
(void)tmp; \
}
#define MY_INDEX_CASE_FUNCTION \
[&] { \
count++; \
index_t tmp; \
(void)tmp; \
}
#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
#define MY_DISPATCH_V2(TYPE, NAME, BODY, ...) \
THO_DISPATCH_V2_TMPL( \
MY_DISPATCH_SWITCH, \
MY_DISPATCH_CASE, \
TYPE, \
NAME, \
AT_WRAP(BODY), \
__VA_ARGS__)
// TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...) defines a gtest case that runs
// MY_DISPATCH_V2 once for every ScalarType listed by
// AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS, dispatching over the type
// list passed in __VA_ARGS__. MY_CASE_FUNCTION increments `count` when a case
// matches and MY_CHECK_NOT_IMPLEMENTED increments `default_count` in the
// default branch, so the test expects `count == EXPECTEDCOUNT` and the two
// counters to add up to the number of scalar types visited.
#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...) \
TEST(TestDispatchV2, NAME) { \
using torch::headeronly::ScalarType; \
using torch::headeronly::impl::ScalarTypeToCPPTypeT; \
int8_t total_count = 0; \
int8_t count = 0; \
int8_t default_count = 0; \
for (ScalarType t : \
{AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) { \
total_count++; \
MY_DISPATCH_V2(t, "test_my_dispatch_v2", MY_CASE_FUNCTION, __VA_ARGS__); \
} \
EXPECT_EQ(count, EXPECTEDCOUNT); \
EXPECT_EQ(default_count + count, total_count); \
}
TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
#undef DEFINE_ITEM

View File

@ -0,0 +1,45 @@
#include <gtest/gtest.h>
#include <torch/headeronly/core/Dispatch_v2.h>
#include <torch/headeronly/util/Exception.h>
#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
// TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...) defines a gtest case that calls
// THO_DISPATCH_V2 once for every ScalarType listed by
// AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS, dispatching over the type
// list passed in __VA_ARGS__. The dispatched lambda increments `count` and
// declares a `scalar_t` variable to check that the type alias is defined.
// Any exception thrown by the dispatch is caught and counted in
// `default_count`. The test expects `count == EXPECTEDCOUNT` and the two
// counters to add up to the number of scalar types visited.
#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...) \
TEST(TestThoDispatchV2, NAME) { \
using torch::headeronly::ScalarType; \
using torch::headeronly::impl::ScalarTypeToCPPTypeT; \
int8_t total_count = 0; \
int8_t count = 0; \
int8_t default_count = 0; \
for (ScalarType t : \
{AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) { \
total_count++; \
try { \
THO_DISPATCH_V2( \
t, \
"test_tho_dispatch_v2", \
[&] { \
count++; \
scalar_t tmp; \
(void)tmp; \
}, \
__VA_ARGS__); \
} catch (...) { \
default_count++; /* counts mismatches */ \
} \
} \
EXPECT_EQ(count, EXPECTEDCOUNT); \
EXPECT_EQ(default_count + count, total_count); \
}
TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
#undef DEFINE_ITEM

View File

@ -70,6 +70,13 @@ if(NOT MSVC)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
target_compile_options_if_supported(test_api "-Wno-error=nonnull")
endif()
# Add -Wno-error=array-bounds for GCC 13+
# See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113239
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
target_compile_options_if_supported(test_api "-Wno-error=array-bounds")
endif()
endif()
if(INSTALL_TEST)

View File

@ -64,7 +64,7 @@ def run(initializer):
def main():
initializer_parameter_map = {}
for initializer in INITIALIZERS.keys():
for initializer in INITIALIZERS:
sys.stderr.write(f"Evaluating {initializer} ...\n")
initializer_parameter_map[initializer] = run(initializer)

View File

@ -130,7 +130,7 @@ def main():
options = parser.parse_args()
optimizer_parameter_map = {}
for optimizer in OPTIMIZERS.keys():
for optimizer in OPTIMIZERS:
sys.stderr.write(f"Evaluating {optimizer} ...\n")
optimizer_parameter_map[optimizer] = run(
optimizer, options.iterations, options.sample_every

View File

@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class TestCustomBackend(TestCase):
def setUp(self):
super().setUp()
# Load the library containing the custom backend.
self.library_path = get_custom_backend_library_path()
torch.ops.load_library(self.library_path)

View File

@ -18,6 +18,7 @@ torch.ops.import_module("pointwise")
class TestCustomOperators(TestCase):
def setUp(self):
super().setUp()
self.library_path = get_custom_op_library_path()
ops.load_library(self.library_path)

View File

@ -22,6 +22,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class TestMakeCheckpointer(TestCase):
def setUp(self) -> None:
super().setUp()
# Create a temporary directory for checkpoints
self.temp_dir = tempfile.mkdtemp()

View File

@ -161,6 +161,7 @@ class TestCheckpointProcessConfig(TestCase):
class TestCheckpointProcess(TestCase):
def setUp(self) -> None:
super().setUp()
"""Set up common test fixtures."""
self.rank_info = RankInfo(
global_world_size=1,

View File

@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class TestCheckpointReader(TestCase):
def setUp(self):
super().setUp()
# Create a temporary directory for test checkpoints
self.temp_dir = tempfile.mkdtemp()

View File

@ -52,6 +52,7 @@ class TestCheckpointWriterConfig(TestCase):
class TestCheckpointWriter(TestCase):
def setUp(self):
super().setUp()
# Create a temporary directory for test checkpoints
self.temp_dir = tempfile.mkdtemp()

View File

@ -52,6 +52,7 @@ class TestCheckpointer(TestCase):
"""Parameterized tests that work with both sync and async checkpointers."""
def setUp(self):
super().setUp()
# Create a temporary directory for checkpoints
self.temp_dir = tempfile.mkdtemp()
@ -397,6 +398,7 @@ class TestAsyncCheckpointerSpecific(TestCase):
"""Tests specific to AsyncCheckpointer functionality."""
def setUp(self):
super().setUp()
# Create a temporary directory for checkpoints
self.temp_dir = tempfile.mkdtemp()

View File

@ -12,6 +12,7 @@ from torch.testing._internal.common_utils import requires_cuda, run_tests, TestC
class TestDefaultStager(TestCase):
def setUp(self) -> None:
super().setUp()
# Create a test state dictionary with various data types
self.state_dict = {
"model": torch.nn.Linear(10, 5).state_dict(),

View File

@ -208,7 +208,7 @@ class TestSingleRankSaveLoad(TestCase):
# Create model.safetensors.index.json with weight mapping
weight_map = {}
for key in quantized_checkpoint.keys():
for key in quantized_checkpoint:
weight_map[key] = "model.safetensors"
index_data = {
@ -245,7 +245,7 @@ class TestSingleRankSaveLoad(TestCase):
sorted(original_tensors.keys()), sorted(state_dict_to_load.keys())
)
for tensor_name in original_tensors.keys():
for tensor_name in original_tensors:
original = original_tensors[tensor_name]
loaded = state_dict_to_load[tensor_name]

View File

@ -15,6 +15,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class TestQuantizedHfStorage(TestCase):
def setUp(self):
super().setUp()
"""Set up common test fixtures."""
self.temp_dir = tempfile.TemporaryDirectory()
self.path = self.temp_dir.name

View File

@ -21,6 +21,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class SignalHandlingTest(TestCase):
def setUp(self):
super().setUp()
# Save original environment variable if it exists
self.original_signals_env = os.environ.get(
"TORCHELASTIC_SIGNALS_TO_HANDLE", None

View File

@ -498,7 +498,7 @@ class TestFSDPMixedPrecision(FSDPTest):
for name, tensor in state_dict.items():
# Parameters and buffers are checkpointed in their
# original dtypes, which may be different.
if name in named_buffers.keys():
if name in named_buffers:
self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE)
else:
self.assertEqual(

View File

@ -16,6 +16,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class LauncherApiTest(TestCase):
def setUp(self):
super().setUp()
# Save original environment variable if it exists
self.original_signals_env = os.environ.get(
"TORCHELASTIC_SIGNALS_TO_HANDLE", None

View File

@ -21,6 +21,7 @@ from torch.distributed.pipelining import (
from torch.distributed.pipelining._utils import generate_stage_to_rank_mapping
from torch.distributed.pipelining.schedules import (
_Action,
_add_reduce_grad,
_add_send_recv,
_add_unshard_reshard,
_format_pipeline_order,
@ -574,6 +575,45 @@ class TestScheduleLowering(TestCase):
),
)
@parametrize(
"test_info",
[
{
"compute": ["0F0", "0F1", " ", "0B0", "0B1"],
"comms": ["0F0", "0F1", "0B0", "0B1", "0REDUCE_GRAD"],
},
{
"compute": ["0F0", "0F1", "1F0", "1F1", "1B0", "1B1", "0B0", "0B1"],
"comms": [
"0F0",
"0F1",
"1F0",
"1F1",
"1B0",
"1B1",
"1REDUCE_GRAD",
"0B0",
"0B1",
"0REDUCE_GRAD",
],
},
],
)
def test_reduce_grad(self, test_info):
    """Check that `_add_reduce_grad` maps the "compute" schedule to the "comms" one."""
    compute_actions = self._parse_actions(test_info["compute"])
    expected_actions = self._parse_actions(test_info["comms"])

    actual_actions = _add_reduce_grad(compute_actions, 2)
    # strict=True makes a length mismatch raise instead of being truncated.
    for expected, actual in zip(expected_actions, actual_actions, strict=True):
        mismatch_msg = (
            f"Mismatch: expected action {expected} but found {actual}."
            f"\nWhole Schedule: {actual_actions}"
        )
        self.assertEqual(expected, actual, mismatch_msg)
@parametrize(
"test_info",
[

View File

@ -46,6 +46,7 @@ from torch.testing._internal.common_utils import (
parametrize,
run_tests,
skip_but_pass_in_sandcastle_if,
TEST_MULTIACCELERATOR,
)
@ -56,7 +57,6 @@ batch_size = 64
torch.manual_seed(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)
TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2
@dataclass

View File

@ -24,6 +24,7 @@ from torch.testing._internal.common_utils import (
parametrize,
run_tests,
skip_but_pass_in_sandcastle_if,
TEST_MULTIACCELERATOR,
)
from torch.utils._pytree import tree_map_only
@ -34,7 +35,6 @@ chunks = 8
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
backend = dist.get_default_backend_for_device(device_type)
TEST_MULTIACCELERATOR = torch.accelerator.device_count() >= 2
torch.manual_seed(0)

View File

@ -50,22 +50,24 @@ class TestDTensorDebugMode(TestCase):
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode(
record_torchfunction=True, record_ids=True, record_output=True
) as debug_mode:
torch.mm(x_dtensor, y_dtensor).sum()
self.assertExpectedInline(
debug_mode.debug_string(),
"""\
torch.mm(dt: f32[8, 8]| S(0), dt: f32[8, 32]| S(0))
aten::mm(dt: f32[8, 8]| S(0), dt: f32[8, 32]| S(0))
torch.mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0)) -> dt$6: f32[8, 32]| S(0)
aten::mm(dt$0: f32[8, 8]| S(0), dt$1: f32[8, 32]| S(0))
redistribute_input(1, S(0) -> R)
redistribute_input(t: f32[1, 32], trace: S(0)->R)
_c10d_functional::all_gather_into_tensor(t: f32[1, 32], 8, 0)
_c10d_functional::wait_tensor(t: f32[8, 32])
aten::mm(t: f32[1, 8], t: f32[8, 32])
<method 'sum' of 'torch._C.TensorBase' objects>(dt: f32[8, 32]| S(0))
aten::sum(dt: f32[8, 32]| S(0))
aten::sum(t: f32[1, 32])""",
redistribute_input(t$2: f32[1, 32], trace: S(0)->R)
_c10d_functional::all_gather_into_tensor(t$2: f32[1, 32], 8, 0) -> t$3: f32[8, 32]
_c10d_functional::wait_tensor(t$3: f32[8, 32]) -> t$3: f32[8, 32]
aten::mm(t$4: f32[1, 8], t$3: f32[8, 32]) -> t$5: f32[1, 32]
<method 'sum' of 'torch._C.TensorBase' objects>(dt$6: f32[8, 32]| S(0)) -> dt$8: f32[]| P
aten::sum(dt$6: f32[8, 32]| S(0))
aten::sum(t$5: f32[1, 32]) -> t$7: f32[]""",
)
self.assertTrue(isinstance(debug_mode.operators[0], _OpCall))

View File

@ -3,7 +3,8 @@
import itertools
import random
import unittest
from typing import Any, Callable, ClassVar, Optional
from collections.abc import Callable
from typing import Any, ClassVar, Optional
import torch
import torch.distributed as dist

View File

@ -1,11 +1,18 @@
# Owner(s): ["oncall: distributed"]
import itertools
from contextlib import nullcontext
from typing import Any
import torch
import torch.distributed as dist
from torch.distributed._local_tensor import (
local_tensor_mode,
LocalTensor,
LocalTensorMode,
)
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, DTensor
from torch.distributed.tensor import DeviceMesh, distribute_tensor, DTensor
from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
from torch.distributed.tensor._utils import (
_compute_local_shape_and_global_offset,
@ -14,6 +21,7 @@ from torch.distributed.tensor._utils import (
compute_global_tensor_shape,
compute_local_shape_and_global_offset,
compute_local_tensor_info,
ExplicitRedistributionContext,
)
from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.placement_types import (
@ -851,5 +859,93 @@ class Test2DStridedLocalShard(DTensorTestBase):
self.assertEqual(global_tensor, dtensor_2d.full_tensor())
class LocalTensorTestBase(TestCase):
def assertEqual(self, lhs, rhs, **kwargs):
mode = local_tensor_mode()
with nullcontext() if mode is None else mode.disable():
if isinstance(lhs, LocalTensor) and isinstance(rhs, LocalTensor):
assert isinstance(lhs, LocalTensor) and isinstance(rhs, LocalTensor)
super().assertEqual(lhs._ranks, rhs._ranks)
for r in lhs._ranks:
super().assertEqual(
lhs._local_tensors[r],
rhs._local_tensors[r],
lambda m: f"rank {r}: {m}",
)
elif isinstance(lhs, LocalTensor) or isinstance(rhs, LocalTensor):
lhs, rhs = (lhs, rhs) if isinstance(lhs, LocalTensor) else (rhs, lhs)
for r in lhs._ranks:
super().assertEqual(
lhs._local_tensors[r], rhs, lambda m: f"rank {r}: {m}"
)
else:
return super().assertEqual(lhs, rhs, **kwargs)
@property
def world_size(self):
raise NotImplementedError("override world-size in your subclass")
def build_device_mesh(self) -> DeviceMesh:
return init_device_mesh("cpu", (self.world_size,))
def setUp(self):
super().setUp()
torch.distributed.init_process_group(
# TODO: test other ranks too
"fake",
rank=0,
world_size=self.world_size,
)
def tearDown(self):
super().tearDown()
try:
dist.destroy_process_group()
except AssertionError:
pass
class TestExplicitRedistribute(LocalTensorTestBase):
    """Exercises DTensor matmul with and without ``ExplicitRedistributionContext``."""

    @property
    def world_size(self):
        """Use four simulated ranks."""
        return 4

    def test_explicit_matmul(self):
        """Matmul of two ``Shard(0)`` DTensors under explicit redistribution.

        Outside the context the matmul is expected to issue one collective;
        inside it the same call must raise, while a manually replicated operand
        is accepted in forward and the error is expected during backward.
        """
        with LocalTensorMode(self.world_size):
            device_mesh = self.build_device_mesh()
            hidden = 128
            x = torch.randn(8, hidden, requires_grad=True)
            A = torch.randn(hidden, hidden, requires_grad=True)

            # Both operands start out sharded on dim 0.
            x_dt = distribute_tensor(x, device_mesh, [Shard(0)])
            a_dt = distribute_tensor(A, device_mesh, [Shard(0)])

            # Default behaviour: the implicit redistribute shows up as one collective.
            with CommDebugMode() as comms:
                torch.matmul(x_dt, a_dt)
            self.assertEqual(comms.get_total_counts(), 1)

            # Inside the context the same matmul is rejected.
            with ExplicitRedistributionContext(), self.assertRaisesRegex(
                RuntimeError, "Implicit redistribution"
            ):
                torch.matmul(x_dt, a_dt)

            # A manual redistribute inside the context is allowed.
            with ExplicitRedistributionContext():
                a_repl = a_dt.redistribute(device_mesh, [Replicate()])
                torch.matmul(x_dt, a_repl)

            x_dt = distribute_tensor(x, device_mesh, [Shard(0)])
            # NOTE(review): this freshly replicated tensor is never passed to the
            # matmul below, which consumes ``a_repl`` -- confirm that is intended.
            a_dt = distribute_tensor(A, device_mesh, [Replicate()])
            with ExplicitRedistributionContext():
                out = torch.matmul(x_dt, a_repl)
                loss = out.sum()
                # Forward succeeds; the error is expected during backward.
                with self.assertRaisesRegex(RuntimeError, "Implicit redistribution"):
                    loss.backward()
if __name__ == "__main__":
run_tests()

View File

@ -1189,9 +1189,7 @@ class AbstractCommTest:
self.assertEqual(len(set(rank_to_seq_num.values())), 2)
self.assertEqual(rank_to_seq_num[0], rank_to_seq_num[2])
expected_same = {
rank_to_seq_num[i]
for i in rank_to_seq_num.keys()
if i not in [0, 2]
rank_to_seq_num[i] for i in rank_to_seq_num if i not in [0, 2]
}
self.assertEqual(len(expected_same), 1)
self.assertEqual(rank_to_seq_num[0] + 1, rank_to_seq_num[1])
@ -1558,7 +1556,7 @@ class CommTest(AbstractCommTest, MultiProcessTestCase):
}
invalid_debug_modes = ["foo", 0, 1, -1]
for mode in mapping.keys():
for mode in mapping:
os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
dist.set_debug_level_from_env()
set_debug_mode = dist.get_debug_level()

View File

@ -2357,6 +2357,7 @@ class ReducerModule(nn.Module):
class ReducerTest(TestCase):
def setUp(self):
super().setUp()
self.file = tempfile.NamedTemporaryFile(delete=False)
world_size = 1
self.store = c10d.FileStore(self.file.name, world_size)

View File

@ -252,6 +252,7 @@ class ProcessGroupNCCLNoGPUTest(TestCase):
MAIN_PROCESS_RANK = 0
def setUp(self):
super().setUp()
self.rank = self.MAIN_PROCESS_RANK
self.world_size = 1
self.file = tempfile.NamedTemporaryFile(delete=False)
@ -5789,6 +5790,229 @@ class NCCLTraceTest(NCCLTraceTestBase):
else:
self.assertTrue("duration_ms" not in t["entries"][0])
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
@parametrize("timing_enabled", [True, False])
def test_fr_record_reset_circular_buffer_full(self, timing_enabled):
    """
    Fill the 10-entry trace buffer, reset the flight recorder, fill the
    buffer again, and check that the dump contains exactly the 10 entries
    recorded after the reset, with record ids restarting at 0.
    """
    if self.rank == self.MAIN_PROCESS_RANK:
        return

    def run_allreduces(count):
        # Issue ``count`` all-reduces, then sync and pause before dumping.
        for _ in range(count):
            work = pg.allreduce(payload)
            work.wait()
        torch.cuda.synchronize(device=device)
        time.sleep(1)

    # Shrink the trace buffer to 10 entries to keep the test fast.
    os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
    pg = self._create_process_group_nccl()
    if timing_enabled:
        pg._enable_collectives_timing()
    device = self.local_device
    self.set_thread_name("fr_test_thread")
    payload = torch.full((3, 4), float(self.rank), device=device)

    # First fill: the dump should report all 10 entries.
    run_allreduces(10)
    trace = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
    self.assertEqual(len(trace["entries"]), 10)

    # Reset, then fill the buffer completely a second time.
    torch._C._distributed_c10d._reset_fr_recording_nccl()
    run_allreduces(10)

    # Exactly 10 entries must come back, not 20.
    trace = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
    self.assertEqual(len(trace["entries"]), 10)

    # Every entry is an all-reduce and ids count up from 0 after the reset.
    for expected_id, record in enumerate(trace["entries"]):
        self.assertIn("profiling_name", record)
        self.assertEqual(record["profiling_name"], "nccl:all_reduce")
        self.assertIn("record_id", record)
        self.assertEqual(record["record_id"], expected_id)

    dist.destroy_process_group()
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
@parametrize("timing_enabled", [True, False])
def test_fr_record_reset_partial_overwrite(self, timing_enabled):
    """
    Fill the 10-entry trace buffer, reset the flight recorder, then record
    only 3 more collectives.  The dump must contain just those 3 entries,
    i.e. entries recorded before the reset are not reported.
    """
    if self.rank == self.MAIN_PROCESS_RANK:
        return

    def run_allreduces(count):
        # Issue ``count`` all-reduces, then sync and pause before dumping.
        for _ in range(count):
            work = pg.allreduce(payload)
            work.wait()
        torch.cuda.synchronize(device=device)
        time.sleep(1)

    # Shrink the trace buffer to 10 entries to keep the test fast.
    os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
    pg = self._create_process_group_nccl()
    if timing_enabled:
        pg._enable_collectives_timing()
    device = self.local_device
    self.set_thread_name("fr_test_thread")
    payload = torch.full((3, 4), float(self.rank), device=device)

    # Fill the whole buffer, then reset.
    run_allreduces(10)
    torch._C._distributed_c10d._reset_fr_recording_nccl()

    # Record far fewer entries than the buffer can hold.
    run_allreduces(3)

    # Only the 3 post-reset entries may be reported, not 10.
    trace = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
    self.assertEqual(len(trace["entries"]), 3)

    # Record ids count up from 0 after the reset.
    for expected_id, record in enumerate(trace["entries"]):
        self.assertIn("record_id", record)
        self.assertEqual(record["record_id"], expected_id)

    dist.destroy_process_group()
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
@parametrize("timing_enabled", [True, False])
def test_fr_record_reset_wraparound(self, timing_enabled):
    """
    Record 5 collectives into a 10-entry trace buffer, reset the flight
    recorder, then record 8 more so that writing passes the end of the
    buffer.  The dump must contain exactly the 8 post-reset entries in
    order, with record ids restarting at 0.
    """
    if self.rank == self.MAIN_PROCESS_RANK:
        return

    def run_allreduces(count):
        # Issue ``count`` all-reduces, then sync and pause before dumping.
        for _ in range(count):
            work = pg.allreduce(payload)
            work.wait()
        torch.cuda.synchronize(device=device)
        time.sleep(1)

    # Shrink the trace buffer to 10 entries to keep the test fast.
    os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
    pg = self._create_process_group_nccl()
    if timing_enabled:
        pg._enable_collectives_timing()
    device = self.local_device
    self.set_thread_name("fr_test_thread")
    payload = torch.full((3, 4), float(self.rank), device=device)

    # Half-fill the buffer, then reset mid-buffer.
    run_allreduces(5)
    torch._C._distributed_c10d._reset_fr_recording_nccl()

    # 5 + 8 entries exceed the 10-entry capacity, so these writes wrap around.
    run_allreduces(8)

    # Exactly the 8 post-reset entries are expected.
    trace = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
    self.assertEqual(len(trace["entries"]), 8)

    # Entries come back in recording order: ids 0..7 after the reset.
    for expected_id, record in enumerate(trace["entries"]):
        self.assertIn("profiling_name", record)
        self.assertIn("record_id", record)
        self.assertEqual(record["record_id"], expected_id)

    dist.destroy_process_group()
@requires_nccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
@parametrize("timing_enabled", [True, False])
def test_fr_record_multiple_resets(self, timing_enabled):
    """
    Record batches of 2, 3 and 4 collectives with a flight-recorder reset
    between consecutive batches.  Only the final 4 entries may show up in
    the dump, with record ids restarting at 0.
    """
    if self.rank == self.MAIN_PROCESS_RANK:
        return

    def run_allreduces(count):
        # Issue ``count`` all-reduces, then sync and pause before continuing.
        for _ in range(count):
            work = pg.allreduce(payload)
            work.wait()
        torch.cuda.synchronize(device=device)
        time.sleep(1)

    # Shrink the trace buffer to 10 entries to keep the test fast.
    os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "10"
    pg = self._create_process_group_nccl()
    if timing_enabled:
        pg._enable_collectives_timing()
    device = self.local_device
    self.set_thread_name("fr_test_thread")
    payload = torch.full((3, 4), float(self.rank), device=device)

    # Batch 1 (2 entries), followed by the first reset.
    run_allreduces(2)
    torch._C._distributed_c10d._reset_fr_recording_nccl()

    # Batch 2 (3 entries), followed by the second reset.
    run_allreduces(3)
    torch._C._distributed_c10d._reset_fr_recording_nccl()

    # Batch 3 (4 entries) is the only one that should remain visible.
    run_allreduces(4)
    trace = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace())
    self.assertEqual(len(trace["entries"]), 4)

    # Record ids count up from 0 after the last reset.
    for expected_id, record in enumerate(trace["entries"]):
        self.assertIn("record_id", record)
        self.assertEqual(record["record_id"], expected_id)

    dist.destroy_process_group()
def check_if_test_is_skipped(fn):
def wrapper(self, *args, **kwargs):
@ -6120,6 +6344,14 @@ class ProcessGroupNCCLLargerScaleTest(MultiProcessTestCase):
if self.rank == 6 or self.rank == 7:
dist.broadcast(tensor2, 6, group=ng2)
self.assertEqual(tensor2, torch.full((1,), 6))
# Test the case when the split changes the pg option of split group
# while the parent pg option is not changed.
new_pg = c10d.new_group([0, 1, 2, 3, 4, 5, 6, 7], device_id=device)
backend_new_pg = new_pg._get_backend(torch.device(device))
self.assertEqual(len(backend_new_pg.options.global_ranks_in_group), 8)
c10d.split_group(new_pg, [[0, 2, 4, 6], [1, 3, 5, 7]])
self.assertEqual(len(backend_new_pg.options.global_ranks_in_group), 8)
# a barrier and a cuda sync before destroying all pgs.
dist.barrier(pg)
torch.cuda.synchronize()

View File

@ -499,6 +499,7 @@ class ComposabilityTest(MultiProcContinuousTest):
[
_ComputationType.UNSHARD,
_ComputationType.FORWARD,
_ComputationType.REDUCE_GRAD, # Contains final fsdp post_backward
],
microbatch_index=0,
)

View File

@ -2,6 +2,7 @@
import contextlib
import copy
import functools
import logging
import random
import unittest
from contextlib import contextmanager
@ -51,6 +52,9 @@ from torch.testing._internal.inductor_utils import HAS_GPU
from torch.testing._internal.triton_utils import requires_cuda_and_triton
log = logging.getLogger(__name__)
def reset_rng_state():
torch.manual_seed(1337)
random.seed(1337)
@ -1200,6 +1204,116 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
for r in res[1:]:
self.assertEqual(res[0], r)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@patch.object(torch._dynamo.config, "enable_compiler_collectives", True)
@patch.object(torch._inductor.config, "max_autotune_gemm", True)
@patch.object(torch._inductor.config, "distributed_max_autotune_gemm", True)
def test_multiproc_autotune(self):
    """Compile a three-matmul function on every rank with the patched
    autotune/collective configs and check that all ranks report the same
    number of compilation metrics.
    """
    with _dynamo_dist_per_rank_init(self.rank, self.world_size):
        torch._dynamo.utils.clear_compilation_metrics()

        @torch.compile()
        def f(a, b, c):
            return (
                torch.sum((a @ b) + 1.0)
                + torch.sum(torch.relu(b @ c))
                + torch.sum(c @ a)
            )

        def rand_bf16(rows, cols):
            # Random bfloat16 matrix on this rank's device.
            return torch.randn(rows, cols, device=self.rank, dtype=torch.bfloat16)

        a = rand_bf16(1024, 1024)
        b = rand_bf16(1024, 2048)
        c = rand_bf16(2048, 1024)

        try:
            f(a, b, c)
        except Exception:
            # Log the failure for this rank, then let the test fail.
            log.exception("Caught exception running f")
            raise

        # Gather every rank's metric count and compare against rank 0's.
        metrics = torch._dynamo.utils.get_compilation_metrics()
        counts = [None for _ in range(self.world_size)]
        torch.distributed.all_gather_object(counts, len(metrics))
        for other in counts[1:]:
            self.assertEqual(counts[0], other)

        print(f"Result from {self.rank} is {f(a, b, c)}")
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@patch.object(torch._dynamo.config, "enable_compiler_collectives", True)
@patch.object(torch._inductor.config, "max_autotune_gemm", True)
@patch.object(torch._inductor.config, "distributed_max_autotune_gemm", True)
def test_multiproc_autotune_dynamic_shapes(self):
    """Same scenario as ``test_multiproc_autotune`` but with both dimensions
    of every input marked dynamic; a second call with smaller inputs must not
    change the number of compilation metrics, on any rank.
    """
    with _dynamo_dist_per_rank_init(self.rank, self.world_size):
        torch._dynamo.utils.clear_compilation_metrics()

        @torch.compile()
        def f(a, b, c):
            return (
                torch.sum((a @ b) + 1.0)
                + torch.sum(torch.relu(b @ c))
                + torch.sum(c @ a)
            )

        def rand_bf16(rows, cols):
            # Random bfloat16 matrix on this rank's device.
            return torch.randn(rows, cols, device=self.rank, dtype=torch.bfloat16)

        a = rand_bf16(1024, 1024)
        b = rand_bf16(1024, 2048)
        c = rand_bf16(2048, 1024)

        # Mark both dimensions (0 and 1) of every input as dynamic.
        for tensor in (a, b, c):
            for dim_index in (0, 1):
                torch._dynamo.mark_dynamic(tensor, dim_index)

        try:
            f(a, b, c)
        except Exception:
            # Log the failure for this rank, then let the test fail.
            log.exception("Caught exception running f")
            raise

        # Gather every rank's metric count and compare against rank 0's.
        metrics = torch._dynamo.utils.get_compilation_metrics()
        counts = [None for _ in range(self.world_size)]
        torch.distributed.all_gather_object(counts, len(metrics))
        for other in counts[1:]:
            self.assertEqual(counts[0], other)

        print(f"Result from {self.rank} is {f(a, b, c)}")

        # Remember how many compilations were recorded so far.
        initial_compile_count = len(metrics)

        # Call again with different sizes; no recompilation is expected.
        a2 = rand_bf16(512, 512)
        b2 = rand_bf16(512, 2048)
        c2 = rand_bf16(2048, 512)

        try:
            result2 = f(a2, b2, c2)
            print(f"Result2 from {self.rank} is {result2}")
        except Exception:
            log.exception("Caught exception running f with different sizes")
            raise

        # The metric count must be unchanged after the second call.
        metrics_after = torch._dynamo.utils.get_compilation_metrics()
        final_compile_count = len(metrics_after)
        self.assertEqual(
            initial_compile_count,
            final_compile_count,
            "Expected no recompilation with dynamic shapes",
        )

        # All ranks must agree on the final count as well.
        counts_after = [None for _ in range(self.world_size)]
        torch.distributed.all_gather_object(counts_after, final_compile_count)
        for other in counts_after[1:]:
            self.assertEqual(counts_after[0], other)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
def test_get_pg_attr(self):
with _dynamo_dist_per_rank_init(self.rank, self.world_size):

View File

@ -1985,6 +1985,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
"bucket_reduce_scatters_fx_bucket_size_determinator": lambda _: 2,
"reorder_for_compute_comm_overlap": True,
"reorder_for_compute_comm_overlap_passes": [
_reorder_communication_preserving_peak_memory,
sink_waits_iterative,
_reorder_communication_preserving_peak_memory,
],
@ -2046,11 +2047,6 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
assert node_stats is not None
self.assertTrue(isinstance(node_stats, dict))
self.assertEqual(len(node_stats), 4)
it = iter(node_stats.values())
node_stat0 = next(it)
self.assertTrue(node_stat0.limiting_factor == "None")
node_stat1 = next(it)
self.assertTrue("collective ordering" in node_stat1.limiting_factor)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/1581
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")

View File

@ -128,14 +128,14 @@ class TestLocalTensorWorld2(LocalTensorTestBase):
self.assertEqual(len(result_add._local_tensors), 2)
# Verify the operation was applied to each local tensor
for rank in identical_local_tensors.keys():
for rank in identical_local_tensors:
expected = identical_local_tensors[rank] + identical_local_tensors[rank]
self.assertEqual(result_add._local_tensors[rank], expected)
# Test multiplication
result_mul = lt1 * 2.0
self.assertIsInstance(result_mul, LocalTensor)
for rank in identical_local_tensors.keys():
for rank in identical_local_tensors:
expected = identical_local_tensors[rank] * 2.0
self.assertEqual(result_mul._local_tensors[rank], expected)
@ -163,7 +163,7 @@ class TestLocalTensorWorld2(LocalTensorTestBase):
result = lt + regular_tensor
self.assertIsInstance(result, LocalTensor)
for rank in identical_local_tensors.keys():
for rank in identical_local_tensors:
expected = identical_local_tensors[rank] + regular_tensor
self.assertEqual(result._local_tensors[rank], expected)
@ -212,14 +212,14 @@ class TestLocalTensorWorld2(LocalTensorTestBase):
dist.all_reduce(lt_sum, group=fake_pg)
expected_sum = torch.tensor([[6.0, 8.0], [10.0, 12.0]])
for rank in test_tensors.keys():
for rank in test_tensors:
self.assertEqual(lt_sum._local_tensors[rank], expected_sum)
# Test broadcast within mode
lt_broadcast = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
dist.broadcast(lt_broadcast, src=0, group=fake_pg)
for rank in test_tensors.keys():
for rank in test_tensors:
self.assertEqual(lt_broadcast._local_tensors[rank], test_tensors[0])
# Test that regular operations still work
@ -293,21 +293,21 @@ class TestLocalTensorWorld3(LocalTensorTestBase):
lt_sum = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
dist.all_reduce(lt_sum, op=dist.ReduceOp.SUM, group=fake_pg)
expected_sum = torch.tensor([[6.0, 7.0], [6.0, 15.0]]) # Sum of all tensors
for rank in test_tensors.keys():
for rank in test_tensors:
self.assertEqual(lt_sum._local_tensors[rank], expected_sum)
# Test MAX reduction
lt_max = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
dist.all_reduce(lt_max, op=dist.ReduceOp.MAX, group=fake_pg)
expected_max = torch.tensor([[3.0, 4.0], [3.0, 6.0]]) # Max across all tensors
for rank in test_tensors.keys():
for rank in test_tensors:
self.assertEqual(lt_max._local_tensors[rank], expected_max)
# Test MIN reduction
lt_min = LocalTensor({k: v.clone() for k, v in test_tensors.items()})
dist.all_reduce(lt_min, op=dist.ReduceOp.MIN, group=fake_pg)
expected_min = torch.tensor([[1.0, 1.0], [1.0, 4.0]]) # Min across all tensors
for rank in test_tensors.keys():
for rank in test_tensors:
self.assertEqual(lt_min._local_tensors[rank], expected_min)
def test_all_reduce_collective(self):
@ -328,7 +328,7 @@ class TestLocalTensorWorld3(LocalTensorTestBase):
# Verify all ranks have the sum of all tensors (after adding 1 to each)
expected_sum = torch.tensor([[114.0, 225.0, 336.0], [447.0, 558.0, 669.0]])
for rank in different_tensors.keys():
for rank in different_tensors:
self.assertEqual(lt_sum._local_tensors[rank], expected_sum)
def test_broadcast_collective(self):
@ -348,7 +348,7 @@ class TestLocalTensorWorld3(LocalTensorTestBase):
# Verify all ranks have rank 1's original tensor
expected_broadcast = different_tensors[1]
for rank in different_tensors.keys():
for rank in different_tensors:
self.assertEqual(lt_broadcast._local_tensors[rank], expected_broadcast)
def test_all_gather_collective(self):

View File

@ -17,6 +17,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase
class RunTest(TestCase):
def setUp(self):
super().setUp()
# Save original environment variable if it exists
self.original_signals_env = os.environ.get(
"TORCHELASTIC_SIGNALS_TO_HANDLE", None

View File

@ -25,6 +25,7 @@ class MyClass:
class TestSerialization(TestCase):
def setUp(self) -> None:
super().setUp()
# disable debug asserts
self._old_debug = os.environ.get(DEBUG_ENV)
os.environ[DEBUG_ENV] = "0"

View File

@ -317,6 +317,7 @@ class HashStoreTest(TestCase, StoreTestBase):
class PrefixStoreTest(TestCase):
def setUp(self):
super().setUp()
# delete is false as FileStore will automatically clean up the file
self.file = tempfile.NamedTemporaryFile(delete=False)

View File

@ -283,7 +283,7 @@ class TestCompilerBisector(TestCase):
)
def test_bisect_pre_grad_graph(self):
def f(x):
for i in range(5):
for _ in range(5):
x = x + 1
return x.relu()

Some files were not shown because too many files have changed in this diff Show More