Compare commits

36 Commits

SHA1 Message Date
f2c43efab9 Merge branch 'main' into ci_attn 2025-10-29 16:45:33 -07:00
c1f01d51ef Define attention kernel op name 2025-10-29 11:15:55 -07:00
665df0bc72 Merge remote-tracking branch 'origin/attention_benchmark' into ci_attn 2025-10-28 13:57:16 -07:00
2882e9037b Fix output-json 2025-10-28 13:53:58 -07:00
a58b89df44 Fix linting 2025-10-28 12:54:07 -07:00
c684aa411b Merge remote-tracking branch 'origin/main' into attention_benchmark 2025-10-28 10:20:14 -07:00
836383de99 Merge remote-tracking branch 'origin/attention_benchmark' into ci_attn 2025-10-27 21:16:06 -07:00
fcb1bf53cb Merge remote-tracking branch 'origin/main' into attention_benchmark 2025-10-27 21:14:03 -07:00
df9bd5ce7c Fix linting 2025-10-27 21:09:17 -07:00
c3f01becc0 Removed test changes 2025-10-27 10:00:33 -07:00
56bdd1aa05 Test only safe-backend 2025-10-27 09:03:42 -07:00
d2602f0884 Merge remote-tracking branch 'origin/attention_benchmark' into ci_attn 2025-10-27 09:01:15 -07:00
82b70f9a7f Update safe-backend use 2025-10-27 08:59:52 -07:00
9f2c63259f Remove commented code 2025-10-26 22:03:53 -07:00
6b08e5e075 Merge remote-tracking branch 'origin/attention_benchmark' into ci_attn 2025-10-26 21:41:38 -07:00
2bcd29cc48 Handle nan 2025-10-26 21:36:18 -07:00
f7f410a1f5 Clean fav2 2025-10-26 21:06:24 -07:00
1832d9720f Format config logic now 2025-10-23 14:16:19 -07:00
4069d76684 Add print-config 2025-10-23 13:47:46 -07:00
1d3f285e3f Revert accidental third_party/fbgemm submodule update
Revert fbgemm submodule to match main branch commit.
2025-10-23 10:27:58 -07:00
a1066fc671 Merge branch 'attention_benchmark' into ci_attn 2025-10-23 10:24:58 -07:00
11536e5a6b Update json 2025-10-23 10:21:01 -07:00
09daf4533d Testing small subset 2025-10-21 13:49:25 -07:00
5899e0478d Add more runners 2025-10-21 12:27:04 -07:00
7b98ed8273 Update attention-gym installation 2025-10-21 12:00:52 -07:00
ad118b53d8 Update attention-gym installation 2025-10-21 11:56:45 -07:00
00832e4a45 updates 2025-10-20 09:44:14 -07:00
56e3f97f2a Attention ops in CI 2025-10-20 09:37:31 -07:00
9d8778b8bf Lint and minor fixes 2025-10-20 08:58:32 -07:00
86279c6f25 Merge remote-tracking branch 'origin/main' into attention_benchmark 2025-10-19 16:08:23 -07:00
93553121d8 Update score_mod 2025-10-09 15:30:13 -07:00
e5eb96af95 Add json 2025-10-06 09:52:59 -07:00
4407b6c9e3 Transformer benchmarks 2025-10-06 08:30:52 -07:00
22ea056fcd Update score_mod 2025-09-30 14:00:51 -07:00
d7466fd5c6 Add config files 2025-09-30 10:36:04 -07:00
de61804393 Add attention benchmarking 2025-09-29 09:35:01 -07:00
479 changed files with 6371 additions and 14103 deletions

View File

@ -195,16 +195,13 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
pytorch-linux-jammy-xpu-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
TRITON=yes
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10

View File

@ -3,7 +3,7 @@
set -eux
ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
ACL_VERSION=${ACL_VERSION:-"v25.02"}
ACL_INSTALL_DIR="/acl"
# Clone ACL

View File

@ -40,7 +40,11 @@ EOF
# Default url values
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -

View File

@ -12,8 +12,8 @@ function do_install() {
rocm_version_nodot=${rocm_version//./}
# post merge of https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm"

View File

@ -97,7 +97,7 @@ case ${image} in
manylinux2_28-builder:xpu)
TARGET=xpu_final
GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28"
;;
*)

View File

@ -54,15 +54,12 @@ ENV OPENSSL_DIR /opt/openssl
RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@ -6,7 +6,7 @@ dependencies = [
"GitPython==3.1.45",
"docker==7.1.0",
"pytest==7.3.2",
"uv==0.9.6"
"uv==0.9.5"
]
[tool.setuptools]

View File

@ -1,7 +1,7 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
DESIRED_ROCM ?= 7.1
DESIRED_ROCM ?= 7.0
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma-rocm/build_magma.sh
.PHONY: all
all: magma-rocm71
all: magma-rocm70
all: magma-rocm64
@ -25,11 +24,6 @@ clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-rocm71
magma-rocm71: DESIRED_ROCM := 7.1
magma-rocm71:
$(DOCKER_RUN)
.PHONY: magma-rocm70
magma-rocm70: DESIRED_ROCM := 7.0
magma-rocm70:

View File

@ -6,8 +6,8 @@ set -eou pipefail
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# post merge of https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
# Fetch magma sources and verify checksum
pushd ${PACKAGE_DIR}
git clone https://github.com/icl-utk-edu/magma
git clone https://github.com/jeffdaily/magma
pushd magma
git checkout ${MAGMA_VERSION}
popd

View File

@ -426,7 +426,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
# export test times so that potential sharded tests that'll branch off this build will use consistent data
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
PYTHONPATH=. python tools/stats/export_test_times.py
python tools/stats/export_test_times.py
fi
# don't do this for bazel or s390x or riscv64 as they don't use sccache
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

View File

@ -42,7 +42,7 @@ declare -f -t trap_add
function assert_git_not_dirty() {
# TODO: we should add an option to `build_amd.py` that reverts the repo to
# an unmodified state.
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *xla* ]] && [[ "$BUILD_ENVIRONMENT" != *win* ]] ; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *xla* ]] ; then
git_status=$(git status --porcelain | grep -v '?? third_party' || true)
if [[ $git_status ]]; then
echo "Build left local git repository checkout dirty"

View File

@ -572,8 +572,6 @@ fi
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
@ -667,8 +665,6 @@ test_perf_for_dashboard() {
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
device=xpu
fi
for mode in "${modes[@]}"; do
@ -1663,6 +1659,22 @@ test_operator_microbenchmark() {
done
}
test_attention_microbenchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
TEST_DIR=$(pwd)
# Install attention-gym dependency
echo "Installing attention-gym..."
python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main
pip show triton
cd "${TEST_DIR}"/benchmarks/transformer
$TASKSET python score_mod.py --config configs/config_basic.yaml \
--output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json"
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@ -1720,6 +1732,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
fi
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then
test_attention_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1761,7 +1775,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
else
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
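A minimal local-reproduction sketch (not part of this diff) of the attention microbenchmark step that test_attention_microbenchmark adds above. It assumes a PyTorch checkout with attention-gym already installed and simply mirrors the flags the CI function passes to score_mod.py.

import pathlib
import subprocess

# The CI function computes the report directory before cd'ing into
# benchmarks/transformer, so resolve it to an absolute path here as well.
reports_dir = pathlib.Path("test/test-reports").resolve()
reports_dir.mkdir(parents=True, exist_ok=True)

subprocess.run(
    [
        "python",
        "score_mod.py",
        "--config",
        "configs/config_basic.yaml",
        "--output-json-for-dashboard",
        str(reports_dir / "attention_microbenchmark.json"),
    ],
    cwd="benchmarks/transformer",  # same directory the CI step changes into
    check=True,
)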

View File

@ -31,24 +31,23 @@ if "%USE_XPU%"=="1" (
if errorlevel 1 exit /b 1
)
:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
:: We just need to activate it here
call %INSTALLER_DIR%\activate_miniconda3.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Update CMake
:: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used
call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0 ninja typing-extensions
call pip install -r .ci/docker/requirements-ci.txt
SET CMAKE_LIBRARY_PATH=%PYTHON_PATH%\Library\lib
SET CMAKE_INCLUDE_PATH=%PYTHON_PATH%\Library\include
:: TODO: Move to .ci/docker/requirements-ci.txt
call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Install libuv
curl -k https://s3.amazonaws.com/ossci-windows/libuv-1.40.0-h8ffe710_0.tar.bz2 -o libuv-1.40.0-h8ffe710_0.tar.bz2
7z x -aoa libuv-1.40.0-h8ffe710_0.tar.bz2
tar -xvf libuv-1.40.0-h8ffe710_0.tar -C %PYTHON_PATH%
set libuv_ROOT=%PYTHON_PATH%\Library
:: Override VS env here
pushd .
if "%VC_VERSION%" == "" (

View File

@ -0,0 +1,30 @@
if "%BUILD_ENVIRONMENT%"=="" (
set CONDA_PARENT_DIR=%CD%
) else (
set CONDA_PARENT_DIR=C:\Jenkins
)
set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
:: Be conservative here when rolling out the new AMI with conda. This will try
:: to install conda as before if it couldn't find the conda installation. This
:: can be removed eventually after we gain enough confidence in the AMI
if not exist %CONDA_ROOT_DIR% (
set INSTALL_FRESH_CONDA=1
)
if "%INSTALL_FRESH_CONDA%"=="1" (
curl --retry 3 --retry-all-errors -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
)
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call conda activate py_tmp
call pip install -r .ci/docker/requirements-ci.txt

View File

@ -3,6 +3,19 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol
:: Install Miniconda3
set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers
:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
:: We just need to activate it here
call %INSTALLER_DIR%\activate_miniconda3.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
:: PyTorch is now installed using the standard wheel on Windows into the conda environment.
:: However, the test scripts are still frequently referring to the workspace temp directory
:: build\torch. Rather than changing all these references, making a copy of torch folder
:: from conda to the current workspace is easier. The workspace will be cleaned up after
:: the job anyway
xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
pushd .
if "%VC_VERSION%" == "" (
call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64

View File

@ -40,11 +40,6 @@ fi
# TODO: Move this to .ci/docker/requirements-ci.txt
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0
python -m pip install -r $SCRIPT_PARENT_DIR/../docker/requirements-ci.txt
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -22,10 +22,8 @@ curl --retry 3 -kL "%PYTHON_INSTALLER_URL%" --output python-amd64.exe
if errorlevel 1 exit /b 1
start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_test=0 %ADDITIONAL_OPTIONS% TargetDir=%CD%\Python
if errorlevel 1 exit /b 1
set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build
if errorlevel 1 exit /b 1

View File

@ -27,9 +27,7 @@ runs:
docker system prune -af
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
diskspace_cutoff_int=$((diskspace_cutoff + 0))
difference=$((100 - diskspace_cutoff_int))
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
echo "$msg"
exit 1
else

View File

@ -57,10 +57,16 @@ outputs:
runs:
using: composite
steps:
- name: Install Dependencies
id: install-dependencies
shell: bash
run: |
- uses: nick-fields/retry@v3.0.0
name: Setup dependencies
env:
GITHUB_TOKEN: ${{ inputs.github-token }}
with:
shell: bash
timeout_minutes: 10
max_attempts: 5
retry_wait_seconds: 30
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2

View File

@ -37,6 +37,19 @@ runs:
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
- name: Setup conda
shell: bash
run: |
# Windows conda is baked into the AMI at this location
CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"
{
echo "CONDA=${CONDA}";
echo "CONDA_RUN=${CONDA} run --no-capture-output";
echo "CONDA_BUILD=${CONDA} run conda-build";
echo "CONDA_INSTALL=${CONDA} install";
} >> "${GITHUB_ENV}"
- name: Setup Python3
env:
PYTHON_VERSION: ${{ inputs.python-version }}
@ -44,12 +57,40 @@ runs:
run: |
set +e
set -x
export DESIRED_PYTHON=3.10
echo "DESIRED_PYTHON=3.10" | tee -a "${GITHUB_ENV}"
.ci/pytorch/windows/internal/install_python.bat
echo "PATH=$(pwd)/Python/Scripts;$(pwd)/Python;$(pwd);/usr/bin/;${PATH}" | tee -a "${GITHUB_ENV}"
echo "PYTHON_PATH=$(pwd)/Python" | tee -a "${GITHUB_ENV}"
ln -s "$(pwd)/Python/python.exe" "$(pwd)/Python/python3.exe"
# Create new py_tmp env with python-version
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then
echo "Found Python3 at ${PYTHON3}, adding it into GITHUB_PATH"
PYTHON_PATH=$(dirname "${PYTHON3}")
echo "${PYTHON_PATH}" >> "${GITHUB_PATH}"
else
# According to https://docs.conda.io/en/latest/miniconda.html, we are using the Miniconda3
# installation, which is Python 3 based. Its Python is default to Python 3. Further, there
# is also the Miniconda installation that is Python 2 based, and both can be installed if
# needed. In both cases, Python binary is just called python
PYTHON=$(${CONDA_RUN} -n py_tmp which python)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then
echo "Found Python at ${PYTHON}, set Python3 alias and add it into GITHUB_PATH"
PYTHON3=$(echo "${PYTHON}" | sed "s/python/python3/")
# It's difficult to setup alias across GitHub action steps, so I just add a softlink
# here pointing to Python
ln -s "${PYTHON}" "${PYTHON3}"
PYTHON_PATH=$(dirname "${PYTHON}")
echo "${PYTHON_PATH}" >> "${GITHUB_PATH}"
else
echo "Found no Python using ${CONDA_RUN}"
fi
fi
- name: Get temporary directory used by Windows Python
shell: bash
@ -63,7 +104,6 @@ runs:
continue-on-error: true
shell: powershell
run: |
Set-Alias -Name python3 -Value python
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TMPDIR,"C:\Jenkins\Miniconda3" -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere

View File

@ -28,20 +28,25 @@ runs:
# retry this step several time similar to how checkout-pytorch GHA does
- name: Cleanup workspace
if: always()
uses: nick-fields/retry@v3.0.0
env:
EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }}
shell: bash
run: |
set +e
set -x
with:
shell: bash
timeout_minutes: 5
max_attempts: 3
retry_wait_seconds: 90
command: |
set +e
set -x
if [ -n "${EXTRA_DELETE_DIR}" ]; then
# It's ok to fail to clean up the extra directory on Windows as it only contains
# the build artifacts and doesn't take up much space, i.e. /c/5053411580/build-results
rm -rf "${EXTRA_DELETE_DIR}" || true
fi
if [ -n "${EXTRA_DELETE_DIR}" ]; then
# It's ok to fail to clean up the extra directory on Windows as it only contains
# the build artifacts and doesn't take up much space, i.e. /c/5053411580/build-results
rm -rf "${EXTRA_DELETE_DIR}" || true
fi
rm -rf ./*
rm -rf ./*
- name: Print all processes locking the runner workspace
continue-on-error: true

View File

@ -1 +1 @@
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
69bbe7363897764f9e758d851cd0340147d27f94

View File

@ -19,7 +19,6 @@ ciflow_push_tags:
- ciflow/inductor-perf-test-nightly-rocm-mi300
- ciflow/inductor-perf-test-nightly-rocm-mi355
- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-perf-test-nightly-xpu
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/linux-aarch64
@ -27,7 +26,6 @@ ciflow_push_tags:
- ciflow/nightly
- ciflow/op-benchmark
- ciflow/periodic
- ciflow/periodic-rocm-mi200
- ciflow/periodic-rocm-mi300
- ciflow/pull
- ciflow/quantization-periodic

View File

@ -11,17 +11,11 @@ architectures:
* Latest XPU
"""
import json
import os
import re
from pathlib import Path
from typing import Optional
SCRIPT_DIR = Path(__file__).absolute().parent
REPO_ROOT = SCRIPT_DIR.parent.parent
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
@ -37,7 +31,8 @@ CUDA_ARCHES_CUDNN_VERSION = {
"13.0": "9",
}
ROCM_ARCHES = ["7.0", "7.1"]
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
ROCM_ARCHES = ["6.4", "7.0"]
XPU_ARCHES = ["xpu"]
@ -142,48 +137,9 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
}
# Used by tools/nightly.py
PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
NIGHTLY_SOURCE_MATRIX = {
"cpu": dict(
name="cpu",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
supported_platforms=["Linux", "macOS", "Windows"],
accelerator="cpu",
)
}
CUDA_NIGHTLY_SOURCE_MATRIX = {
f"cuda-{major}.{minor}": dict(
name=f"cuda-{major}.{minor}",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
supported_platforms=["Linux", "Windows"],
accelerator="cuda",
)
for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
}
ROCM_NIGHTLY_SOURCE_MATRIX = {
f"rocm-{major}.{minor}": dict(
name=f"rocm-{major}.{minor}",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
supported_platforms=["Linux"],
accelerator="rocm",
)
for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
}
XPU_NIGHTLY_SOURCE_MATRIX = {
"xpu": dict(
name="xpu",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
supported_platforms=["Linux"],
accelerator="xpu",
)
}
NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
def get_nccl_wheel_version(arch_version: str) -> str:
import re
requirements = map(
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
)
@ -191,14 +147,17 @@ def get_nccl_wheel_version(arch_version: str) -> str:
def read_nccl_pin(arch_version: str) -> str:
nccl_pin_path = (
REPO_ROOT
/ ".ci"
/ "docker"
/ "ci_commit_pins"
/ f"nccl-cu{arch_version[:2]}.txt"
from pathlib import Path
nccl_pin_path = os.path.join(
Path(__file__).absolute().parents[2],
".ci",
"docker",
"ci_commit_pins",
f"nccl-cu{arch_version[:2]}.txt",
)
return nccl_pin_path.read_text().strip()
with open(nccl_pin_path) as f:
return f.read().strip()
def validate_nccl_dep_consistency(arch_version: str) -> None:
@ -206,8 +165,7 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
wheel_ver = get_nccl_wheel_version(arch_version)
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
raise RuntimeError(
f"{arch_version} NCCL release tag version {nccl_release_tag} "
f"does not correspond to wheel version {wheel_ver}"
f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
)
@ -454,14 +412,7 @@ def generate_wheels_matrix(
return ret
arch_version = ""
for arch_version in CUDA_ARCHES:
validate_nccl_dep_consistency(arch_version)
del arch_version
if __name__ == "__main__":
# Used by tools/nightly.py
(SCRIPT_DIR / "nightly_source_matrix.json").write_text(
json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
)
validate_nccl_dep_consistency("13.0")
validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")
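For context on the matrix construction shown earlier in this file's diff, here is a minimal standalone sketch of how a NIGHTLY_SOURCE_MATRIX entry is shaped; CUDA_ARCHES is trimmed to a single example arch, and the final print stands in for the nightly_source_matrix.json file that the __main__ block writes.

import json

PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
CUDA_ARCHES = ["12.8"]  # one arch kept for illustration

# CPU entry plus one generated CUDA entry, mirroring the dict comprehensions above.
nightly_source_matrix = {
    "cpu": dict(
        name="cpu",
        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
        supported_platforms=["Linux", "macOS", "Windows"],
        accelerator="cpu",
    )
}
nightly_source_matrix.update(
    {
        f"cuda-{major}.{minor}": dict(
            name=f"cuda-{major}.{minor}",
            index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
            supported_platforms=["Linux", "Windows"],
            accelerator="cuda",
        )
        for major, minor in (map(int, v.split(".")) for v in CUDA_ARCHES)
    }
)

print(json.dumps(nightly_source_matrix, indent=4))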

View File

@ -103,13 +103,6 @@ jobs:
with:
cuda-version: ${{ inputs.cuda-version }}
# TODO: Move to a requirements.txt file for windows
- name: Install pip dependencies
shell: bash
run: |
set -eu
python3 -m pip install 'xdoctest>=1.1.0'
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
@ -130,8 +123,9 @@ jobs:
if: ${{ !inputs.disable-monitor }}
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
# Windows conda doesn't have python3 binary, only python, but it's python3
${CONDA_RUN} python -m pip install psutil==5.9.8 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download PyTorch Build Artifacts
@ -210,8 +204,7 @@ jobs:
run: |
pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
# shellcheck disable=SC2046,SC2102
python3 -mpip install -vvv $(echo *.whl)[opt-einsum,optree] optree==0.13.0
python3 -mpip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
popd
.ci/pytorch/win-test.sh

View File

@ -38,10 +38,6 @@ on:
default: ""
description: |
List of tests to include (empty string implies default list)
dashboard-tag:
required: false
type: string
default: ""
disable-monitor:
description: |
[Experimental] Disable utilization monitoring for tests.
@ -62,11 +58,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
permissions:
id-token: write
contents: read
@ -205,8 +196,6 @@ jobs:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
# Fetch aws credential from IMDs
@ -257,8 +246,6 @@ jobs:
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e ZE_AFFINITY_MASK \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \

View File

@ -0,0 +1,73 @@
name: attention_op_microbenchmark
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 7 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
# { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
jobs:
attn-microbenchmark-build:
if: github.repository_owner == 'pytorch'
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
{ include: [
{ config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit
attn-microbenchmark-test:
name: attn-microbenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: attn-microbenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
secrets: inherit
# B200 runner
# opmicrobenchmark-build-b200:
# if: github.repository_owner == 'pytorch'
# name: opmicrobenchmark-build-b200
# uses: ./.github/workflows/_linux-build.yml
# with:
# runner: linux.12xlarge.memory
# build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
# docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
# cuda-arch-list: '10.0'
# test-matrix: |
# { include: [
# { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
# ]}
# secrets: inherit
# opmicrobenchmark-test-b200:
# name: opmicrobenchmark-test-b200
# uses: ./.github/workflows/_linux-test.yml
# needs: opmicrobenchmark-build-b200
# with:
# timeout-minutes: 500
# build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
# docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
# test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
# aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
# secrets: inherit

View File

@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@ -52,8 +52,8 @@ jobs:
{ tag: "cuda12.9" },
{ tag: "cuda12.8" },
{ tag: "cuda12.6" },
{ tag: "rocm6.4" },
{ tag: "rocm7.0" },
{ tag: "rocm7.1" },
{ tag: "cpu" },
]
steps:

View File

@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
rocm_version: ["71", "70"]
rocm_version: ["70", "64"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -54,8 +54,8 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },

View File

@ -55,7 +55,7 @@ jobs:
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
rocm_version: "7.1"
rocm_version: "7.0"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda"
rocm_version: ""
@ -159,7 +159,12 @@ jobs:
WITH_CLANG_LDD="--with-clang-ldd"
fi
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
if [[ "${BUILD_DEVICE}" == xpu ]]; then
docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
else
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
fi
if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
docker exec -t "${container_name}" bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"

View File

@ -67,7 +67,6 @@ jobs:
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,

View File

@ -384,6 +384,124 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_4-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_4-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.4
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_4-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -501,121 +619,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_1-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_1-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_1-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_1-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_1-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.1
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_1-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_1-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_1-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large.

View File

@ -1,148 +0,0 @@
name: inductor-perf-nightly-xpu
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-xpu/*
schedule:
- cron: 30 17 * * *
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: false
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
xpu-n-py3_10-inductor-benchmark-build:
name: xpu-n-py3.10-inductor-benchmark
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
]}
secrets: inherit
xpu-n-py3_10-inductor-benchmark-test-nightly:
permissions:
id-token: write
contents: read
if: github.event_name != 'workflow_dispatch'
name: xpu-n-py3.10-inductor-benchmark
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit
xpu-n-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
if: github.event_name == 'workflow_dispatch'
name: xpu-n-py3.10-inductor-test
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -1,84 +0,0 @@
name: periodic-rocm-mi200
on:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi200/*
branches:
- release/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read
target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -204,6 +204,37 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
uses: ./.github/workflows/_linux-build.yml

View File

@ -6,7 +6,6 @@ on:
- pull
- trunk
- periodic
- periodic-rocm-mi200
- periodic-rocm-mi300
- inductor
- unstable

.gitignore
View File

@ -143,7 +143,6 @@ scripts/release_notes/*.json
sccache-stats*.json
lint.json
merge_record.json
.github/scripts/nightly_source_matrix.json
# These files get copied over on invoking setup.py
torchgen/packaged/*

View File

@ -374,7 +374,7 @@ cmake_dependent_option(
"Build the lazy Torchscript backend, not compatible with mobile builds" ON
"NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)

View File

@ -19,13 +19,6 @@ inline namespace CPU_CAPABILITY {
#error "Big endian is not supported."
#endif
// GCC does not properly optimize bf16 operators
#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)
#define BF16_ARITHMETIC_SUPPORTED() 1
#else
#define BF16_ARITHMETIC_SUPPORTED() 0
#endif
// Unlike the float16_t family of types, bfloat16_t is not available
// when we're not targeting bfloat16 hardware support on some
// platforms (but not Mac, so we have to be careful not to shadow the
@ -359,72 +352,18 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
other, &Vectorized<float>::name); \
}
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
Vectorized frac() const;
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
#ifdef __ARM_FEATURE_BF16
// Flip sign bit
Vectorized<c10::BFloat16> neg() const {
return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
}
// Fast reciprocal is fine because we are truncating results
Vectorized<c10::BFloat16> reciprocal() const {
auto x = vcvtq_low_f32_bf16(values);
auto y = vcvtq_high_f32_bf16(values);
x = vrecpeq_f32(x);
y = vrecpeq_f32(y);
return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
}
// Clearing the sign bit
Vectorized<c10::BFloat16> abs() const {
return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
}
#else
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
#endif
// These functions are optimized on clang-21+
#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
Vectorized<c10::BFloat16> operator==(
const Vectorized<c10::BFloat16>& other) const {
return values == other.values;
}
Vectorized<c10::BFloat16> operator!=(
const Vectorized<c10::BFloat16>& other) const {
return values != other.values;
}
Vectorized<c10::BFloat16> operator<(
const Vectorized<c10::BFloat16>& other) const {
return values < other.values;
}
Vectorized<c10::BFloat16> operator<=(
const Vectorized<c10::BFloat16>& other) const {
return values <= other.values;
}
Vectorized<c10::BFloat16> operator>(
const Vectorized<c10::BFloat16>& other) const {
return values > other.values;
}
Vectorized<c10::BFloat16> operator>=(
const Vectorized<c10::BFloat16>& other) const {
return values >= other.values;
}
#else
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)
#endif
#undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
#undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
@ -473,52 +412,28 @@ template <>
Vectorized<c10::BFloat16> inline operator+(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x + y;
#else
return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b);
#endif
}
template <>
Vectorized<c10::BFloat16> inline operator-(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x - y;
#else
return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b);
#endif
}
template <>
Vectorized<c10::BFloat16> inline operator*(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x * y;
#else
return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b);
#endif
}
template <>
Vectorized<c10::BFloat16> inline operator/(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
return x / y;
#else
return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
#endif
}
// frac. Implement this here so we can use subtraction
@ -629,19 +544,12 @@ Vectorized<c10::BFloat16> inline fmadd(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
return x * y + z;
#else
// NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16! Also,
// vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered
// elements, not the bottom and top half, so they don't seem
// particularly useful here. Ideally we would include dot product in
// the Vectorized interface...
return a * b + c;
#endif
}
template <>
@ -649,15 +557,8 @@ Vectorized<c10::BFloat16> inline fnmadd(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
return (-x) * y + z;
#else
// See NOTE [BF16 FMA] above.
return -a * b + c;
#endif
}
template <>
@ -665,15 +566,8 @@ Vectorized<c10::BFloat16> inline fmsub(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
return x * y - z;
#else
// See NOTE [BF16 FMA] above.
return a * b - c;
#endif
}
template <>
@ -681,15 +575,8 @@ Vectorized<c10::BFloat16> inline fnmsub(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
#if BF16_ARITHMETIC_SUPPORTED()
bfloat16x8_t x = a;
bfloat16x8_t y = b;
bfloat16x8_t z = c;
return (-x) * y - z;
#else
// See NOTE [BF16 FMA] above.
return -a * b - c;
#endif
}
#endif // !defined(C10_MOBILE) && defined(__aarch64__)

View File

@ -6,9 +6,9 @@ namespace at::vec {
inline namespace CPU_CAPABILITY {
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
// Enable auto-vectorization for clang-17+
// Enable auto-vectorization for GCC-13+ and clang-17+
// GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
#if defined(__clang__) && (__clang_major__ >= 17)
#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))
template <typename from_type, typename to_type>
inline void convertImpl(

View File

@ -309,7 +309,7 @@ class Vectorized<float> {
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
// Implementation copied from Arm Optimized Routine
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
inline Vectorized<float> vexpq_f32_u20() const {
Vectorized<float> exp_u20() const {
// bail out to sleef if it's a special case:
// i.e. there's an input s.t. |input| > 87.3....
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
@ -348,9 +348,6 @@ class Vectorized<float> {
return vfmaq_f32(scale, poly, scale);
}
Vectorized<float> exp_u20() const {
return vexpq_f32_u20();
}
Vectorized<float> fexp_u20() const {
return exp_u20();
}
@ -637,7 +634,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
// - exp(- x * x)
auto pow_2 = (*this) * (*this);
auto neg_pow_2 = pow_2 ^ neg_zero_vec;
auto tmp4 = neg_pow_2.vexpq_f32_u20();
auto tmp4 = neg_pow_2.exp();
auto tmp5 = tmp4 ^ neg_zero_vec;
// erf(x) = sign(x) * (1 - r * t * exp(- x * x))
auto tmp6 = t * tmp5;
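For reference, the comment's erf(x) = sign(x) * (1 - r * t * exp(-x*x)) has the shape of the classic Abramowitz-Stegun rational approximation; the exact polynomial r(t) and constant p used by this kernel are not visible in this hunk, so the textbook form is shown here rather than the code's coefficients:

\operatorname{erf}(x) \approx 1 - \left(a_1 t + a_2 t^2 + a_3 t^3 + a_4 t^4 + a_5 t^5\right) e^{-x^2},
\qquad t = \frac{1}{1 + p\,x}, \quad x \ge 0,

with \operatorname{erf}(-x) = -\operatorname{erf}(x) supplying the sign(x) factor.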

View File

@ -1,90 +1,78 @@
#include <ATen/cuda/CUDAGreenContext.h>
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#include <stdexcept>
#include <vector>
#define HAS_CUDA_GREEN_CONTEXT() 1
#else
#define HAS_CUDA_GREEN_CONTEXT() 0
// Suppress unused private field warnings as this class is not supposed to be called
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
#endif
namespace at::cuda {
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
#if CUDA_HAS_GREEN_CONTEXT
int driver_version;
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
TORCH_CHECK(
driver_version >= 12080, "cuda driver too old to use green context!");
CUcontext pctx = nullptr;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
if (C10_UNLIKELY(!pctx)) {
TORCH_WARN(
"Attempted to create a green context but"
" there was no primary context! Creating a primary context...");
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
#if HAS_CUDA_GREEN_CONTEXT()
int driver_version;
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
TORCH_CHECK(
driver_version >= 12080, "cuda driver too old to use green context!");
CUcontext pctx = nullptr;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
if (C10_UNLIKELY(!pctx)) {
TORCH_WARN(
"Attempted to create a green context but"
" there was no primary context! Creating a primary context...");
cudaFree(0);
}
cudaFree(0);
}
CUdevice device;
device_id_ = device_id;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
CUdevice device;
device_id_ = device_id;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
// Get device resources
CUdevResource device_resource;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
// Get device resources
CUdevResource device_resource;
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
// Split resources
std::vector<CUdevResource> result(1);
auto result_data = result.data();
unsigned int nb_groups = 1;
CUdevResource remaining;
// Split resources
std::vector<CUdevResource> result(1);
auto result_data = result.data();
unsigned int nb_groups = 1;
CUdevResource remaining;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
result_data,
&nb_groups,
&device_resource,
&remaining,
0, // default flags
num_sms));
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
result_data,
&nb_groups,
&device_resource,
&remaining,
0, // default flags
num_sms));
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
// Generate resource descriptor
CUdevResourceDesc desc;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
&desc, result_data, 1));
// Generate resource descriptor
CUdevResourceDesc desc;
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
&desc, result_data, 1));
// Create green context
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
// Create green context
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
// Convert to regular context
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
// Convert to regular context
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
std::unique_ptr<GreenContext> GreenContext::create(
uint32_t num_sms,
std::optional<uint32_t> device_id) {
#if HAS_CUDA_GREEN_CONTEXT()
#if CUDA_HAS_GREEN_CONTEXT
if (!device_id.has_value()) {
device_id = at::cuda::current_device();
}
return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
return std::make_unique<GreenContext>(device_id.value(), num_sms);
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
@ -92,7 +80,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
// Implement move operations
GreenContext::GreenContext(GreenContext&& other) noexcept{
#if HAS_CUDA_GREEN_CONTEXT()
#if CUDA_HAS_GREEN_CONTEXT
device_id_ = std::exchange(other.device_id_, -1);
green_ctx_ = std::exchange(other.green_ctx_, nullptr);
context_ = std::exchange(other.context_, nullptr);
@ -103,7 +91,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
}
GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
#if HAS_CUDA_GREEN_CONTEXT()
#if CUDA_HAS_GREEN_CONTEXT
if (this != &other) {
// Clean up current resources
if (green_ctx_) {
@ -132,7 +120,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
}
GreenContext::~GreenContext() noexcept{
#if HAS_CUDA_GREEN_CONTEXT()
#if CUDA_HAS_GREEN_CONTEXT
C10_CUDA_DRIVER_CHECK(
c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
#else
@ -140,9 +128,25 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
#endif
}
// Get the underlying CUDA context
CUcontext GreenContext::getContext() const {
#if CUDA_HAS_GREEN_CONTEXT
return context_;
#else
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
// Get the underlying green context
#if CUDA_HAS_GREEN_CONTEXT
CUgreenCtx GreenContext::getGreenContext() const {
return green_ctx_;
}
#endif
// Make this context current
void GreenContext::setContext() {
#if HAS_CUDA_GREEN_CONTEXT()
#if CUDA_HAS_GREEN_CONTEXT
auto current_stream = c10::cuda::getCurrentCUDAStream();
parent_stream_ = current_stream.stream();
@ -171,7 +175,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
}
void GreenContext::popContext() {
#if HAS_CUDA_GREEN_CONTEXT()
#if CUDA_HAS_GREEN_CONTEXT
// see above note about stream being hardcoded to the default stream
at::cuda::CUDAEvent ev;
ev.record(c10::cuda::getCurrentCUDAStream());

View File

@ -1,38 +1,53 @@
#pragma once
#include <ATen/cuda/CUDAEvent.h>
#include <cuda.h>
// Forward declare green context as opaque ptr
typedef struct CUgreenCtx_st* CUgreenCtx;
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#include <cuda.h>
#include <memory>
#include <stdexcept>
#include <vector>
#define CUDA_HAS_GREEN_CONTEXT 1
#else
#define CUDA_HAS_GREEN_CONTEXT 0
#endif
namespace at::cuda {
class TORCH_CUDA_CPP_API GreenContext {
public:
// Green context creation
static std::unique_ptr<GreenContext> create(
uint32_t num_sms,
std::optional<uint32_t> device_id);
~GreenContext() noexcept;
GreenContext(uint32_t device_id, uint32_t num_sms);
static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
// Delete copy constructor and assignment
GreenContext(const GreenContext&) = delete;
GreenContext& operator=(const GreenContext&) = delete;
// Implement move operations
GreenContext(GreenContext&& other) noexcept;
GreenContext& operator=(GreenContext&& other) noexcept;
~GreenContext() noexcept;
// Get the underlying CUDA context
CUcontext getContext() const;
// Get the underlying green context
#if CUDA_HAS_GREEN_CONTEXT
CUgreenCtx getGreenContext() const;
#endif
// Make this context current
void setContext();
void popContext();
private:
GreenContext(uint32_t device_id, uint32_t num_sms);
// Implement move operations
GreenContext(GreenContext&& other) noexcept;
GreenContext& operator=(GreenContext&& other) noexcept;
#if CUDA_HAS_GREEN_CONTEXT
int32_t device_id_ = -1;
CUgreenCtx green_ctx_ = nullptr;
CUcontext context_ = nullptr;
cudaStream_t parent_stream_ = nullptr;
#endif
};
} // namespace at::cuda
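A hypothetical usage sketch based only on the declarations above (the 32-SM split is an arbitrary example and error handling is omitted):

#include <ATen/cuda/CUDAGreenContext.h>
#include <optional>

void run_on_sm_partition() {
  // Carve a green context with 32 SMs out of the current device.
  auto gc = at::cuda::GreenContext::create(/*num_sms=*/32,
                                           /*device_id=*/std::nullopt);
  gc->setContext();  // make the green context current on this thread
  // ... launch kernels that should be confined to the SM partition ...
  gc->popContext();  // restore the previously current context
}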

View File

@ -7,6 +7,17 @@
#endif
#if defined(USE_ROCM)
// hipSparse const API added in v2.4.0
#if HIPSPARSE_VERSION >= 200400
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#else
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#endif
#else // USE_ROCM
#define AT_USE_HIPSPARSE_GENERIC_API() 0
#endif // USE_ROCM
// cuSparse Generic API spsv function was added in CUDA 11.3.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1

View File

@ -1,6 +1,5 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/Device.h>
#include <c10/util/Exception.h>
@ -152,36 +151,6 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
}
virtual bool isAvailable() const override;
/* MTIAGraph related APIs */
virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
FAIL_MTIAHOOKS_FUNC(__func__);
return -1;
}
virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphCaptureEnd(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphInstantiate(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphReplay(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual void mtiagraphReset(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
virtual MempoolId_t mtiagraphPool(int64_t handle) const {
FAIL_MTIAHOOKS_FUNC(__func__);
}
};
struct TORCH_API MTIAHooksArgs {};

View File

@ -534,20 +534,20 @@ Tensor trace_decomp(const Tensor& tensor) {
std::tuple<Tensor, std::optional<int64_t>> tril_batch_rule(
const Tensor& self,
std::optional<int64_t> self_bdim,
c10::SymInt diagonal = 0) {
int64_t diagonal = 0) {
TORCH_CHECK(self.dim() >= 2, "tril: The input tensor must have at least 2 dimensions.");
auto self_ = moveBatchDimToFront(self, self_bdim);
auto result = at::tril_symint(self_, std::move(diagonal));
auto result = at::tril(self_, diagonal);
return std::make_tuple(std::move(result), 0);
}
std::tuple<Tensor, std::optional<int64_t>> triu_batch_rule(
const Tensor& self,
std::optional<int64_t> self_bdim,
c10::SymInt diagonal = 0) {
int64_t diagonal = 0) {
TORCH_CHECK(self.dim() >= 2, "triu: The input tensor must have at least 2 dimensions.");
auto self_ = moveBatchDimToFront(self, self_bdim);
auto result = at::triu_symint(self_, std::move(diagonal));
auto result = at::triu(self_, diagonal);
return std::make_tuple(std::move(result), 0);
}

View File

@ -25,19 +25,18 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d)
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
const int64_t kH = kernel_size[0];
const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1];
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
const int64_t dH = stride.empty() ? kH : stride[0];
const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1];
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"avg_pool2d: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
const int64_t padH = padding[0];
const int64_t padW = padding.size() == 1 ? padH : padding[1];
TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
"divisor must be not zero");

View File

@ -410,8 +410,8 @@ struct ConvParams {
return false;
}
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
// broken on cuDNN 9.8 - 9.14
if (cudnn_version >= 90800 && cudnn_version < 91500) {
// broken on cuDNN 9.8
if (cudnn_version >= 90800) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
weight.dim() == 5) {

View File

@ -139,7 +139,7 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou
}
);
} else {
AT_DISPATCH_ALL_TYPES_AND(kHalf, dtype, "smooth_l1_backward_cpu_out", [&] {
AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
auto norm_val = norm.to<scalar_t>();
scalar_t beta_val(beta);
auto norm_val_vec = Vectorized<scalar_t>(norm_val);

View File

@ -170,14 +170,10 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
#if defined(CUDA_VERSION) || defined(USE_ROCM)
const auto scalar_type = mat1.scalar_type();
return (beta.toComplexDouble() == 1.0
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
// is to use lt interface only when self is bias.
&& self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()
&& result.dim() == 2 && result.is_contiguous()
// Conditions for bias to be fusable
&& (
self.is_contiguous() &&
// NOTE: fine to have 1-len dims to the left from the right-most one
(self.dim() == 1 || self.squeeze().dim() == 1) &&
self.sizes().back() == mat2_sizes[1]
)
&& ( // some dtype restrictions
#ifndef USE_ROCM
scalar_type == at::ScalarType::Double ||

View File

@ -213,9 +213,9 @@ _f4_f4_bf16_grouped_mm_fbgemm(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const std::optional<Tensor>& global_scale_a,
const Tensor& global_scale_a,
const Tensor& scale_b,
const std::optional<Tensor>& global_scale_b,
const Tensor& global_scale_b,
const std::optional<Tensor>& offs,
const std::optional<Tensor>& bias,
Tensor& out) {
@ -225,28 +225,14 @@ _f4_f4_bf16_grouped_mm_fbgemm(
"mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());
TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
"mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());
std::optional<Tensor> combined_global_scale = std::nullopt;
if (global_scale_a.has_value() || global_scale_b.has_value()) {
// NVFP4
TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
"For NVFP4 grouped gemm both of global_scale_{a,b} must have values")
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
"scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
"scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
"global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
"global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
} else {
// MXFP4
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
"scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
"scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
}
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
"scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
"scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
TORCH_CHECK_VALUE(global_scale_a.scalar_type() == at::kFloat,
"global_scale_a must be Float, got: ", global_scale_a.scalar_type());
TORCH_CHECK_VALUE(global_scale_b.scalar_type() == at::kFloat,
"global_scale_b must be Float, got: ", global_scale_b.scalar_type());
auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
mat_a,
@ -255,7 +241,7 @@ _f4_f4_bf16_grouped_mm_fbgemm(
scale_b,
offs.value(),
out,
combined_global_scale
global_scale_a.mul(global_scale_b)
);
#else
TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
@ -485,10 +471,9 @@ namespace {
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 3> scale_grouped_kernel_dispatch = {{
{ "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
{ "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
{ "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};
} // anonymous namespace
@ -614,21 +599,6 @@ _scaled_grouped_mm_cuda_v2(
offs.value(),
out);
}
case ScaledGemmImplementation::MXFP4_MXFP4: {
// scale shape checks
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
return _f4_f4_bf16_grouped_mm_fbgemm(
mat_a,
mat_b,
scale_a[0], /* block-scale A */
std::nullopt, /* global-scale A */
scale_b[0], /* block-scale B */
std::nullopt, /* global-scale B */
offs.value(),
std::nullopt, /* bias */
out);
}
case ScaledGemmImplementation::NVFP4_NVFP4: {
// scale shape checks
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);

View File

@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
if (allow_neg_indices) {
ind = (ind < 0) ? ind + ind_dim_size : ind;
}
CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
if (off >= slice_size) return;
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);

View File

@ -794,24 +794,6 @@ void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const Sc
}
}
void
_check_deepseek_support() {
#ifndef USE_ROCM
auto dprops = at::cuda::getCurrentDeviceProperties();
if (dprops->major != 9) {
// Only on Hopper GPUs
TORCH_CHECK_NOT_IMPLEMENTED(
dprops->major == 9,
"DeepSeek style (1x128, 128x128) scaling only supported in CUDA for SM90")
}
// Only in cublasLt >= 12.9
TORCH_CHECK_NOT_IMPLEMENTED(
CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900,
"DeepSeek style (1x128, 128x128) scaling requires cublasLt >= 12.9"
);
#endif
}
Tensor&
_scaled_block1x128_block1x128(
const Tensor& mat_a, const Tensor& mat_b,
@ -820,12 +802,8 @@ _scaled_block1x128_block1x128(
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
#ifndef USE_ROCM
// Restrictions:
// A, B are FP8, scales are fp32, shape K//128
// CUDA: Only Hopper GPUs
_check_deepseek_support();
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@ -843,12 +821,6 @@ _scaled_block1x128_block1x128(
_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
#else
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"1x128 and 128x128 scaling not available with ROCm"
);
#endif
}
Tensor&
@ -859,12 +831,10 @@ _scaled_block128x128_block1x128(
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
#ifndef USE_ROCM
// Restrictions:
// A, B are FP8, scales are fp32, shape K//128
// CUDA: Only Hopper GPUs
_check_deepseek_support();
std::cout << "mat_b: " << mat_b.dim() << ", " << mat_b.sizes() << ", " << mat_b.strides() << std::endl;
std::cout << "scale_b: " << scale_b.dim() << ", " << scale_b.sizes() << ", " << scale_b.strides() << std::endl;
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
@ -882,12 +852,6 @@ _scaled_block128x128_block1x128(
_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
#else
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"1x128 and 128x128 scaling not available with ROCm"
);
#endif
}
Tensor&
@ -898,12 +862,8 @@ _scaled_block1x128_block128x128(
const c10::ScalarType out_dtype,
const bool use_fast_accum,
Tensor& out) {
#ifndef USE_ROCM
// Restrictions:
// A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
// CUDA: Only Hopper GPUs
_check_deepseek_support();
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(), mat_b.scalar_type());
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@ -921,12 +881,6 @@ _scaled_block1x128_block128x128(
_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
return out;
#else
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"1x128 and 128x128 scaling not available with ROCm"
);
#endif
}
Tensor&

View File

@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
auto offsets = offset_calc.get(i);
int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
&& "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
&& "scatter gather kernel index out of bounds");
f(
(scalar_t*)(self_ptr + offsets[0]),
@ -406,8 +406,9 @@ struct _cuda_scatter_fill_internal_kernel {
auto offsets = offset_calc.get(i);
int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
&& "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
&& "index out of bounds"
);
f(
(scalar_t*)(self_ptr + offsets[0]),

View File

@ -141,8 +141,7 @@ WelfordDataLN cuWelfordOnlineSum(
if constexpr (!rms_norm){
U delta = val - curr_sum.mean;
U new_count = curr_sum.count + 1.f;
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
#else
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
@ -164,8 +163,7 @@ WelfordDataLN cuWelfordCombine(
U count = dataA.count + dataB.count;
U mean, sigma2;
if (count > decltype(dataB.count){0}) {
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
auto coef = __builtin_amdgcn_rcpf(count);
#else
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
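For context, the two snippets above are the streaming and pairwise forms of Welford's algorithm; up to the fast-reciprocal approximation of 1/n they compute

\mu_n = \mu_{n-1} + \frac{x_n - \mu_{n-1}}{n}

for the online update and, for the combine step (Chan et al.), with n = n_A + n_B and \delta = \mu_B - \mu_A,

\mu = \mu_A + \delta\,\frac{n_B}{n}, \qquad
M_2 = M_{2,A} + M_{2,B} + \delta^2\,\frac{n_A n_B}{n},

where M_2 is the running sum of squared deviations (the role played by the kernel's sigma2 before its final normalization).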

View File

@ -86,28 +86,6 @@ struct zeta_functor {
}
};
struct logaddexp_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return c10::metal::logaddexp(a, b);
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::logaddexp(float(a), float(b));
}
};
struct logaddexp2_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
return c10::metal::logaddexp2(a, b);
}
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
inline float operator()(const T a, const T b) {
return c10::metal::logaddexp2(float(a), float(b));
}
};
struct xlog1py_functor {
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
inline T operator()(const T a, const T b) {
@ -399,10 +377,6 @@ REGISTER_FLOAT_BINARY_OP(fmin);
REGISTER_FLOAT_BINARY_OP(nextafter);
REGISTER_FLOAT_BINARY_OP(zeta);
REGISTER_INT2FLOAT_BINARY_OP(zeta);
REGISTER_FLOAT_BINARY_OP(logaddexp);
REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
REGISTER_FLOAT_BINARY_OP(logaddexp2);
REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
REGISTER_FLOAT_BINARY_OP(xlog1py);
REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
@ -489,8 +463,6 @@ REGISTER_BINARY_OP(add, float2, float2);
REGISTER_BINARY_OP(add, half2, half2);
REGISTER_BINARY_OP(sub, float2, float2);
REGISTER_BINARY_OP(sub, half2, half2);
REGISTER_BINARY_OP(logaddexp, float2, float2);
REGISTER_BINARY_OP(logaddexp, half2, half2);
REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);

View File

@ -89,14 +89,6 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "zeta");
}
static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "logaddexp");
}
static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "logaddexp2");
}
static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "xlog1py");
@ -219,8 +211,6 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)

View File

@ -17,6 +17,8 @@
#include <ATen/ops/ge_native.h>
#include <ATen/ops/gt_native.h>
#include <ATen/ops/le_native.h>
#include <ATen/ops/logaddexp2_native.h>
#include <ATen/ops/logaddexp_native.h>
#include <ATen/ops/logical_and_native.h>
#include <ATen/ops/logical_or_native.h>
#include <ATen/ops/logical_xor_native.h>
@ -275,6 +277,30 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
}
}
TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();
MPSGraphTensor* sumTensor =
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
name:nil];
return [mpsGraph logarithmWithTensor:sumTensor name:nil];
};
mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
}
TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();
MPSGraphTensor* sumTensor =
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
name:nil];
return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
};
mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
}
TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
MPSGraph* mpsGraph = cachedGraph->graph();

View File

@ -370,7 +370,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg,
onValue:-1.0f
offValue:0.0f
name:nil];
oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, [inputTensor dataType]);
oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, inputTensor.dataType);
if (isWeightsArrayValid) {
oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
secondaryTensor:weightTensor
@ -705,7 +705,6 @@ static void smooth_l1_loss_template(const Tensor& input,
TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.");
TORCH_CHECK(input.is_mps());
TORCH_CHECK(target.is_mps());
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "MPS doesn't know how to do square_i64");
if ((input.numel() == 0) || (target.numel() == 0)) {
reduction == Reduction::Mean ? output.fill_(std::numeric_limits<float>::quiet_NaN()) : output.zero_();
return;
@ -772,7 +771,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:[inputTensor dataType]];
MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:MPSDataTypeFloat32];
// xn - yn
MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
secondaryTensor:targetTensor
@ -798,8 +797,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
name:@"lossTensor"];
MPSGraphTensor* outputTensor = lossTensor;
if (reduction == Reduction::Mean) {
MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel()
dataType:[lossTensor dataType]];
MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel() dataType:MPSDataTypeFloat32];
outputTensor = [mpsGraph divisionWithPrimaryTensor:lossTensor secondaryTensor:numelTensor name:nil];
}
MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor

View File

@ -84,9 +84,6 @@ std::tuple<Tensor&, Tensor&, Tensor&> batch_norm_mps_out(const Tensor& self,
Tensor& output,
Tensor& save_mean,
Tensor& save_var) {
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "Long batch norm is not supported with MPS");
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()),
"Batch norm for complex is not supported for MPS");
using namespace at::native::mps;
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@ -921,7 +918,6 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int axis = input_ndim - normalized_ndim;
MPSStream* stream = getCurrentMPSStream();
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
@autoreleasepool {
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
// which kernel variant to use based on the normalized axis N size

View File

@ -1028,18 +1028,15 @@ TORCH_IMPL_FUNC(prod_out_mps)
}
TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
}
TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
}
TORCH_IMPL_FUNC(aminmax_out_mps)
(const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
reduction_out_mps(input_t,
dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
keepdim,

View File

@ -31,7 +31,6 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
indices.copy_(values.toType(at::ScalarType::Long));
return;
}
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
// issue #154890, raising error to prevent crash within MPSGraph until
// workaround is implemented.
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");

View File

@ -3622,7 +3622,8 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: logaddexp_out
CPU, CUDA: logaddexp_out
MPS: logaddexp_out_mps
tags: pointwise
- func: logaddexp(Tensor self, Tensor other) -> Tensor
@ -3634,7 +3635,8 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: logaddexp2_out
CPU, CUDA: logaddexp2_out
MPS: logaddexp2_out_mps
tags: pointwise
- func: logaddexp2(Tensor self, Tensor other) -> Tensor
@ -8865,11 +8867,11 @@
autogen: bitwise_right_shift.Scalar_Tensor_out
tags: pointwise
- func: tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
structured_delegate: tril.out
variants: method
- func: triu_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
structured_delegate: triu.out
variants: method
@ -8993,25 +8995,25 @@
- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
variants: method, function
- func: triu.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU: triu_cpu
CUDA: triu_cuda
MPS: triu_mps_out
- func: triu(Tensor self, SymInt diagonal=0) -> Tensor
- func: triu(Tensor self, int diagonal=0) -> Tensor
structured_delegate: triu.out
variants: method, function
- func: tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU: tril_cpu
CUDA: tril_cuda
MPS: tril_mps_out
- func: tril(Tensor self, SymInt diagonal=0) -> Tensor
- func: tril(Tensor self, int diagonal=0) -> Tensor
structured_delegate: tril.out
variants: method, function

View File

@ -467,28 +467,6 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
!options.has_layout() || options.layout() == kSparse,
"expected sparse layout, but got layout ",
options.layout());
if (indices.numel() > 0) {
Tensor min_indices =
std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
Tensor cpu_min_indices;
if (!indices.is_cpu()) {
cpu_min_indices = min_indices.to(at::DeviceType::CPU);
} else {
cpu_min_indices = min_indices;
}
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
for (const auto d : c10::irange(indices.size(0))) {
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
TORCH_CHECK(
min_index_in_dim >= 0,
"found negative index ",
min_index_in_dim,
" for dim ",
d);
}
}
return at::native::_sparse_coo_tensor_unsafe(
indices,
values,

View File

@ -1837,10 +1837,6 @@ class BenchmarkRunner:
def skip_models_for_cuda(self):
return set()
@property
def skip_models_for_xpu(self):
return set()
@property
def skip_models_for_cpu(self):
return set()
@ -3931,8 +3927,6 @@ def run(runner, args, original_dir=None):
runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
elif args.devices == ["cuda"]:
runner.skip_models.update(runner.skip_models_for_cuda)
elif args.devices == ["xpu"]:
runner.skip_models.update(runner.skip_models_for_xpu)
if not args.multiprocess:
runner.skip_models.update(runner.skip_multiprocess_models)

View File

@ -124,10 +124,6 @@ class TorchBenchmarkRunner(BenchmarkRunner):
def skip_models_for_cuda(self):
return self._skip["device"]["cuda"]
@property
def skip_models_for_xpu(self):
return self._skip["device"]["xpu"]
@property
def skip_models_for_freezing_cuda(self):
return self._skip["freezing"]["cuda"]

View File

@ -217,9 +217,6 @@ skip:
cuda: []
xpu:
- *DETECTRON2_MODELS
test:
training:
- *DETECTRON2_MODELS

View File

@ -125,6 +125,17 @@ AttentionType = Literal[
]
DtypeString = Literal["bfloat16", "float16", "float32"]
SpeedupType = Literal["fwd", "bwd"]
# Operator Name mapping
backend_to_operator_name = {
"math": "math attention kernel",
"efficient": "efficient attention kernel",
"cudnn": "cudnn attention kernel",
"fav2": "flash attention 2 kernel",
"fav3": "flash attention 3 kernel",
"fakv": "flash attention kv cache kernel",
"og-eager": "eager attention kernel",
"flex": "flex attention kernel",
}
def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
@ -1265,12 +1276,14 @@ def _output_json_for_dashboard(
model: ModelInfo
metric: MetricInfo
operator_name = backend_to_operator_name.get(backend, backend)
# Benchmark extra info
benchmark_extra_info = {
"input_config": input_config,
"device": device,
"arch": device_arch,
"operator_name": backend,
"operator_name": operator_name,
"attn_type": config.attn_type,
"shape": str(config.shape),
"max_autotune": config.max_autotune,
@ -1288,7 +1301,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
"attn_type": config.attn_type,
},
),
@ -1315,7 +1328,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
},
),
metric=MetricInfo(
@ -1341,7 +1354,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
},
),
metric=MetricInfo(
@ -1371,7 +1384,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
},
),
metric=MetricInfo(

View File

@ -1,4 +1,4 @@
// Implementation of special math functions for Metal
// Implementation of specal math functions for Metal
#pragma once
#include <c10/metal/expm1f.h>
#include <c10/metal/igamma.h>
@ -624,64 +624,6 @@ inline T spherical_bessel_j0(T x) {
return static_cast<T>(::metal::sin(x) / x);
}
template <typename T>
inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
T a,
T b) {
float a0 = static_cast<float>(a);
float b0 = static_cast<float>(b);
if (::metal::isinf(a0) && a0 == b0) {
return static_cast<T>(a0);
} else {
float m0 = ::metal::max(a0, b0);
return static_cast<T>(
m0 + ::c10::metal::log1p(::metal::exp(-::metal::abs(a0 - b0))));
}
}
// The function is ported from mlx
template <typename T>
inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
::metal::isnan(b.y)) {
return T(NAN, NAN);
}
T maxval = a.x > b.x ? a : b;
T minval = a.x < b.x ? a : b;
constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;
if (minval.x == -inf || maxval.x == inf) {
return maxval;
}
float2 maxval_ = static_cast<float2>(maxval);
float2 minval_ = static_cast<float2>(minval);
float m = ::metal::exp(minval_.x - maxval_.x);
float2 dexp{
m * ::metal::cos(minval_.y - maxval_.y),
m * ::metal::sin(minval_.y - maxval_.y),
};
return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
}
template <typename T>
inline T logaddexp2(T a, T b) {
constexpr auto log_2 = float(0.693147180559945309417232121458176);
constexpr auto inv_log_2 = float(1) / log_2;
float a0 = static_cast<float>(a);
float b0 = static_cast<float>(b);
if (::metal::isinf(a0) && a0 == b0) {
return static_cast<T>(a0);
} else {
float m0 = ::metal::max(a0, b0);
return static_cast<T>(
m0 +
::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) *
inv_log_2);
}
}
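// For reference, the two helpers above implement the standard numerically
// stable identities, with m = max(a, b):
//   logaddexp(a, b)  = log(e^a + e^b)   = m + log1p(e^(-|a - b|))
//   logaddexp2(a, b) = log2(2^a + 2^b)  = m + log1p(2^(-|a - b|)) / ln 2
// The early return when a == b == +/-inf avoids the NaN that |a - b| would
// produce from inf - inf.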
template <typename T>
inline float xlog1py(T x, T y) {
if (::metal::isnan(y)) {

View File

@ -322,24 +322,6 @@ inline float log1p(float x) {
return rc;
}
// The function is ported from mlx
inline float2 log1p(float2 in) {
float x = in.x;
float y = in.y;
float zabs = ::metal::precise::sqrt(x * x + y * y);
float theta = ::metal::atan2(y, x + 1);
if (zabs < 0.5f) {
float r = x * (2 + x) + y * y;
if (r == 0) { // handle underflow
return {x, theta};
}
return {0.5f * log1p(r), theta};
} else {
auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);
return {::metal::log(z0), theta};
}
}
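// For reference, with z = x + iy the function above evaluates the principal value
//   log(1 + z) = 0.5 * log((1 + x)^2 + y^2) + i * atan2(y, 1 + x),
// and for |z| < 0.5 rewrites the real part via r = x * (2 + x) + y^2, since
// (1 + x)^2 + y^2 = 1 + r, so 0.5 * log1p(r) avoids cancellation for small |z|.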
template <typename T1, typename T2 = T1>
struct pair {
T1 first;

View File

@ -34,7 +34,7 @@ struct MemEvent {
bool overlaps(const MemBlock& a, const MemBlock& b) {
// two blocks dont overlap if
// |---a--------|--------------b--------|
// start_a end_a <= start_b end_b
// strat_a end_a <= start_b end_b
return !(
(a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}

View File

@ -33,7 +33,7 @@ struct bitset final {
constexpr bitset() noexcept = default;
constexpr bitset(const bitset&) noexcept = default;
constexpr bitset(bitset&&) noexcept = default;
// there is an issue for gcc 5.3.0 when define default function as constexpr
// there is an issure for gcc 5.3.0 when define default function as constexpr
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
bitset& operator=(const bitset&) noexcept = default;
bitset& operator=(bitset&&) noexcept = default;

View File

@ -554,17 +554,6 @@ class DeviceCachingAllocator {
}
}
double getMemoryFraction() {
if (!set_fraction) {
return 1.0;
}
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device_index);
return static_cast<double>(allowed_memory_maximum) /
static_cast<double>(device_prop.global_mem_size);
}
void setMemoryFraction(double fraction) {
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device_index);
@ -735,11 +724,6 @@ class XPUAllocator : public DeviceAllocator {
device_allocators[device]->resetAccumulatedStats();
}
double getMemoryFraction(DeviceIndex device) {
assertValidDevice(device);
return device_allocators[device]->getMemoryFraction();
}
void setMemoryFraction(double fraction, DeviceIndex device) {
assertValidDevice(device);
TORCH_CHECK_VALUE(
@ -793,10 +777,6 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
return allocator.recordStream(dataPtr, stream);
}
double getMemoryFraction(DeviceIndex device) {
return allocator.getMemoryFraction(device);
}
void setMemoryFraction(double fraction, DeviceIndex device) {
return allocator.setMemoryFraction(fraction, device);
}

View File

@ -25,8 +25,6 @@ C10_XPU_API void raw_delete(void* ptr);
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
C10_XPU_API double getMemoryFraction(DeviceIndex device);
C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
} // namespace c10::xpu::XPUCachingAllocator
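A hypothetical usage sketch of the two declarations above (the include path and the 0.6 fraction are illustrative; the fraction caps the caching allocator at that share of the device's global memory):

#include <c10/xpu/XPUCachingAllocator.h>

void cap_xpu_memory(c10::DeviceIndex device) {
  // Let the caching allocator use at most 60% of the device's global memory.
  c10::xpu::XPUCachingAllocator::setMemoryFraction(0.6, device);
  double frac = c10::xpu::XPUCachingAllocator::getMemoryFraction(device);
  (void)frac;  // ~0.6 once set; 1.0 when no fraction has been set
}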

View File

@ -38,7 +38,7 @@ uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);
/// compute CRC32 (bitwise algorithm)
uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);
/// compute CRC32 (half-byte algorithm)
/// compute CRC32 (half-byte algoritm)
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);
#ifdef CRC32_USE_LOOKUP_TABLE_BYTE
@ -96,7 +96,7 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
#define __BIG_ENDIAN 4321
#endif
// define endianness and some integer data types
// define endianess and some integer data types
#if defined(_MSC_VER) || defined(__MINGW32__)
// Windows always little endian
#define __BYTE_ORDER __LITTLE_ENDIAN
@ -168,7 +168,7 @@ namespace
/// zlib's CRC32 polynomial
const uint32_t Polynomial = 0xEDB88320;
/// swap endianness
/// swap endianess
static inline uint32_t swap(uint32_t x)
{
#if defined(__GNUC__) || defined(__clang__)
@ -229,7 +229,7 @@ uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)
}
/// compute CRC32 (half-byte algorithm)
/// compute CRC32 (half-byte algoritm)
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
{
uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
@ -662,7 +662,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)
// - if you append length(B) zeros to A and call it A' (think of it as AAAA000)
// and prepend length(A) zeros to B and call it B' (think of it as 0000BBB)
// then exists a C' = A' ^ B'
// - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
// - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X
// - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')
// - the trick is to compute crc(A') based on crc(A)
// and crc(B') based on crc(B)
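// In symbols (ignoring the init/final-XOR conditioning, under which CRC32 is
// linear over GF(2)): for equal-length messages X and Y,
//   crc(X xor Y) = crc(X) xor crc(Y),
// so with A' = A || 0^{len(B)} and B' = 0^{len(A)} || B,
//   crc(A || B) = crc(A') xor crc(B'),
// and crc(B') = crc(B) because leading zero bytes leave a zero register
// unchanged; the remaining work is advancing crc(A) over len(B) zero bytes.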

View File

@ -76,7 +76,7 @@ typedef struct mz_zip_archive mz_zip_archive;
// 2) Writing with 1-pass sequential access
// -> We must take care not to require updating values that have already
// been written. We place the variable-length index at the end and do
// not put any index into the header to fulfill this constraint.
// not put any indicies into the header to fulfill this constraint.
// The model.json, which contains all the metadata information,
// should be written as the last file. One reason is that the size of tensor

View File

@ -519,7 +519,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoadWithAllocator) {
std::tie(data_ptr, size) = reader.getRecord("key1", &overrideAllocator);
EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes);
// allocate with base allocator
// allcoate with base allocator
std::tie(data_ptr, size) = reader.getRecord("key1");
EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes + kBytes1);

View File

@ -2,9 +2,9 @@
## Overview
The LibTorch Stable ABI (Application Binary Interface) provides a limited interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases. This limited set of APIs is not intended to replace existing LibTorch, but rather to provide a stable foundation for a majority of custom extension use cases. If there is any API you would like to see added to the stable ABI, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).
The LibTorch Stable ABI (Application Binary Interface) provides an interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases.
The limited stable ABI consists of three main components:
The stable ABI consists of three main components:
1. **Stable C headers** - Low-level C API implemented by libtorch (primarily `torch/csrc/inductor/aoti_torch/c/shim.h`)
2. **Header-only C++ library** - Standalone utilities implemented in only headers such that there is no dependence on libtorch (`torch/headeronly/*`)
@ -14,8 +14,8 @@ We discuss each of these in detail
### `torch/headeronly`
The inlined C++ headers living in [`torch/headeronly`](https://github.com/pytorch/pytorch/tree/main/torch/headeronly) are completely decoupled from LibTorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the
`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`, as well as a libtorch-independent version of `TORCH_CHECK` that is `STD_TORCH_CHECK`. You can trust all APIs in the `torch::headeronly` namespace to not depend on `libtorch.so`. These APIs are also globally listed in [torch/header_only_apis.txt](https://github.com/pytorch/pytorch/blob/main/torch/header_only_apis.txt).
This is a set of inlined C++ headers that are completely decoupled from libtorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the
`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`.
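A minimal sketch of the header-only flavor described above; only the macro name is taken from the text, and the include path is an assumption:

// Sketch only: assumes STD_TORCH_CHECK is reachable via this header-only path.
#include <torch/headeronly/util/Exception.h>

void check_positive(long long n) {
  // Behaves like TORCH_CHECK but carries no libtorch.so dependency.
  STD_TORCH_CHECK(n > 0, "expected a positive size");
}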
### `torch/csrc/stable`
@ -34,14 +34,8 @@ We are continuing to improve coverage in our `torch/csrc/stable` APIs. Please fi
### Stable C headers
The stable C headers started by AOTInductor form the foundation of the stable ABI. Presently, the available C headers include:
- [torch/csrc/inductor/aoti_torch/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/c/shim.h): Includes C-style shim APIs for commonly used functionality regarding Tensors, dtypes, CUDA, and the like.
- [torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h): Includes C-style shim APIs for ATen ops from `native_functions.yaml` (e.g. `aoti_torch_aten_new_empty`).
- [torch/csrc/inductor/aoti_torch/generated/c_shim_*.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated): Includes C-style shim APIs for specific backend kernels dispatched from `native_functions.yaml` (e.g. `aoti_torch_cuda_pad`). These APIs should only be used for the specific backend they are named after (e.g. `aoti_torch_cuda_pad` should only be used within CUDA kernels), as they opt out of the dispatcher.
- [torch/csrc/stable/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/stable/c/shim.h): We are building out more ABIs to logically live in `torch/csrc/stable/c` instead of continuing the AOTI naming that no longer makes sense for our general use case.
These headers are promised to be ABI stable across releases and adhere to a stronger backwards compatibility policy than LibTorch. Specifically, we promise not to modify them for at least 2 years after they are released. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs. Further, the stack-based APIs discussed below which allow the user to call into the PyTorch dispatcher do not provide strong guarantees on forward and backward compatibility of the underlying op that is called.
The stable C headers used by AOTInductor form the foundation of the stable ABI. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs.
Further, the stack-based APIs discussed below which allow the user to call the PyTorch dispatcher don't provide strong guarantees on forward and backward compatibility.
Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
which will handle all the rough edges of the C API for the user.

View File

@ -76,7 +76,6 @@
:nosignatures:
empty_cache
get_per_process_memory_fraction
max_memory_allocated
max_memory_reserved
mem_get_info

View File

@ -1106,7 +1106,7 @@ class build_ext(setuptools.command.build_ext.build_ext):
continue
self.copy_file(source_lib, target_lib)
# Delete old rpath and add @loader_lib to the rpath
# This should prevent deallocate from attempting to package another instance
# This should prevent delocate from attempting to package another instance
# of OpenMP library in torch wheel as well as loading two libomp.dylib into
# the address space, as libraries are cached by their unresolved names
install_name_tool_args = [

View File

@ -266,7 +266,7 @@ class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread):
model(inp).sum().backward()
param_names = {param_name for param_name, _ in model.named_parameters()}
self.assertEqual(param_names, set(param_name_to_hook_count.keys()))
for count in param_name_to_hook_count.values():
for param_name, count in param_name_to_hook_count.items():
self.assertEqual(count, 1)

View File

@ -827,7 +827,7 @@ class TestFullyShardShardPlacementFnMultiProcess(FSDPTest):
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type.type)
for _ in range(5):
for iter_idx in range(5):
ref_loss = ref_model(inp).sum()
loss = model(inp).sum()
self.assertEqual(ref_loss, loss)

View File

@ -800,7 +800,6 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_CI):
stderr_redirects={0: stderr_redir},
ret_vals={0: queue},
queue_finished_reading_event=worker_finished_event_mock,
numa_options=None,
)
self.assertEqual("hello_0", queue.get())
if stdout_redir:

View File

@ -31,17 +31,17 @@ if TEST_WITH_DEV_DBG_ASAN:
sys.exit(0)
_DISTRIBUTED_STATE_DICT_IMPLS = {
_DISTRIBUTED_STATE_DICT_IMPLS = (
StateDictType.LOCAL_STATE_DICT,
StateDictType.SHARDED_STATE_DICT,
}
)
class TestDistributedCheckpoint(FSDPTest):
@property
def world_size(self):
if torch.accelerator.is_available():
gpu_cnt = torch.accelerator.device_count()
if torch.cuda.is_available():
gpu_cnt = torch.cuda.device_count()
if gpu_cnt < 2:
return gpu_cnt
return 2
@ -93,9 +93,7 @@ class TestDistributedCheckpoint(FSDPTest):
# TODO: add resharding test case.
devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
TestDistributedCheckpoint, globals(), only_for=devices, allow_xpu=True
)
devices = ("cuda", "hpu")
instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices)
if __name__ == "__main__":
run_tests()

View File

@ -36,8 +36,8 @@ device_type = torch.device(get_devtype())
class TestApply(FSDPTest):
@property
def world_size(self):
if torch.accelerator.is_available():
gpu_cnt = torch.accelerator.device_count()
if torch.cuda.is_available():
gpu_cnt = torch.cuda.device_count()
if gpu_cnt < 2:
return gpu_cnt
return 2

View File

@ -2,6 +2,7 @@
# Owner(s): ["oncall: distributed"]
import sys
from pathlib import Path
import torch
import torch.distributed as dist
@ -44,19 +45,53 @@ class TestInstantiator(TestCase):
self.assertEqual(return_type_str, "Tuple[Tensor, int, str]")
def test_instantiate_scripted_remote_module_template(self):
dir_path = Path(instantiator.INSTANTIATED_TEMPLATE_DIR_PATH)
# Cleanup.
file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
for file_path in file_paths:
file_path.unlink()
# Check before run.
file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
num_files_before = len(list(file_paths))
self.assertEqual(num_files_before, 0)
generated_module = instantiator.instantiate_scriptable_remote_module_template(
MyModuleInterface
)
self.assertTrue(hasattr(generated_module, "_remote_forward"))
self.assertTrue(hasattr(generated_module, "_generated_methods"))
# Check after run.
file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
num_files_after = len(list(file_paths))
self.assertEqual(num_files_after, 1)
def test_instantiate_non_scripted_remote_module_template(self):
dir_path = Path(instantiator.INSTANTIATED_TEMPLATE_DIR_PATH)
# Cleanup.
file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
for file_path in file_paths:
file_path.unlink()
# Check before run.
file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
num_files_before = len(list(file_paths))
self.assertEqual(num_files_before, 0)
generated_module = (
instantiator.instantiate_non_scriptable_remote_module_template()
)
self.assertTrue(hasattr(generated_module, "_remote_forward"))
self.assertTrue(hasattr(generated_module, "_generated_methods"))
# Check after run.
file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")
num_files_after = len(list(file_paths))
self.assertEqual(num_files_after, 1)
if __name__ == "__main__":
run_tests()

View File

@ -64,10 +64,6 @@ class TestDTensorDebugMode(TestCase):
self.assertTrue(isinstance(debug_mode.operators[2], _RedistributeCall))
self.assertEqual(next(iter(debug_mode.operators[1])), torch.ops.aten.mm.default)
# check stringification
self.assertTrue(hasattr(debug_mode.operators[0], "args_str"))
self.assertFalse(hasattr(debug_mode.operators[0], "args"))
def test_debug_string_inside_context(self):
mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
@ -271,7 +267,6 @@ class TestDTensorDebugMode(TestCase):
record_torchfunction=True,
record_faketensor=True,
record_tensor_attributes=["a1", "a2"],
store_original_args=True,
) as debug_mode:
torch.matmul(y, x)
@ -284,9 +279,6 @@ class TestDTensorDebugMode(TestCase):
aten::_unsafe_view(t: f32[64, 8], [8, 8, 8])""",
)
self.assertTrue(hasattr(debug_mode.operators[0], "args"))
self.assertEqual(id(debug_mode.operators[0].args[0]), id(y))
@parametrize("has_inner_mode", [True, False])
@parametrize("has_outer_mode", [True, False])
def test_nested_debug_mode(self, has_inner_mode, has_outer_mode):

View File

@ -20,18 +20,18 @@ from torch.distributed.tensor.experimental._attention import (
_cp_options,
_disable_context_parallel_dispatcher,
_enable_context_parallel_dispatcher,
_HeadTailLoadBalancer,
_is_causal_behavior,
_LoadBalancer,
_PerDocumentHeadTailLoadBalancer,
_PTRRLoadBalancer,
_RotateMethod,
context_parallel,
context_parallel_unshard,
set_rotate_method,
)
from torch.distributed.tensor.experimental._context_parallel._cp_custom_ops import (
flex_cp_allgather,
from torch.distributed.tensor.experimental._cp_custom_ops import flex_cp_allgather
from torch.distributed.tensor.experimental._load_balancer import (
_HeadTailLoadBalancer,
_LoadBalancer,
_PerDocumentHeadTailLoadBalancer,
_PTRRLoadBalancer,
)
from torch.distributed.tensor.parallel import parallelize_module
from torch.nn.attention import sdpa_kernel, SDPBackend
@ -52,9 +52,7 @@ from torch.testing._internal.common_cuda import (
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests, skipIfRocm
from torch.testing._internal.distributed._tensor.common_dtensor import (
create_local_tensor_test_class,
DTensorTestBase,
map_local_tensor_for_rank,
with_comms,
)
@ -802,47 +800,11 @@ class TestSharding(DTensorTestBase):
chunks = freqs_cis.chunk(self.world_size * 2)
self.assertEqual(
freqs_cis_shard,
map_local_tensor_for_rank(
chunks,
self.rank,
lambda chunks, rank: torch.cat(
[chunks[rank], chunks[self.world_size * 2 - rank - 1]],
dim=0,
),
torch.cat(
[chunks[self.rank], chunks[self.world_size * 2 - self.rank - 1]], dim=0
),
)
RingAttentionTestWithLocalTensor = create_local_tensor_test_class(
RingAttentionTest,
skipped_tests=[
# Need to make attention implementation local tensor friendly, e.g.
# rewrite "rank local" logic
"test_ring_attention_sdpa",
],
)
CPFlexAttentionTestWithLocalTensor = create_local_tensor_test_class(
CPFlexAttentionTest,
skipped_tests=[
# Missing support for batched tensors
"test_cp_flex_attention_causal_mask",
"test_cp_flex_attention_document_mask",
],
)
TestCPCustomOpsWithLocalTensor = create_local_tensor_test_class(
TestCPCustomOps,
skipped_tests=[
# Missing support for fake tensors
"test_flex_cp_custom_op",
],
)
TestShardingWithLocalTensor = create_local_tensor_test_class(
TestSharding,
)
if __name__ == "__main__":
run_tests()

View File

@ -16,7 +16,6 @@ from torch.distributed.tensor import (
from torch.nn import functional as F
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
create_local_tensor_test_class,
DTensorTestBase,
skip_if_lt_x_gpu,
with_comms,
@ -204,42 +203,34 @@ class DistConvolutionOpsTest(DTensorTestBase):
self.assertTrue(b_dt.grad is not None)
self.assertTrue(x_dt.grad is None)
def _run_single_arg_fwd(self, model, arg) -> tuple[torch.Tensor, torch.Tensor]:
"""Given model and arg, runs fwd model local and distbuted given device_mesh"""
device_mesh = self.build_device_mesh()
model_copy = copy.deepcopy(model).to(device=self.device_type)
dist_model = distribute_module(model, device_mesh, _conv_fn)
arg_dt = DTensor.from_local(arg, device_mesh, [Replicate()])
out_dt = dist_model(arg_dt.to(device=self.device_type))
out = model_copy(arg)
return (out_dt.full_tensor(), out)
@with_comms
def test_conv1d(self):
device_mesh = self.build_device_mesh()
model = nn.Conv1d(64, 64, 3, padding=1)
x = torch.randn(1, 64, 8, device=self.device_type)
out_dt, out = self._run_single_arg_fwd(model, x)
model_gt = copy.deepcopy(model)
x = torch.randn(1, 64, 8)
x_dt = DTensor.from_local(x, device_mesh, [Replicate()])
model_dt = distribute_module(
model, device_mesh, _conv_fn, input_fn=None, output_fn=None
)
out_dt = model_dt(x_dt)
out = model_gt(x)
self.assertEqual(out_dt.shape, out.shape)
@with_comms
def test_conv3d(self):
device_mesh = self.build_device_mesh()
model = nn.Conv3d(64, 64, 3, padding=1)
model_gt = copy.deepcopy(model).to(device=self.device_type)
x = torch.randn(1, 64, 8, 8, 8, device=self.device_type)
out_dt, out = self._run_single_arg_fwd(model, x)
x_dt = DTensor.from_local(x, device_mesh, [Replicate()])
model_dt = distribute_module(
model, device_mesh, _conv_fn, input_fn=None, output_fn=None
)
out_dt = model_dt(x_dt)
out = model_gt(x)
self.assertEqual(out_dt.shape, out.shape)
DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
DistConvolutionOpsTest,
# Send / recv ops are not supported
skipped_tests=[
"test_conv1d",
"test_conv3d",
"test_conv_backward_none_grad_inp",
"test_depthwise_convolution",
"test_downsampling_convolution",
],
)
if __name__ == "__main__":
run_tests()

View File

@ -520,21 +520,6 @@ class DTensorExportTest(TestCase):
2,
)
def test_union_typed_annotation(self):
def fn(leaf: torch.Tensor | DTensor):
def nest_fn(leaf: torch.Tensor | DTensor):
# def nest_fn(leaf: Union[torch.Tensor, DTensor]): # this works
if isinstance(leaf, DTensor):
leaf = leaf.to_local()
return leaf
return nest_fn(leaf) + 1
z = torch.randn(16, 16)
gm = graph_capture_and_aot_export_joint_with_descriptors(fn, (z,))
self.assertEqual(fn(z), gm(z)[0])
instantiate_parametrized_tests(DTensorExportTest)

View File

@ -943,79 +943,6 @@ class TestComputeCommReorderingBucketing(TestComputeCommReorderingMultiProc):
correct = func(inputs_a, inputs_b, ranks=ranks)
self.assertTrue(same(out, correct))
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@torch._inductor.config.patch(get_bucket_patches())
def test_multidtype_bucketing(self):
"""Test that all_gathers with different dtypes get bucketed together."""
def func(a, b, c, *, ranks):
# Three all_gathers with different dtypes
ag1 = _functional_collectives.all_gather_tensor(a, 0, ranks) # float32
ag2 = _functional_collectives.all_gather_tensor(b, 0, ranks) # float16
ag3 = _functional_collectives.all_gather_tensor(c, 0, ranks) # float16
# Use all results
return ag1.sum() + ag2.sum() + ag3.sum()
with _dynamo_dist_per_rank_init(
self.rank,
self.world_size,
self.backend(device_type),
fake_pg=not at_least_x_gpu(2),
):
a = torch.ones(4, 4, dtype=torch.float32, device=device_type)
b = torch.ones(4, 4, dtype=torch.float16, device=device_type) * 2
c = torch.ones(4, 4, dtype=torch.float16, device=device_type) * 3
ranks = list(range(self.world_size))
func_c = functools.partial(func, ranks=ranks)
compiled = torch.compile(func_c)
out, aten_graph_str = run_and_get_aten_graph(compiled, a, b, c)
# Should have 1 bucketed all_gather despite different dtypes
FileCheck().check_count(
"torch.ops._c10d_functional.wait_tensor.default", 1, exactly=True
).run(aten_graph_str)
# Verify correctness
correct = func(a, b, c, ranks=ranks)
self.assertTrue(same(out, correct))
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@torch._inductor.config.patch(get_bucket_patches())
def test_basic_all_reduce_bucketing(self):
"""Test that independent all_reduce operations get bucketed together."""
def func(a, b, c):
# Three independent all_reduces that should be bucketed
ar1 = _functional_collectives.all_reduce(a, "sum", "0")
ar2 = _functional_collectives.all_reduce(b, "sum", "0")
ar3 = _functional_collectives.all_reduce(c, "sum", "0")
return ar1.sum() + ar2.sum() + ar3.sum()
with _dynamo_dist_per_rank_init(
self.rank,
self.world_size,
self.backend(device_type),
fake_pg=not at_least_x_gpu(2),
):
a = torch.ones(4, 4, dtype=torch.float, device=device_type) + self.rank
b = torch.ones(4, 4, dtype=torch.float, device=device_type) * 2
c = torch.ones(4, 4, dtype=torch.float, device=device_type) * 3
compiled = torch.compile(func)
out, aten_graph_str = run_and_get_aten_graph(compiled, a, b, c)
# Should see a single bucketed all_reduce
FileCheck().check_count(
"torch.ops._c10d_functional.wait_tensor.default", 1, exactly=True
).run(aten_graph_str)
# Verify correctness
correct = func(a, b, c)
self.assertTrue(same(out, correct))
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

View File

@ -195,7 +195,7 @@ if not TEST_WITH_DEV_DBG_ASAN:
for i, t in enumerate(tensors):
self.assertEqual(t, torch.ones(5, 5, device=device) + i)
elif self.rank == 0:
for t in tensors:
for i, t in enumerate(tensors):
zeros = torch.zeros(5, 5, device=device)
self.assertEqual(t, zeros)
y = torch.sum(torch.stack(tensors), axis=0)

View File

@ -1,572 +0,0 @@
# Owner(s): ["module: inductor"]
import unittest
import torch
import torch._dynamo
import torch._dynamo.logging
import torch._dynamo.test_case
import torch.distributed as dist
import torch.fx as fx
# for some reason importing functional collectives after dynamo breaks collectives handling!
from torch._C import FileCheck
from torch._inductor.test_case import TestCase as InductorTestCase
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.common_distributed import requires_accelerator_dist_backend
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
)
from torch.testing._internal.inductor_utils import HAS_GPU
from torch.utils._ordered_set import OrderedSet
# flake8: noqa: B950
# Owner(s): ["module: inductor"]
aten = torch.ops.aten
from torch.testing._internal.common_fsdp import get_devtype
device_type = str(get_devtype())
import torch
import torch._dynamo
import torch._dynamo.logging
import torch._dynamo.test_case
# for some reason importing functional collectives after dynamo breaks collectives handling!
@requires_accelerator_dist_backend(["nccl", "xccl"])
def build_collective_info(graph, hiding_annotations):
"""
Build CollectiveInfo dict from manual hiding annotations.
hiding_annotations: dict mapping collective_start -> hiding_compute_node
"""
from torch._inductor.fx_passes.overlap_scheduling import CollectiveInfo
collective_info = {}
# Find all collective starts and their corresponding waits
start_to_wait = {}
for node in graph.nodes:
if node.op == "call_function" and "wait_tensor" in str(node.target):
wait_input = node.args[0]
if isinstance(wait_input, fx.Node):
start_to_wait[wait_input] = node
# Build CollectiveInfo for each collective
for start_node, wait_node in start_to_wait.items():
hiding_node = hiding_annotations.get(start_node)
# Estimate size and time
size_bytes = 16 * 4 # 4x4 tensor of floats
estimated_time_ms = 1.0 # Dummy time
exposed_time_ms = 0.0 if hiding_node else 1.0 # Hidden if has hiding_node
collective_info[start_node] = CollectiveInfo(
start_node=start_node,
wait_node=wait_node,
size_bytes=size_bytes,
estimated_time_ms=estimated_time_ms,
exposed_time_ms=exposed_time_ms,
hiding_node=hiding_node,
)
return collective_info
def compute_ancestors(graph):
"""Compute ancestor sets for all nodes in the graph."""
node_ancestors = {}
for node in graph.nodes:
ancestors = OrderedSet()
stack = list(node.all_input_nodes)
visited = set()
while stack:
current = stack.pop()
if current in visited:
continue
visited.add(current)
ancestors.add(current)
stack.extend(current.all_input_nodes)
node_ancestors[node] = ancestors
return node_ancestors
@requires_accelerator_dist_backend()
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@instantiate_parametrized_tests
class TestOverlapPreservingBucketing(InductorTestCase):
"""
Unit tests for overlap-preserving bucketing pass.
"""
@classmethod
def setUpClass(cls):
super().setUpClass()
from torch.testing._internal.distributed.fake_pg import FakeStore
store = FakeStore()
dist.init_process_group(backend="fake", rank=0, world_size=2, store=store)
cls.device = "cuda"
@classmethod
def tearDownClass(cls):
super().tearDownClass()
dist.destroy_process_group()
def test_can_bucket_independent_collectives(self):
"""
Test that independent collectives with separate hiding nodes CAN bucket.
Graph structure:
ag1_start -> ag2_start -> mm1 (hides ag1) -> mm2 (hides ag2) -> ag1_wait -> ag2_wait
"""
def func(a, b):
group_name = "0"
group_size = 1
# Start both collectives
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
a, group_size, group_name
)
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
b, group_size, group_name
)
# Independent compute that can hide both
mm1 = torch.mm(a, a)
mm2 = torch.mm(b, b)
# Wait for both
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)
return ag1_out.sum() + ag2_out.sum() + mm1.sum() + mm2.sum()
# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device)
b = torch.ones(4, 4, device=self.device) * 2
# Trace with make_fx
traced = make_fx(func)(a, b)
# Find nodes using find_nodes
ag1, ag2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
)
mm1, mm2 = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)
# Manually annotate hiding relationships
hiding_annotations = {
ag1: mm1, # mm1 hides ag1
ag2: mm2, # mm2 hides ag2
}
# Build collective info and ancestors
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)
# Run bucketing
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)
bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
)
bucketer.bucket_collectives()
# Verify: should have 1 bucketed collective (all_gather_into_tensor_out)
graph_str = str(traced.graph)
FileCheck().check_count("all_gather_into_tensor_out", 1, exactly=False).run(
graph_str
)
def test_cant_bucket_nested_hiding_intervals(self):
"""
Test that nested hiding intervals prevent bucketing.
Graph structure:
ag1_start -> ag2_start -> mm2 (hides ag2) -> ag2_wait -> mm1 (hides ag1) -> ag1_wait
ag2's hiding interval is nested inside ag1's hiding interval.
"""
def func(a, b):
group_name = "0"
group_size = 1
# ag1 starts first
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
a, group_size, group_name
)
# ag2 starts (inside ag1's interval)
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
b, group_size, group_name
)
# mm2 hides ag2
mm2 = torch.mm(b[:2, :2], b[:2, :2])
# ag2 waits (still inside ag1's interval)
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)
# mm1 uses ag2's result and hides ag1
mm1 = torch.mm(a + ag2_out[:4, :4], a)
# ag1 waits last
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
return ag1_out.sum() + ag2_out.sum() + mm1.sum() + mm2.sum()
# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device)
b = torch.ones(4, 4, device=self.device) * 2
# Trace with make_fx
traced = make_fx(func)(a, b)
# Find nodes using find_nodes
ag1, ag2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
)
mm_nodes = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)
# mm2 is the first mm, mm1 is the second (based on graph order)
mm2 = mm_nodes[0]
mm1 = mm_nodes[1]
# Manually annotate hiding relationships
hiding_annotations = {
ag1: mm1, # mm1 hides ag1
ag2: mm2, # mm2 hides ag2
}
# Build collective info and ancestors
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)
# Run bucketing
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)
bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
)
bucketer.bucket_collectives()
# Verify: nested hiding intervals should prevent bucketing
# Should have 2 separate all_gathers, not 1 bucketed one
graph_str = str(traced.graph)
FileCheck().check_count("all_gather_into_tensor", 2, exactly=False).run(
graph_str
)
@parametrize("final_mm_hidden", (True, False))
def test_cant_bucket_ag_with_rs_hiding_interval_between(self, final_mm_hidden):
"""
Test that all_gathers can't bucket when a reduce_scatter's hiding interval is between them.
Graph structure:
ag1_start -> mm1 (hides ag1) -> ag1_wait ->
rs_start -> mm2 (hides rs) -> rs_wait ->
if final_mm_hidden:
ag2_start -> mm3 (hides ag2) -> ag2_wait
if final_mm_hidden:
Bucketing ag1 and ag2 would require moving one of them, which would break hiding relationships:
- Moving ag2 earlier would break ag2's hiding by mm3
- Moving ag1 later would break ag1's hiding by mm1
- The rs hiding interval creates an obstacle between them
otherwise, we can bucket
"""
def func(a, b, c):
group_name = dist.distributed_c10d._get_default_group().group_name
group_size = 1
# First all_gather
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
a, group_size, group_name
)
mm1 = torch.mm(a, a) # hides ag1
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
# Reduce scatter in between
rs = torch.ops._c10d_functional.reduce_scatter_tensor(
b, "sum", group_size, group_name
)
mm2 = torch.mm(b[:4, :4], b[:4, :4]) # hides rs
rs_out = torch.ops._c10d_functional.wait_tensor(rs)
# Second all_gather
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
c, group_size, group_name
)
mm3 = torch.mm(c, c) # hides ag2
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)
return ag1_out.sum() + rs_out.sum() + ag2_out.sum(), mm1, mm2, mm3
# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device)
b = torch.ones(8, 4, device=self.device)
c = torch.ones(4, 4, device=self.device)
# Trace with make_fx
traced = make_fx(func)(a, b, c)
ag1, ag2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
)
(rs,) = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.reduce_scatter_tensor.default,
)
mm1, mm2, mm3 = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)
# Manually annotate hiding relationships
hiding_annotations = {
ag1: mm1, # mm1 hides ag1
# rs: mm2, # mm2 hides rs
ag2: mm3,
}
if final_mm_hidden:
hiding_annotations[rs] = mm2
# Build collective info and ancestors
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)
# Run bucketing logic to find buckets (without applying them, which would require process groups)
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)
bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
)
bucketer.bucket_collectives()
graph_str = str(traced.graph)
# check order of mms preserved
FileCheck().check("%mm").check("%mm_1").check("%mm_2").run(graph_str)
if final_mm_hidden:
# Should NOT bucket - 2 separate all_gathers
# Count all_gather node names (works even when wrapped in control_deps)
FileCheck().check_count("%all_gather_into_tensor", 2, exactly=False).run(
graph_str
)
else:
# Should bucket - 1 bucketed all_gather (all_gather_into_tensor_out)
FileCheck().check_count(
"%all_gather_into_tensor_out", 1, exactly=False
).run(graph_str)
def test_can_bucket_all_reduce(self):
"""
Test that all_reduce operations CAN bucket together.
Graph structure:
ar1_start -> ar2_start -> mm1 (hides ar1) -> mm2 (hides ar2) -> ar1_wait -> ar2_wait
"""
def func(a, b):
group_name = "0"
# Start both all_reduce operations
ar1 = torch.ops._c10d_functional.all_reduce(a, "sum", group_name)
ar2 = torch.ops._c10d_functional.all_reduce(b, "sum", group_name)
# Independent compute that can hide both
mm1 = torch.mm(a, a)
mm2 = torch.mm(b, b)
# Wait for both
ar1_out = torch.ops._c10d_functional.wait_tensor(ar1)
ar2_out = torch.ops._c10d_functional.wait_tensor(ar2)
return ar1_out.sum() + ar2_out.sum() + mm1.sum() + mm2.sum()
# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device)
b = torch.ones(4, 4, device=self.device) * 2
# Trace with make_fx
traced = make_fx(func)(a, b)
# Find nodes
ar1, ar2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_reduce.default,
)
mm1, mm2 = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)
# For all_reduce, start_node == wait_node (no separate wait)
hiding_annotations = {
ar1: mm1,
ar2: mm2,
}
# Build collective info
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)
# Run bucketing
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)
bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
)
bucketer.bucket_collectives()
# Verify: should have 1 bucketed all_reduce
# After bucketing, there should be only one all_reduce node (the bucketed one)
graph_str = str(traced.graph)
FileCheck().check_count("%all_reduce", 1, exactly=True).check_count(
"%mm", 2
).run(graph_str)
def test_can_bucket_multidtype_collectives(self):
"""
Test that all_gathers with different dtypes CAN bucket together.
Graph structure:
ag1_float32 -> mm1 (hides ag1) -> ag1_wait
ag2_bfloat16 -> mm2 (hides ag2) -> ag2_wait
"""
def func(a, b):
group_name = "0"
group_size = 1
# Start both collectives with different dtypes
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
a,
group_size,
group_name, # float32
)
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
b,
group_size,
group_name, # bfloat16
)
# Independent compute that can hide both
mm1 = torch.mm(a, a)
mm2 = torch.mm(b.float(), b.float())
# Wait for both
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)
return ag1_out.sum() + ag2_out.sum() + mm1.sum() + mm2.sum()
# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device, dtype=torch.float32)
b = torch.ones(4, 4, device=self.device, dtype=torch.bfloat16)
# Trace with make_fx
traced = make_fx(func)(a, b)
# Find nodes using find_nodes
ag1, ag2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
)
mm_nodes = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)
mm1 = mm_nodes[0]
mm2 = mm_nodes[1]
# Manually annotate hiding relationships
hiding_annotations = {
ag1: mm1, # mm1 hides ag1
ag2: mm2, # mm2 hides ag2
}
# Build collective info and ancestors
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)
# Run bucketing with multidtype mode
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)
bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
bucket_mode="custom_ops_multidtype",
)
bucketer.bucket_collectives()
# Verify: should have 1 bucketed collective (all_gather_into_tensor_out)
# even though dtypes are different
graph_str = str(traced.graph)
FileCheck().check_count("all_gather_into_tensor_out", 1, exactly=False).run(
graph_str
)
if __name__ == "__main__":
run_tests()

View File

@ -41,20 +41,6 @@ from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.two_tensor import TwoTensor
def aot_eager_regional_inductor():
"""
Regional inductor backend for AOT autograd.
Uses regional_inductor as both forward and backward compiler.
"""
from torch._dynamo.backends.common import aot_autograd
from torch.fx.passes.regional_inductor import regional_inductor
return aot_autograd(
fw_compiler=regional_inductor,
bw_compiler=regional_inductor,
)
def saved_tensors_hooks_to_gm(
pack_fn,
unpack_fn,
@ -1912,171 +1898,6 @@ class AOTAutogradCacheTests(InductorTestCase):
# no recompiles
self.assertFalse(counters)
@inductor_config.patch("fx_graph_remote_cache", False)
@inductor_config.patch("fx_graph_cache", True)
@functorch_config.patch({"enable_autograd_cache": True})
@functorch_config.patch({"bundled_autograd_cache": True})
def test_regional_inductor_basic(self):
"""
Basic test for regional inductor with bundled autograd cache.
Tests that regional inductor compilation results can be cached and hit.
"""
import torch.fx.traceback as fx_traceback
def fn(x, y):
sin = torch.sin(x)
# Mark this region to be compiled with inductor
with fx_traceback.annotate({"compile_with_inductor": 0}):
mul = sin * y
add = mul + 1
return torch.sin(add)
x = torch.randn(10, device="cpu")
y = torch.randn(10, device="cpu")
# Compile with regional inductor backend
compiled_fn = torch.compile(
fn, backend=aot_eager_regional_inductor(), fullgraph=True
)
# First call should miss in cache
result1 = compiled_fn(x, y)
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
# Second call should hit (after clearing dynamo)
self._clear_dynamo_and_codecache()
result2 = compiled_fn(x, y)
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
# Results should be the same
self.assertEqual(result1, result2)
@inductor_config.patch("fx_graph_remote_cache", False)
@inductor_config.patch("fx_graph_cache", True)
@functorch_config.patch({"enable_autograd_cache": True})
@functorch_config.patch({"bundled_autograd_cache": True})
def test_regional_inductor_with_backward(self):
"""
Test regional inductor with backward pass and bundled autograd cache.
Note: Regional inductor triggers multiple AOT autograd compilations:
- One for the outer graph (with regional inductor backend)
- One for each marked region (via standalone_compile)
"""
import torch.fx.traceback as fx_traceback
def fn(x, y):
sin = torch.sin(x)
# Mark this region to be compiled with inductor
with fx_traceback.annotate({"compile_with_inductor": 0}):
mul = sin * y
add = mul + 1
return torch.sin(add)
x = torch.randn(10, requires_grad=True)
y = torch.randn(10, requires_grad=True)
x2 = x.detach().clone().requires_grad_(True)
y2 = y.detach().clone().requires_grad_(True)
# Compile with regional inductor backend
compiled_fn = torch.compile(
fn, backend=aot_eager_regional_inductor(), fullgraph=True
)
# First call: AOT autograd compiles the outer graph (1 miss)
# Regional inductor then compiles the marked region (1 more miss)
result1 = compiled_fn(x, y)
result1.sum().backward()
# We expect 2 cache misses: outer graph + marked region
initial_misses = counters["aot_autograd"]["autograd_cache_miss"]
initial_saves = counters["aot_autograd"]["autograd_cache_saved"]
self.assertGreater(initial_misses, 0)
self.assertGreater(initial_saves, 0)
# Second call should hit (after clearing dynamo)
self._clear_dynamo_and_codecache()
result2 = compiled_fn(x2, y2)
result2.sum().backward()
# Should have cache hits now
final_hits = counters["aot_autograd"]["autograd_cache_hit"]
self.assertGreater(final_hits, 0)
# Cache misses and saves should not increase
self.assertEqual(
counters["aot_autograd"]["autograd_cache_miss"], initial_misses
)
self.assertEqual(
counters["aot_autograd"]["autograd_cache_saved"], initial_saves
)
# Results and gradients should be the same
self.assertEqual(result1, result2)
self.assertEqual(x.grad, x2.grad)
self.assertEqual(y.grad, y2.grad)
@inductor_config.patch("fx_graph_remote_cache", False)
@inductor_config.patch("fx_graph_cache", True)
@functorch_config.patch({"enable_autograd_cache": True})
@functorch_config.patch({"bundled_autograd_cache": True})
def test_regional_inductor_cache_miss_on_change(self):
"""
Test that changing the function causes a cache miss with regional inductor.
Regional inductor creates multiple AOT compilations, so we track
the change in cache misses rather than absolute counts.
"""
import torch.fx.traceback as fx_traceback
def fn1(x, y):
sin = torch.sin(x)
with fx_traceback.annotate({"compile_with_inductor": 0}):
mul = sin * y
add = mul + 1
return torch.sin(add)
def fn2(x, y):
sin = torch.sin(x)
with fx_traceback.annotate({"compile_with_inductor": 0}):
mul = sin * y
add = mul + 2 # Changed from +1 to +2
return torch.sin(add)
x = torch.randn(10)
y = torch.randn(10)
# Compile first function
compiled_fn1 = torch.compile(
fn1, backend=aot_eager_regional_inductor(), fullgraph=True
)
result1 = compiled_fn1(x, y)
first_misses = counters["aot_autograd"]["autograd_cache_miss"]
first_saves = counters["aot_autograd"]["autograd_cache_saved"]
self.assertGreater(first_misses, 0)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
self.assertGreater(first_saves, 0)
# Compile second function (different graph)
self._clear_dynamo_and_codecache()
compiled_fn2 = torch.compile(
fn2, backend=aot_eager_regional_inductor(), fullgraph=True
)
result2 = compiled_fn2(x, y)
# Should miss because graph is different (more misses than before)
self.assertGreater(
counters["aot_autograd"]["autograd_cache_miss"], first_misses
)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
self.assertGreater(
counters["aot_autograd"]["autograd_cache_saved"], first_saves
)
# Results should be different
self.assertNotEqual(result1, result2)
@functorch_config.patch({"bundled_autograd_cache": True})
class AOTAutogradCacheBundledTests(AOTAutogradCacheTests):

View File

@ -4,10 +4,10 @@ import os
from unittest.mock import patch
import torch
from functorch import make_fx
from torch._dynamo import debug_utils
from torch._dynamo.debug_utils import aot_graph_input_parser, generate_env_vars_string
from torch._dynamo.test_case import TestCase
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.common_device_type import instantiate_device_type_tests

View File

@ -2064,23 +2064,6 @@ Detected recompile when torch.compile stance is 'fail_on_recompile'. filename: '
self.assertEqual(f(), 1)
def test_error_on_graph_break_nonempty_checkpoint(self):
cnts = torch._dynamo.testing.CompileCounter()
@torch.compile(backend=cnts)
def fn(x):
x = x + 1
x = x + 1
x = x + 1
with torch._dynamo.error_on_graph_break(True):
torch._dynamo.graph_break()
return x + 1
with self.assertRaises(Unsupported):
fn(torch.ones(3))
self.assertEqual(cnts.frame_count, 0)
def test_nested_compile_fullgraph(self):
# Test that fullgraph=True cannot be toggled back by fullgraph=False
inp = torch.ones(3)

View File

@ -341,7 +341,7 @@ class DictTests(torch._dynamo.test_case.TestCase):
def fn(x, d):
y = 0
for idx, value in enumerate(d.values()):
for idx, (key, value) in enumerate(d.items()):
if idx == 0:
y += torch.sin(x * value)
else:
@ -366,7 +366,7 @@ class DictTests(torch._dynamo.test_case.TestCase):
def fn(x, d):
y = 0
for idx, value in enumerate(d.values()):
for idx, (key, value) in enumerate(d.items()):
if idx == 0:
y += torch.sin(x * value)
else:
@ -847,7 +847,7 @@ class DictTests(torch._dynamo.test_case.TestCase):
d = {"a": 2, "b": 3, "c": 5 * x}
mp = types.MappingProxyType(d)
y = torch.sin(x * mp["a"])
for v in mp.values():
for k, v in mp.items(): # noqa: PERF102
y += torch.cos(x * v)
return mp
@ -864,7 +864,7 @@ class DictTests(torch._dynamo.test_case.TestCase):
def fn(x):
mp = types.MappingProxyType(d)
y = torch.sin(x * mp["a"])
for v in mp.values():
for k, v in mp.items(): # noqa: PERF102
y += torch.cos(x * v)
d["d"] = 4
return mp
@ -885,7 +885,7 @@ class DictTests(torch._dynamo.test_case.TestCase):
def fn(x, mp):
y = torch.sin(x * mp["a"])
for v in mp.values():
for k, v in mp.items(): # noqa: PERF102
y += torch.cos(x * v)
if isinstance(mp, types.MappingProxyType):
y *= 2

Some files were not shown because too many files have changed in this diff.