Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-02 23:15:01 +08:00

Compare commits: 56 commits, v2.5.0-rc1...v2.5.0
| SHA1 |
|---|
| 32f585d934 |
| 417a0763a7 |
| 119e7344d9 |
| 783a6a424c |
| 5375201dff |
| 1de132ec9e |
| 0b1b609ed7 |
| 0b45af9c10 |
| 1a0b166ba2 |
| 3a541ef8c2 |
| f8c4c252ca |
| 8af31b2e49 |
| 8a71edcca5 |
| 058d3de7b9 |
| 17d25897b2 |
| 70298e91f9 |
| 69ed7c7093 |
| d80f521ee2 |
| 57717c8768 |
| 550ed97a89 |
| 051df20ac2 |
| bc421d456e |
| aa574ab7e3 |
| 24bd87d5dd |
| 6101aafa34 |
| 396413f05c |
| c25781c5d2 |
| ecd330669e |
| 1715708183 |
| 2e2c00f74c |
| cbe476a5a7 |
| 4b030d47b1 |
| 9b80ddecd6 |
| 6e86793f75 |
| 7c550fea95 |
| 7a00785c23 |
| 4e6a99e5f3 |
| dd73223b90 |
| c5e5254a79 |
| ffed7b71e8 |
| becdf8ae4f |
| fb276d2652 |
| b7de7932fd |
| 1954439802 |
| 3920988456 |
| 1db2a6562c |
| 4b5bf41476 |
| a889c85498 |
| 813e06461c |
| 9887030485 |
| 9e315fef22 |
| 6b14e6cfdd |
| 828d686e1c |
| 612fc7c447 |
| cea562006e |
| ba27502501 |
@ -1,5 +1,5 @@
-0.6b
+0.7b
 manylinux_2_17
 rocm6.2
-7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
-e4ab195d2bd19e939c675a13280c29714c6ef9f2cf420690da150fa0cac043b1
+9be04068c3c0857a4cfd17d7e39e71d0423ebac2
+3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291
@ -286,18 +286,7 @@ case "$image" in
 TRITON=yes
 ;;
-pytorch-linux-focal-rocm-n-1-py3)
-ANACONDA_PYTHON_VERSION=3.8
-GCC_VERSION=9
-PROTOBUF=yes
-DB=yes
-VISION=yes
-ROCM_VERSION=6.0
-NINJA_VERSION=1.9.0
-CONDA_CMAKE=yes
-TRITON=yes
-;;
 pytorch-linux-focal-rocm-n-py3)
-ANACONDA_PYTHON_VERSION=3.8
+ANACONDA_PYTHON_VERSION=3.10
 GCC_VERSION=9
 PROTOBUF=yes
 DB=yes
@ -307,6 +296,17 @@ case "$image" in
 CONDA_CMAKE=yes
 TRITON=yes
 ;;
+pytorch-linux-focal-rocm-n-py3)
+ANACONDA_PYTHON_VERSION=3.10
+GCC_VERSION=9
+PROTOBUF=yes
+DB=yes
+VISION=yes
+ROCM_VERSION=6.2
+NINJA_VERSION=1.9.0
+CONDA_CMAKE=yes
+TRITON=yes
+;;
 pytorch-linux-jammy-xpu-2024.0-py3)
 ANACONDA_PYTHON_VERSION=3.9
 GCC_VERSION=11
@ -1 +1 @@
-cc981feba10a3f4c2e46f3fe368e8fcf5f5643df
+91b14bf5593cf58a8541f3e6b9125600a867d4ef
@ -1 +1 @@
-757b6a61e7df814ba806f498f8bb3160f84b120c
+5fe38ffd73c2ac6ed6323b554205186696631c6f
@ -4,12 +4,12 @@ set -ex

 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

-TARBALL='aotriton.tar.bz2'
+TARBALL='aotriton.tar.gz'
 # This read command alwasy returns with exit code 1
 read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
 ARCH=$(uname -m)
 AOTRITON_INSTALL_PREFIX="$1"
-AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
+AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"

 cd "${AOTRITON_INSTALL_PREFIX}"
 # Must use -L to follow redirects
@ -10,6 +10,21 @@ if [[ -z $ROCM_VERSION ]]; then
 exit 1;
 fi

+IS_UBUNTU=0
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+ubuntu)
+IS_UBUNTU=1
+;;
+centos)
+IS_UBUNTU=0
+;;
+*)
+echo "Unable to determine OS..."
+exit 1
+;;
+esac
+
 # To make version comparison easier, create an integer representation.
 save_IFS="$IFS"
 IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
@ -58,8 +73,7 @@ MIOPEN_CMAKE_COMMON_FLAGS="
 "
 # Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
 if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
-echo "ROCm 6.2 MIOpen does not need any patches, do not build from source"
-exit 0
+MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
 elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
 echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
 exit 0
@ -93,12 +107,21 @@ else
 exit 1
 fi

-yum remove -y miopen-hip
+if [[ ${IS_UBUNTU} == 1 ]]; then
+apt-get remove -y miopen-hip
+else
+yum remove -y miopen-hip
+fi

 git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
 pushd MIOpen
 # remove .git to save disk space since CI runner was running out
 rm -rf .git
+# Don't build CK to save docker build time
+if [[ $ROCM_INT -ge 60200 ]]; then
+sed -i '/composable_kernel/d' requirements.txt
+fi
 # Don't build MLIR to save docker build time
 # since we are disabling MLIR backend for MIOpen anyway
 if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
@ -111,10 +134,15 @@ cmake -P install_deps.cmake --minimum

 # clean up since CI runner was running out of disk space
 rm -rf /tmp/*
-yum clean all
-rm -rf /var/cache/yum
-rm -rf /var/lib/yum/yumdb
-rm -rf /var/lib/yum/history
+if [[ ${IS_UBUNTU} == 1 ]]; then
+apt-get autoclean && apt-get clean
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+else
+yum clean all
+rm -rf /var/cache/yum
+rm -rf /var/lib/yum/yumdb
+rm -rf /var/lib/yum/history
+fi

 ## Build MIOpen
 mkdir -p build
@ -131,7 +159,11 @@ make -j $(nproc) package
 # clean up since CI runner was running out of disk space
 rm -rf /usr/local/cget

-yum install -y miopen-*.rpm
+if [[ ${IS_UBUNTU} == 1 ]]; then
+sudo dpkg -i miopen-hip*.deb
+else
+yum install -y miopen-*.rpm
+fi

 popd
 rm -rf MIOpen
@ -135,7 +135,7 @@ fi
 )

 GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
-GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_BRANCH_NAME="2.5"
 GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
 DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
 DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
@ -1 +1 @@
-3.0.0
+3.1.0
@ -68,6 +68,8 @@ RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
 ENV ROCM_PATH /opt/rocm
 ENV PATH /opt/rocm/bin:$PATH
 ENV PATH /opt/rocm/hcc/bin:$PATH
@ -121,5 +123,8 @@ RUN bash ./install_cache.sh && rm install_cache.sh
 ARG BUILD_ENVIRONMENT
 ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

+# Install LLVM dev version (Defined in the pytorch/builder github repository)
+COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
+
 USER jenkins
 CMD ["bash"]
@ -49,13 +49,8 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
 fi

 # Enable LLVM dependency for TensorExpr testing
-if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-export USE_LLVM=/opt/rocm/llvm
-export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm
-else
-export USE_LLVM=/opt/llvm
-export LLVM_DIR=/opt/llvm/lib/cmake/llvm
-fi
+export USE_LLVM=/opt/llvm
+export LLVM_DIR=/opt/llvm/lib/cmake/llvm

 if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
 # To build test_edge_op_registration
@ -79,7 +79,7 @@ TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' a
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
 # Only linux Python < 3.13 are supported wheels for triton
 TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
-if [[ -n "$PYTORCH_BUILD_VERSION" ]]; then
+if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
 TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
 TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
 fi
@ -89,7 +89,7 @@ fi
 # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
 TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
-if [[ -n "$PYTORCH_BUILD_VERSION" ]]; then
+if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
 TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
 TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
 fi
@ -103,7 +103,7 @@ fi
 # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
 TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
-if [[ -n "$PYTORCH_BUILD_VERSION" ]]; then
+if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
 TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
 TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
 fi
.github/scripts/generate_ci_workflows.py (vendored, 5 lines changed)
@ -70,6 +70,9 @@ class BinaryBuildWorkflow:
 )
 else:
 self.build_environment = f"{self.os}-binary-{self.package_type}"
+if self.use_split_build:
+# added to distinguish concurrency groups
+self.build_environment += "-split"

 def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
 output_file_path = (
@ -79,7 +82,7 @@ class BinaryBuildWorkflow:
 if self.use_split_build:
 output_file_path = (
 GITHUB_DIR
-/ f"workflows/generated-{self.build_environment}-{self.branches}-split.yml"
+/ f"workflows/generated-{self.build_environment}-{self.branches}"
 )
 with open(output_file_path, "w") as output_file:
 GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
.github/workflows/build-manywheel-images.yml (vendored, 12 lines changed)
@ -27,6 +27,7 @@ env:
 DOCKER_REGISTRY: "docker.io"
 DOCKER_BUILDKIT: 1
 WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+WITH_PUSH_ROCM: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}

 concurrency:
 group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -156,25 +157,26 @@ jobs:
 with:
 submodules: false
 - name: Calculate docker image
-if: env.WITH_PUSH == 'false'
+if: env.WITH_PUSH_ROCM == 'false'
 uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.5
 with:
 docker-image-name: manylinux-builder-rocm${{matrix.rocm_version}}
 docker-build-dir: .ci/docker/manywheel
 always-rebuild: true
 push: true
-- name: Authenticate if WITH_PUSH
-if: env.WITH_PUSH == 'true'
+- name: Authenticate if WITH_PUSH_ROCM
+if: env.WITH_PUSH_ROCM == 'true'
 env:
 DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
 DOCKER_ID: ${{ secrets.DOCKER_ID }}
 run: |
-if [[ "${WITH_PUSH}" == true ]]; then
+if [[ "${WITH_PUSH_ROCM}" == true ]]; then
 echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
 fi
 - name: Build Docker Image
-if: env.WITH_PUSH == 'true'
+if: env.WITH_PUSH_ROCM == 'true'
 run: |
+export WITH_PUSH=true
 .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
 build-docker-cpu:
 environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
.github/workflows/build-triton-wheel.yml (vendored, 306 lines removed)
@ -1,306 +0,0 @@
name: Build Triton wheels

on:
push:
branches:
- main
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
paths:
- .github/workflows/build-triton-wheel.yml
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
pull_request:
paths:
- .github/workflows/build-triton-wheel.yml
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

jobs:
build-wheel:
name: "Build Triton Wheel"
runs-on: [self-hosted, linux.4xlarge]
strategy:
fail-fast: false
matrix:
py_vers: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
device: ["cuda", "rocm", "xpu"]
include:
- device: "rocm"
rocm_version: "6.2"
- device: "cuda"
rocm_version: ""
timeout-minutes: 40
env:
DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux-builder:rocm{0}', matrix.rocm_version) || 'pytorch/manylinux-builder:cpu' }}
PY_VERS: ${{ matrix.py_vers }}
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.5
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}

- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.5
with:
submodules: false

- name: Setup Linux
uses: ./.github/actions/setup-linux

- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.5
with:
docker-image: ${{ env.DOCKER_IMAGE }}

- name: Build Triton wheel
run: |
set -x
mkdir -p "${RUNNER_TEMP}/artifacts/"
container_name=$(docker run \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${DOCKER_IMAGE}" \
)

# Determine python executable for given version
case $PY_VERS in
3.8)
PYTHON_EXECUTABLE=/opt/python/cp38-cp38/bin/python
;;
3.9)
PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
;;
3.10)
PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
;;
3.11)
PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python
;;
3.12)
PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1
;;
esac

docker exec -t "${container_name}" yum install -y zlib-devel zip
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==67.4.0
# Triton xpu build use GCC11
if [[ "${BUILD_DEVICE}" == xpu ]]; then
docker exec -t "${container_name}" yum install -y devtoolset-11-gcc-c++
docker exec -t "${container_name}" bash -c "source /opt/rh/devtoolset-11/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE"
else
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE"
fi
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts

- uses: actions/upload-artifact@v4.4.0
with:
name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/*

- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.5
if: always()

upload-wheel:
runs-on: ubuntu-22.04
needs: build-wheel
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
steps:
- uses: actions/checkout@v3

- name: Configure AWS credentials(PyTorch account) for main
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
aws-region: us-east-1

- name: Configure AWS credentials(PyTorch account) for RC builds
if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }}
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels
aws-region: us-east-1

- name: Download Build Artifacts
uses: actions/download-artifact@v4.1.7
with:
# Download all available artifacts
path: ${{ runner.temp }}/artifacts-all

- name: Select Wheel Artifacts
shell: bash
run: |
set -x
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-wheel-*/* "${RUNNER_TEMP}/artifacts/"

- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |
set -ex

# reference ends with an RC suffix
if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
fi

# NB: This step is gated by DRY_RUN, which is enabled everywhere except main and release branches
- name: Upload binaries
env:
PACKAGE_TYPE: wheel
# The UPLOAD_SUBFOLDER needs to be empty here so that triton wheels are uploaded
# to nightly or test
UPLOAD_SUBFOLDER: ""
PKG_DIR: ${{ runner.temp }}/artifacts
shell: bash
run: |
set -ex
bash .circleci/scripts/binary_upload.sh

build-conda:
name: "Build Triton Conda"
runs-on: [self-hosted, linux.2xlarge]
strategy:
fail-fast: false
matrix:
py_vers: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
timeout-minutes: 40
env:
DOCKER_IMAGE: pytorch/conda-builder:cpu
PY_VERS: ${{ matrix.py_vers }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.5
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}

- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.5
with:
submodules: false

- name: Setup Linux
uses: ./.github/actions/setup-linux

- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.5
with:
docker-image: ${{ env.DOCKER_IMAGE }}

- name: Build Triton conda package
run: |
set -x
mkdir -p "${RUNNER_TEMP}/artifacts/"
container_name=$(docker run \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${DOCKER_IMAGE}" \
)

docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}"
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts

- uses: actions/upload-artifact@v4.4.0
with:
name: pytorch-triton-conda-${{ matrix.py_vers }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/*

- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.5
if: always()

upload-conda:
runs-on: ubuntu-22.04
needs: build-conda
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
steps:
- uses: actions/checkout@v3

- name: Download Build Artifacts
uses: actions/download-artifact@v4.1.7
with:
# Download all available artifacts
path: ${{ runner.temp }}/artifacts-all

- name: Select Conda Artifacts
shell: bash
run: |
set -x
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-conda-*/* "${RUNNER_TEMP}/artifacts/"

- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |
set -ex

# reference ends with an RC suffix
if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
fi

# NB: This step is gated by DRY_RUN, which is enabled everywhere except nightly and release branches
- name: Upload binaries to Anaconda
env:
PACKAGE_TYPE: conda
PKG_DIR: ${{ runner.temp }}/artifacts
# When running these on pull_request events these should be blank
CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
shell: bash
run: |
set -ex

if [[ "${UPLOAD_CHANNEL:-nightly}" == "nightly" ]]; then
export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
else
export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"
fi
bash .circleci/scripts/binary_upload.sh
.github/workflows/create_release.yml (vendored, 2 lines changed)
@ -63,7 +63,7 @@ jobs:
 files: ${{env.PT_RELEASE_FILE}}
 - name: Upload source distribution to GHA artifacts for release tags
 if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
-uses: actions/upload-artifact@v2
+uses: actions/upload-artifact@v4.4.0
 with:
 name: ${{ env.PT_RELEASE_FILE }}
 path: ${{ env.PT_RELEASE_FILE }}
.github/workflows/docker-builds.yml (vendored, 2 lines changed)
@ -103,7 +103,7 @@ jobs:
 - uses: nick-fields/retry@v3.0.0
 name: Push to https://https://ghcr.io/
 id: push-to-ghcr-io
-if: ${{ github.event_name == 'push' }}
+if: ${{ 0 && github.event_name == 'push' }}
 env:
 ECR_DOCKER_IMAGE: ${{ steps.build-docker-image.outputs.docker-image }}
 GHCR_PAT: ${{ secrets.GHCR_PAT }}
@ -2,7 +2,7 @@
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel
|
||||
name: linux-binary-manywheel-split
|
||||
|
||||
|
||||
on:
|
||||
@ -19,7 +19,7 @@ env:
|
||||
ANACONDA_USER: pytorch
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel-split
|
||||
BUILDER_ROOT: /builder
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
@ -28,7 +28,7 @@ env:
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
@ -58,7 +58,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -81,7 +81,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -105,7 +105,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -128,7 +128,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -152,7 +152,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -175,7 +175,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel
|
||||
name: linux-binary-manywheel-split
|
||||
|
||||
|
||||
on:
|
||||
@ -24,7 +24,7 @@ env:
|
||||
ANACONDA_USER: pytorch
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel-split
|
||||
BUILDER_ROOT: /builder
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
@ -33,7 +33,7 @@ env:
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
@ -63,7 +63,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -86,7 +86,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -134,7 +134,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -157,7 +157,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -205,7 +205,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -228,7 +228,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -275,7 +275,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-test: # Testing
|
||||
@ -296,7 +296,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -343,7 +343,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -366,7 +366,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -414,7 +414,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -437,7 +437,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -485,7 +485,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-test: # Testing
|
||||
@ -507,7 +507,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -555,7 +555,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -578,7 +578,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -625,7 +625,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cpu-test: # Testing
|
||||
@ -646,7 +646,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -693,7 +693,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -716,7 +716,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -764,7 +764,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -787,7 +787,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -835,7 +835,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -858,7 +858,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -905,7 +905,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cpu-test: # Testing
|
||||
@ -926,7 +926,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -973,7 +973,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -996,7 +996,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1044,7 +1044,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1067,7 +1067,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1115,7 +1115,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1138,7 +1138,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1185,7 +1185,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cpu-test: # Testing
|
||||
@ -1206,7 +1206,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -1253,7 +1253,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1276,7 +1276,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1324,7 +1324,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1347,7 +1347,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1395,7 +1395,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1418,7 +1418,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1465,7 +1465,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cpu-test: # Testing
|
||||
@ -1486,7 +1486,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
.github/workflows/inductor-rocm.yml (vendored, 18 changes)
@ -22,11 +22,11 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
linux-focal-rocm6_1-py3_8-inductor-build:
|
||||
name: rocm6.1-py3.8-inductor
|
||||
linux-focal-rocm6_2-py3_10-inductor-build:
|
||||
name: rocm6.2-py3.10-inductor
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -34,14 +34,14 @@ jobs:
|
||||
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-inductor-test:
|
||||
linux-focal-rocm6_2-py3_10-inductor-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: rocm6.1-py3.8-inductor
|
||||
name: rocm6.2-py3.10-inductor
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs: linux-focal-rocm6_1-py3_8-inductor-build
|
||||
needs: linux-focal-rocm6_2-py3_10-inductor-build
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-inductor-build.outputs.test-matrix }}
|
||||
|
||||
.github/workflows/lint.yml (vendored, 4 changes)
@ -216,10 +216,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.8
- name: Setup Python 3.9
uses: actions/setup-python@v4
with:
python-version: '3.8'
python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies
.github/workflows/periodic.yml (vendored, 22 changes)
@ -214,7 +214,9 @@ jobs:
|
||||
# TODO: Figure out how to migrate this job to M1 runner
|
||||
ios-build-test:
|
||||
name: ios-build-test
|
||||
if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6' || github.event.schedule == '29 8 * * *'
|
||||
# Has been broken for a while, see https://github.com/pytorch/pytorch/issues/136284
|
||||
# if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6' || github.event.schedule == '29 8 * * *'
|
||||
if: false
|
||||
uses: ./.github/workflows/_ios-build-test.yml
|
||||
with:
|
||||
trigger-event: ${{ github.event_name }}
|
||||
@ -293,13 +295,13 @@ jobs:
|
||||
docker-image: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -308,19 +310,19 @@ jobs:
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
|
||||
|
||||
.github/workflows/pull.yml (vendored, 6 changes)
@ -503,15 +503,15 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
# don't run build twice on main
|
||||
if: github.event_name == 'pull_request'
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
|
||||
.github/workflows/rocm.yml (vendored, 18 changes)
@ -31,11 +31,11 @@ jobs:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
@ -48,16 +48,16 @@ jobs:
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
|
||||
.github/workflows/slow.yml (vendored, 50 changes)
@ -22,6 +22,7 @@ permissions: read-all
|
||||
|
||||
jobs:
|
||||
llm-td:
|
||||
if: false
|
||||
name: before-test
|
||||
uses: ./.github/workflows/llm_td_retrieval.yml
|
||||
permissions:
|
||||
@ -29,6 +30,7 @@ jobs:
|
||||
contents: read
|
||||
|
||||
target-determination:
|
||||
if: false
|
||||
name: before-test
|
||||
uses: ./.github/workflows/target_determination.yml
|
||||
needs: llm-td
|
||||
@ -37,6 +39,7 @@ jobs:
|
||||
contents: read
|
||||
|
||||
get-label-type:
|
||||
if: false
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
@ -46,6 +49,7 @@ jobs:
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build:
|
||||
if: false
|
||||
name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
@ -56,15 +60,18 @@ jobs:
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test:
|
||||
if: false
|
||||
name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
@ -77,6 +84,7 @@ jobs:
|
||||
timeout-minutes: 300
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-sm86-build:
|
||||
if: false
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
@ -87,11 +95,13 @@ jobs:
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "slow", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
|
||||
if: false
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
@ -103,6 +113,7 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-py3_9-clang10-build:
|
||||
if: false
|
||||
name: linux-focal-py3.9-clang10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
@ -117,6 +128,7 @@ jobs:
|
||||
]}
|
||||
|
||||
linux-focal-py3_9-clang10-test:
|
||||
if: false
|
||||
name: linux-focal-py3.9-clang10
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
@ -127,13 +139,14 @@ jobs:
|
||||
docker-image: ${{ needs.linux-focal-py3_9-clang10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-py3_9-clang10-build.outputs.test-matrix }}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
if: false
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -141,21 +154,23 @@ jobs:
|
||||
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
if: false
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
|
||||
linux-jammy-py3_10-clang15-asan-build:
|
||||
if: false
|
||||
name: linux-jammy-py3.10-clang15-asan
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
@ -173,6 +188,7 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_10-clang15-asan-test:
|
||||
if: false
|
||||
name: linux-jammy-py3.10-clang15-asan
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
|
||||
.github/workflows/trunk.yml (vendored, 18 changes)
@ -223,13 +223,13 @@ jobs:
|
||||
cuda-version: "12.1"
|
||||
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
|
||||
linux-focal-rocm6_1-py3_8-build:
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
linux-focal-rocm6_2-py3_10-build:
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
@ -240,19 +240,19 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-rocm6_1-py3_8-test:
|
||||
linux-focal-rocm6_2-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-focal-rocm6.1-py3.8
|
||||
name: linux-focal-rocm6.2-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-focal-rocm6_1-py3_8-build
|
||||
- linux-focal-rocm6_2-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-rocm6.2-py3.10
|
||||
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
|
||||
|
||||
linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
|
||||
|
||||
README.md (59 changes)
@ -27,8 +27,8 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
|
||||
- [NVIDIA CUDA Support](#nvidia-cuda-support)
|
||||
- [AMD ROCm Support](#amd-rocm-support)
|
||||
- [Intel GPU Support](#intel-gpu-support)
|
||||
- [Install Dependencies](#install-dependencies)
|
||||
- [Get the PyTorch Source](#get-the-pytorch-source)
|
||||
- [Install Dependencies](#install-dependencies)
|
||||
- [Install PyTorch](#install-pytorch)
|
||||
- [Adjust Build Options (Optional)](#adjust-build-options-optional)
|
||||
- [Docker Image](#docker-image)
|
||||
@ -161,9 +161,34 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
|
||||
#### Prerequisites
|
||||
If you are installing from source, you will need:
|
||||
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
|
||||
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required)
|
||||
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
|
||||
- Visual Studio or Visual Studio Build Tool on Windows
|
||||
|
||||
We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
|
||||
\* PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
|
||||
Professional, or Community Editions. You can also install the build tools from
|
||||
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
|
||||
come with Visual Studio Code by default.
|
||||
|
||||
\* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
|
||||
|
||||
An example of environment setup is shown below:
|
||||
|
||||
* Linux:
|
||||
|
||||
```bash
|
||||
$ source <CONDA_INSTALL_DIR>/bin/activate
|
||||
$ conda create -y -n <CONDA_NAME>
|
||||
$ conda activate <CONDA_NAME>
|
||||
```
|
||||
|
||||
* Windows:
|
||||
|
||||
```bash
|
||||
$ source <CONDA_INSTALL_DIR>\Scripts\activate.bat
|
||||
$ conda create -y -n <CONDA_NAME>
|
||||
$ conda activate <CONDA_NAME>
|
||||
$ call "C:\Program Files\Microsoft Visual Studio\<VERSION>\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
```
|
||||
|
||||
##### NVIDIA CUDA Support
|
||||
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
|
||||
@ -194,12 +219,23 @@ If you want to compile with Intel GPU support, follow these
|
||||
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
|
||||
Other potentially useful environment variables may be found in `setup.py`.
|
||||
|
||||
#### Get the PyTorch Source
|
||||
```bash
|
||||
git clone --recursive https://github.com/pytorch/pytorch
|
||||
cd pytorch
|
||||
# if you are updating an existing checkout
|
||||
git submodule sync
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
#### Install Dependencies
|
||||
|
||||
**Common**
|
||||
|
||||
```bash
|
||||
conda install cmake ninja
|
||||
# Run this command on native Windows
|
||||
conda install rust
|
||||
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
@ -235,15 +271,6 @@ pip install mkl-static mkl-include
|
||||
conda install -c conda-forge libuv=1.39
|
||||
```
|
||||
|
||||
#### Get the PyTorch Source
|
||||
```bash
|
||||
git clone --recursive https://github.com/pytorch/pytorch
|
||||
cd pytorch
|
||||
# if you are updating an existing checkout
|
||||
git submodule sync
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
#### Install PyTorch
|
||||
**On Linux**
|
||||
|
||||
@ -284,13 +311,6 @@ python3 setup.py develop
|
||||
|
||||
**On Windows**
|
||||
|
||||
Choose Correct Visual Studio Version.
|
||||
|
||||
PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
|
||||
Professional, or Community Editions. You can also install the build tools from
|
||||
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
|
||||
come with Visual Studio Code by default.
|
||||
|
||||
If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
|
||||
|
||||
**CPU-only builds**
|
||||
@ -298,7 +318,6 @@ If you want to build legacy python code, please refer to [Building on legacy cod
|
||||
In this mode PyTorch computations will run on your CPU, not your GPU
|
||||
|
||||
```cmd
|
||||
conda activate
|
||||
python setup.py develop
|
||||
```
|
||||
|
||||
|
||||
RELEASE.md (20 changes)
@ -48,16 +48,16 @@

Following is the Release Compatibility Matrix for PyTorch releases:

Removed rows (old matrix):

| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |

Added rows (new matrix, with a C++ standard column):

| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | C++17 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
| 2.1 | >=3.8, <=3.11 | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
| 2.0 | >=3.8, <=3.11 | C++14 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
| 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |
|
||||
|
||||
## Release Cadence
|
||||
|
||||
|
||||
@ -145,6 +145,14 @@ void Context::setSDPUseMath(bool e) {
|
||||
enabled_mathSDP = e;
|
||||
}
|
||||
|
||||
bool Context::allowFP16BF16ReductionMathSDP() const {
|
||||
return allow_fp16_bf16_reduction_mathSDP;
|
||||
}
|
||||
|
||||
void Context::setAllowFP16BF16ReductionMathSDP(bool e) {
|
||||
allow_fp16_bf16_reduction_mathSDP = e;
|
||||
}
|
||||
|
||||
bool Context::userEnabledCuDNNSDP() const {
|
||||
return enabled_cudnnSDP;
|
||||
}
|
||||
|
||||
@ -234,6 +234,9 @@ class TORCH_API Context {
|
||||
void setSDPUseCuDNN(bool);
|
||||
bool userEnabledCuDNNSDP() const;
|
||||
|
||||
void setAllowFP16BF16ReductionMathSDP(bool);
|
||||
bool allowFP16BF16ReductionMathSDP() const;
|
||||
|
||||
void setSDPUseOverrideable(bool);
|
||||
bool userEnabledOverrideableSDP() const;
|
||||
|
||||
@ -390,6 +393,7 @@ class TORCH_API Context {
|
||||
bool enabled_mathSDP = true;
|
||||
bool enabled_cudnnSDP = true;
|
||||
bool enabled_overrideable = true;
|
||||
bool allow_fp16_bf16_reduction_mathSDP = false;
|
||||
#ifdef USE_ROCM
|
||||
bool benchmark_cudnn = true;
|
||||
#else
|
||||
|
||||
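The hunks above add a Context toggle, `allow_fp16_bf16_reduction_mathSDP`, defaulting to `false`; as the `_scaled_dot_product_attention_math` change later in this diff shows, the math SDPA backend keeps upcasting fp16/bf16 inputs to fp32 unless the flag is enabled. A minimal C++ sketch of driving the new getter/setter pair; the include and the surrounding function are illustrative assumptions, only the two Context methods come from the diff:

```cpp
#include <ATen/ATen.h>

// Sketch: opt in to reduced-precision accumulation in the math SDPA backend.
// By default the flag is false, so kHalf/kBFloat16 inputs are upcast to fp32
// before the attention matmuls and softmax.
void toggle_math_sdp_reduction_example() {
  auto& ctx = at::globalContext();
  const bool previous = ctx.allowFP16BF16ReductionMathSDP();  // false by default

  ctx.setAllowFP16BF16ReductionMathSDP(true);   // keep the reduction in fp16/bf16
  // ... call at::scaled_dot_product_attention(...) with half-precision inputs ...

  ctx.setAllowFP16BF16ReductionMathSDP(previous);  // restore the prior setting
}
```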
@ -128,10 +128,26 @@ inline bool _check_tensors_share_device_and_dtype(
|
||||
// corresponding tensors in tensor lists have the same sizes and strides.
|
||||
inline bool _check_tensors_share_sizes_and_strides(
|
||||
ArrayRef<TensorList> tensorLists) {
|
||||
auto is_diff_stride = [](const IntArrayRef& size,
|
||||
const IntArrayRef& left_stride,
|
||||
const IntArrayRef& right_stride) -> bool {
|
||||
const size_t size_size = size.size();
|
||||
for (const auto dim : c10::irange(size_size)) {
|
||||
if (size[dim] == 1)
|
||||
continue;
|
||||
if (left_stride[dim] != right_stride[dim]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
for (const auto i : c10::irange(1, tensorLists.size())) {
|
||||
for (const auto j : c10::irange(tensorLists[0].size())) {
|
||||
if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
|
||||
tensorLists[0][j].strides() != tensorLists[i][j].strides()) {
|
||||
is_diff_stride(
|
||||
tensorLists[0][j].sizes(),
|
||||
tensorLists[0][j].strides(),
|
||||
tensorLists[i][j].strides())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
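The rewritten `_check_tensors_share_sizes_and_strides` stops rejecting tensor lists whose strides differ only in size-1 dimensions, since a stride in a dimension of size 1 never affects which elements are addressed. A standalone sketch of the same rule, with hypothetical names and shapes chosen just for illustration:

```cpp
#include <ATen/ATen.h>
#include <c10/util/irange.h>

// Sketch: layouts are considered equal when strides agree in every dimension
// whose size is greater than 1; the stride of a size-1 dimension is unused.
bool strides_effectively_equal(const at::Tensor& a, const at::Tensor& b) {
  if (a.sizes() != b.sizes()) {
    return false;
  }
  for (const auto dim : c10::irange(a.dim())) {
    if (a.sizes()[dim] == 1) {
      continue;  // addressing never multiplies by this stride
    }
    if (a.strides()[dim] != b.strides()[dim]) {
      return false;
    }
  }
  return true;
}

// Example: the same storage viewed with an arbitrary stride in the size-1 dim.
//   auto base  = at::arange(6).reshape({2, 1, 3});        // strides {3, 3, 1}
//   auto alias = base.as_strided({2, 1, 3}, {3, 99, 1});   // strides {3, 99, 1}
//   strides_effectively_equal(base, alias) == true         // same addressing
```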
@ -341,8 +341,8 @@ void gemm_notrans_(
|
||||
at::Half* c,
|
||||
int64_t ldc) {
|
||||
// c += alpha * (a @ b)
|
||||
if (n == 1 && beta == 0.0) {
|
||||
at::native::blas_impl::fp16_gemv_notrans(m, k, alpha, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, beta, reinterpret_cast<float16_t*>(c), 1);
|
||||
if (n == 1 && beta == 0.0 && alpha == 1.0) {
|
||||
at::native::blas_impl::fp16_gemv_notrans(m, k, 1.0, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, 0.0, reinterpret_cast<float16_t*>(c), 1);
|
||||
return;
|
||||
}
|
||||
for (const auto i : c10::irange(m)) {
|
||||
@ -388,8 +388,8 @@ void gemm_transa_(
|
||||
float beta,
|
||||
at::Half *c, int64_t ldc) {
|
||||
// c = alpha * (a.T @ b) + beta * c
|
||||
if (n == 1 && beta == 0.0) {
|
||||
at::native::blas_impl::fp16_gemv_trans(k, m, alpha, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, beta, reinterpret_cast<float16_t*>(c), 1);
|
||||
if (n == 1 && beta == 0.0 && alpha == 1.0) {
|
||||
at::native::blas_impl::fp16_gemv_trans(k, m, 1.0, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, 0.0, reinterpret_cast<float16_t*>(c), 1);
|
||||
return;
|
||||
}
|
||||
parallel_for(0, m, 1, [&](int64_t begin, int64_t end) {
|
||||
|
||||
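Both `gemm_notrans_` and `gemm_transa_` now take the hand-written fp16 GEMV path only when it computes exactly what the caller asked for: the kernel has no alpha/beta epilogue, so the condition also requires `alpha == 1.0`, and the literals `1.0`/`0.0` are passed instead of forwarding the parameters. A self-contained float sketch of that dispatch shape; the function names here are placeholders, not ATen internals:

```cpp
#include <cstdint>

// Specialized kernel: computes y = A^T * x only, mirroring what the fp16 GEMV
// fast path assumes (no scaling of the product, no accumulation into y).
static void gemv_trans_fast(int64_t k, int64_t m, const float* a, int64_t lda,
                            const float* x, float* y) {
  for (int64_t j = 0; j < m; ++j) {
    float acc = 0.0f;
    for (int64_t i = 0; i < k; ++i) {
      acc += a[j * lda + i] * x[i];
    }
    y[j] = acc;
  }
}

// Generic path: y = alpha * (A^T * x) + beta * y.
static void gemv_trans_generic(int64_t k, int64_t m, float alpha, const float* a,
                               int64_t lda, const float* x, float beta, float* y) {
  for (int64_t j = 0; j < m; ++j) {
    float acc = 0.0f;
    for (int64_t i = 0; i < k; ++i) {
      acc += a[j * lda + i] * x[i];
    }
    y[j] = alpha * acc + beta * y[j];
  }
}

// Dispatch sketch: the fast kernel is only equivalent when alpha == 1 and
// beta == 0, which is exactly the tightened condition in the diff above.
void gemv_trans(int64_t k, int64_t m, float alpha, const float* a, int64_t lda,
                const float* x, float beta, float* y) {
  if (beta == 0.0f && alpha == 1.0f) {
    gemv_trans_fast(k, m, a, lda, x, y);
    return;
  }
  gemv_trans_generic(k, m, alpha, a, lda, x, beta, y);
}
```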
@ -22,6 +22,7 @@ void run_cudnn_SDP_fprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -43,6 +44,7 @@ void run_cudnn_SDP_bprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
@ -86,9 +88,9 @@ using graph_and_tensors = std::tuple<
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
|
||||
// TODO(eqy): additional options
|
||||
// std::shared_ptr<fe::graph::Tensor_attributes>, // Bias,
|
||||
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_Q,
|
||||
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_KV,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
|
||||
@ -104,7 +106,8 @@ using graph_and_tensors_backward = std::tuple<
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Offset,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // O,
|
||||
@ -126,6 +129,8 @@ struct MHAParams {
|
||||
std::array<int, MAX_MHA_DIM> q_stride;
|
||||
std::array<int, MAX_MHA_DIM> k_stride;
|
||||
std::array<int, MAX_MHA_DIM> v_stride;
|
||||
std::array<int, MAX_MHA_DIM> bias_dim;
|
||||
std::array<int, MAX_MHA_DIM> bias_stride;
|
||||
int64_t b;
|
||||
int64_t h;
|
||||
int64_t s_q;
|
||||
@ -135,6 +140,9 @@ struct MHAParams {
|
||||
double dropout_probability;
|
||||
bool is_causal;
|
||||
bool return_softmaxstats;
|
||||
// might be redundant if we take 0 dim/stride
|
||||
// as signaling no-bias
|
||||
bool has_attn_bias;
|
||||
};
|
||||
|
||||
void setMHAParams(
|
||||
@ -148,6 +156,7 @@ void setMHAParams(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
double dropout_probability,
|
||||
bool is_causal,
|
||||
bool return_softmaxstats) {
|
||||
@ -166,6 +175,7 @@ void setMHAParams(
|
||||
params.dropout_probability = dropout_probability;
|
||||
params.is_causal = is_causal;
|
||||
params.return_softmaxstats = return_softmaxstats;
|
||||
params.has_attn_bias = attn_bias.has_value();
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
q.sizes().size() == MAX_MHA_DIM,
|
||||
"Q tensor has unexpected number of dims, please report a bug to PyTorch.");
|
||||
@ -190,6 +200,17 @@ void setMHAParams(
|
||||
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
|
||||
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
|
||||
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
|
||||
// uninit is OK as the struct is memset 0'd
|
||||
if (params.has_attn_bias) {
|
||||
std::copy(
|
||||
attn_bias.value().sizes().begin(),
|
||||
attn_bias.value().sizes().end(),
|
||||
params.bias_dim.begin());
|
||||
std::copy(
|
||||
attn_bias.value().strides().begin(),
|
||||
attn_bias.value().strides().end(),
|
||||
params.bias_stride.begin());
|
||||
}
|
||||
}
|
||||
|
||||
struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
@ -203,6 +224,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
double dropout_probability,
|
||||
bool is_causal,
|
||||
bool return_softmaxstats) {
|
||||
@ -217,6 +239,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
return_softmaxstats);
|
||||
@ -285,6 +308,7 @@ auto build_graph_and_tensors(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -301,36 +325,6 @@ auto build_graph_and_tensors(
|
||||
mha_graph->set_io_data_type(dtype)
|
||||
.set_intermediate_data_type(fe::DataType_t::FLOAT)
|
||||
.set_compute_data_type(fe::DataType_t::FLOAT);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
q.sizes().data(), q.sizes().data() + q.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
q.sizes(),
|
||||
std::vector<int64_t>(
|
||||
q.strides().data(),
|
||||
q.strides().data() + q.strides().size()))));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
k.sizes().data(), k.sizes().data() + k.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
k.sizes(),
|
||||
std::vector<int64_t>(
|
||||
k.strides().data(),
|
||||
k.strides().data() + k.strides().size()))));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
v.sizes().data(), v.sizes().data() + v.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
v.sizes(),
|
||||
std::vector<int64_t>(
|
||||
v.strides().data(),
|
||||
v.strides().data() + v.strides().size()))));
|
||||
auto attn_scale =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Attn_scale")
|
||||
@ -338,11 +332,6 @@ auto build_graph_and_tensors(
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_is_pass_by_value(true)
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
// TODO(eqy): support bias in the future in a follow-up PR
|
||||
// auto bias = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
// .set_name("bias")
|
||||
// .set_dim({b, 1, s_q, s_kv})
|
||||
// .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
|
||||
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seed")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
@ -360,11 +349,30 @@ auto build_graph_and_tensors(
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale)
|
||||
.set_dropout(dropout_probability, seed, offset);
|
||||
// Optional bias in flash attention is only supported 8.9.3 onwards
|
||||
if (cudnnGetVersion() >= 8904) {
|
||||
// scaled_dot_product_flash_attention_options.set_alibi_mask(true);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(q.sizes().vec())
|
||||
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(k.sizes().vec())
|
||||
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(v.sizes().vec())
|
||||
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
|
||||
if (attn_bias.has_value()) {
|
||||
bias =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("bias")
|
||||
.set_dim(attn_bias.value().sizes().vec())
|
||||
.set_stride(attn_bias.value().strides().vec()));
|
||||
scaled_dot_product_flash_attention_options.set_bias(bias.value());
|
||||
}
|
||||
|
||||
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seq_q")
|
||||
.set_dim({b, 1, 1, 1})
|
||||
@ -376,20 +384,9 @@ auto build_graph_and_tensors(
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
|
||||
// if (cudnnGetVersion() >= 8903) {
|
||||
// scaled_dot_product_flash_attention_options.set_bias(bias)
|
||||
// .set_padding_mask(true)
|
||||
// .set_seq_len_q(seq_q)
|
||||
// .set_seq_len_kv(seq_kv);
|
||||
// }
|
||||
|
||||
auto [O, Stats] =
|
||||
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
|
||||
O->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(
|
||||
o.sizes().data(), o.sizes().data() + o.sizes().size()))
|
||||
.set_stride(std::vector<int64_t>(
|
||||
o.strides().data(), o.strides().data() + o.strides().size()));
|
||||
O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
|
||||
|
||||
if (Stats) {
|
||||
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
|
||||
@ -407,6 +404,7 @@ auto build_graph_and_tensors(
|
||||
std::move(Q),
|
||||
std::move(K),
|
||||
std::move(V),
|
||||
std::move(bias),
|
||||
std::move(attn_scale),
|
||||
std::move(seed),
|
||||
std::move(offset),
|
||||
@ -427,6 +425,7 @@ auto build_graph_and_tensors_backward(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
@ -447,24 +446,6 @@ auto build_graph_and_tensors_backward(
|
||||
mha_graph->set_io_data_type(dtype)
|
||||
.set_intermediate_data_type(fe::DataType_t::FLOAT)
|
||||
.set_compute_data_type(fe::DataType_t::FLOAT);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(std::vector<int64_t>(q.sizes().begin(), q.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(q.strides().begin(), q.strides().end())));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(std::vector<int64_t>(k.sizes().begin(), k.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(k.strides().begin(), k.strides().end())));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(std::vector<int64_t>(v.sizes().begin(), v.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(v.strides().begin(), v.strides().end())));
|
||||
auto attn_scale =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Attn_scale")
|
||||
@ -472,6 +453,31 @@ auto build_graph_and_tensors_backward(
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_is_pass_by_value(true)
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
|
||||
.set_name("CUDNN_SDPA_BACKWARD")
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale);
|
||||
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(q.sizes().vec())
|
||||
.set_stride(q.strides().vec()));
|
||||
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(k.sizes().vec())
|
||||
.set_stride(k.strides().vec()));
|
||||
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(v.sizes().vec())
|
||||
.set_stride(v.strides().vec()));
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
|
||||
if (attn_bias.has_value()) {
|
||||
bias =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("bias")
|
||||
.set_dim(attn_bias.value().sizes().vec())
|
||||
.set_stride(attn_bias.value().strides().vec()));
|
||||
sdpa_backward_options.set_bias(bias.value());
|
||||
}
|
||||
auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seed")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
@ -482,47 +488,27 @@ auto build_graph_and_tensors_backward(
|
||||
.set_dim({1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
auto O = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("O")
|
||||
.set_dim(std::vector<int64_t>(o.sizes().begin(), o.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(o.strides().begin(), o.strides().end())));
|
||||
auto STATS = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Stats")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
softmaxstats.sizes().begin(), softmaxstats.sizes().end()))
|
||||
.set_stride(std::vector<int64_t>(
|
||||
softmaxstats.strides().begin(), softmaxstats.strides().end()))
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto DO = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("DO")
|
||||
.set_dim(std::vector<int64_t>(dO.sizes().begin(), dO.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dO.strides().begin(), dO.strides().end())));
|
||||
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
|
||||
.set_name("CUDNN_SDPA_BACKWARD")
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale);
|
||||
auto O = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("O")
|
||||
.set_dim(o.sizes().vec())
|
||||
.set_stride(o.strides().vec()));
|
||||
auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Stats")
|
||||
.set_dim(softmaxstats.sizes().vec())
|
||||
.set_stride(softmaxstats.strides().vec())
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto DO = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("DO")
|
||||
.set_dim(dO.sizes().vec())
|
||||
.set_stride(dO.strides().vec()));
|
||||
if (dropout_probability != 0.0f) {
|
||||
sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset);
|
||||
}
|
||||
auto [DQ, DK, DV] =
|
||||
mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options);
|
||||
DQ->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(dQ.sizes().begin(), dQ.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dQ.strides().begin(), dQ.strides().end()));
|
||||
DK->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(dK.sizes().begin(), dK.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dK.strides().begin(), dK.strides().end()));
|
||||
DV->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(dV.sizes().begin(), dV.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dV.strides().begin(), dV.strides().end()));
|
||||
DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
|
||||
DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
|
||||
DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
|
||||
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
|
||||
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
|
||||
AT_CUDNN_FRONTEND_CHECK(
|
||||
@ -534,6 +520,7 @@ auto build_graph_and_tensors_backward(
|
||||
std::move(Q),
|
||||
std::move(K),
|
||||
std::move(V),
|
||||
std::move(bias),
|
||||
std::move(attn_scale),
|
||||
std::move(Seed),
|
||||
std::move(Offset),
|
||||
@ -559,6 +546,7 @@ void run_cudnn_SDP_fprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -573,6 +561,11 @@ void run_cudnn_SDP_fprop(
|
||||
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
|
||||
}
|
||||
|
||||
// do nothing if we got 0-element tensors
|
||||
if (!q.numel() || !k.numel() || !v.numel()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto key = MHACacheKeyWrapper(
|
||||
b,
|
||||
h,
|
||||
@ -583,6 +576,7 @@ void run_cudnn_SDP_fprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
return_softmaxstats);
|
||||
@ -605,13 +599,14 @@ void run_cudnn_SDP_fprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
softmaxstats,
|
||||
o,
|
||||
dropoutseed,
|
||||
dropoutoffset,
|
||||
handle);
|
||||
}
|
||||
auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] =
|
||||
auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] =
|
||||
graph_and_tensors_values;
|
||||
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
|
||||
variant_pack = {
|
||||
@ -619,13 +614,15 @@ void run_cudnn_SDP_fprop(
|
||||
{K, k.data_ptr()},
|
||||
{V, v.data_ptr()},
|
||||
{attn_scale, &scaling_factor},
|
||||
//{bias, bias.data_ptr()},
|
||||
{seed, dropoutseed.data_ptr()},
|
||||
{offset, dropoutoffset.data_ptr()},
|
||||
{O, o.data_ptr()}};
|
||||
if (return_softmaxstats) {
|
||||
variant_pack[Stats] = softmaxstats.data_ptr();
|
||||
}
|
||||
if (attn_bias.has_value()) {
|
||||
variant_pack[bias.value()] = attn_bias.value().data_ptr();
|
||||
}
|
||||
auto workspace_size = mha_graph->get_workspace_size();
|
||||
auto workspace_ptr =
|
||||
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
|
||||
@ -647,6 +644,7 @@ void run_cudnn_SDP_bprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
@ -655,6 +653,12 @@ void run_cudnn_SDP_bprop(
|
||||
Tensor& dV,
|
||||
const Tensor& dropoutseed,
|
||||
const Tensor& dropoutoffset) {
|
||||
// do nothing if we got 0-element tensors
|
||||
if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() ||
|
||||
!softmaxstats.numel()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Tensor dO_ = dO;
|
||||
if (!dO.strides()[dO.strides().size() - 1]) {
|
||||
TORCH_WARN(
|
||||
@ -694,6 +698,7 @@ void run_cudnn_SDP_bprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
true);
|
||||
@ -715,6 +720,7 @@ void run_cudnn_SDP_bprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
o,
|
||||
dO_,
|
||||
softmaxstats,
|
||||
@ -726,8 +732,20 @@ void run_cudnn_SDP_bprop(
|
||||
handle);
|
||||
}
|
||||
auto
|
||||
[mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] =
|
||||
graph_and_tensors_backward_values;
|
||||
[mha_graph,
|
||||
Q,
|
||||
K,
|
||||
V,
|
||||
bias,
|
||||
attn_scale,
|
||||
Seed,
|
||||
Offset,
|
||||
O,
|
||||
Do,
|
||||
Stats,
|
||||
Dq,
|
||||
Dk,
|
||||
Dv] = graph_and_tensors_backward_values;
|
||||
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
|
||||
variant_pack = {// inputs
|
||||
{Q, q.data_ptr()},
|
||||
@ -746,6 +764,9 @@ void run_cudnn_SDP_bprop(
|
||||
variant_pack[Seed] = dropoutseed.data_ptr();
|
||||
variant_pack[Offset] = dropoutoffset.data_ptr();
|
||||
}
|
||||
if (attn_bias.has_value()) {
|
||||
variant_pack[bias.value()] = attn_bias.value().data_ptr();
|
||||
}
|
||||
auto workspace_size = mha_graph->get_workspace_size();
|
||||
auto workspace_ptr =
|
||||
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
|
||||
|
||||
@ -18,6 +18,7 @@ void run_cudnn_SDP_fprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -36,6 +37,7 @@ void run_cudnn_SDP_bprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
|
||||
@ -153,13 +153,16 @@ static void reduction_out_mps(const Tensor& input_t,
|
||||
const std::string& func_name) {
|
||||
bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
|
||||
MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, func_name);
|
||||
// NS: TODO: get rid of all those shenanigans and just call reduction_op with view tensor
|
||||
bool canSqueezeLastDim = true;
|
||||
IntArrayRef input_shape = input_t.sizes();
|
||||
if (opt_dim.has_value()) {
|
||||
IntArrayRef dim = opt_dim.value();
|
||||
for (const auto dim_val : dim) {
|
||||
auto wrap_dim = maybe_wrap_dim(dim_val, input_shape.size());
|
||||
if (wrap_dim >= 4) {
|
||||
// canSqueeze logic is broken when dim is negative, it introduces off-by-one-errors or crashes
|
||||
// See https://github.com/pytorch/pytorch/issues/136132#issuecomment-2354482608
|
||||
if (wrap_dim >= 4 || dim_val < 0) {
|
||||
canSqueezeLastDim = false;
|
||||
}
|
||||
TORCH_CHECK(
|
||||
|
||||
@ -8554,7 +8554,7 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CPU, CUDA: __rshift__
|
||||
CPU, CUDA, MPS: __rshift__
|
||||
tags: pointwise
|
||||
|
||||
- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
|
||||
|
||||
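The `native_functions.yaml` change above registers the existing `__rshift__` kernel for the MPS dispatch key as well, so the scalar right-shift no longer needs a CPU fallback for tensors on Apple GPUs. A small hedged sketch; device availability and the commented `.to(...)` move are assumptions of the example:

```cpp
#include <ATen/ATen.h>

// Sketch: elementwise right-shift by a scalar; with the added MPS dispatch
// entry the same call is expected to run natively on an "mps" tensor too.
at::Tensor rshift_example() {
  at::Tensor t = at::arange(8, at::dtype(at::kInt));   // {0, 1, ..., 7}
  // t = t.to(at::Device(at::kMPS));                   // uncomment on Apple GPUs
  return at::__rshift__(t, 1);                         // {0, 0, 1, 1, 2, 2, 3, 3}
}
```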
@ -536,6 +536,24 @@ std::optional<Tensor> convert_boolean_attn_mask(const std::optional<Tensor>& att
|
||||
// Otherwise, attn_mask represents an additive attention tensor
|
||||
return attn_mask;
|
||||
}
|
||||
|
||||
// alternate version to workaround -inf issue with cuDNN
|
||||
// TODO(eqy): delete this when cuDNN -inf issue is resolved
|
||||
std::optional<Tensor> convert_boolean_attn_mask_cudnn(const std::optional<Tensor>& attn_mask, caffe2::TypeMeta dtype) {
|
||||
// Pass through
|
||||
if(!attn_mask.has_value()){
|
||||
return std::nullopt;
|
||||
}
|
||||
// Convert boolean mask to additive mask; need to invert mask to indicate what
|
||||
// to mask *out*.
|
||||
if (attn_mask->dtype() == at::kBool) {
|
||||
// TODO Use the max type of the input and output
|
||||
return at::where(attn_mask->logical_not(), -65504.0, at::scalar_tensor(0.0, at::TensorOptions().dtype(dtype)));
|
||||
}
|
||||
// Otherwise, attn_mask represents an additive attention tensor
|
||||
return attn_mask;
|
||||
}
|
||||
|
||||
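`convert_boolean_attn_mask_cudnn` mirrors the generic boolean-mask conversion but substitutes -65504, the most negative finite fp16 value, for -infinity, so cuDNN never receives an additive bias containing -inf. A hedged libtorch-style sketch of the same conversion; the wrapper function, shapes, and example values are assumptions for illustration:

```cpp
#include <torch/torch.h>

// Sketch: turn a boolean "keep" mask into an additive bias using a large but
// finite negative value, the same trick the cuDNN-specific helper applies.
torch::Tensor boolean_to_additive_mask_finite(const torch::Tensor& keep_mask,
                                              torch::ScalarType dtype) {
  // -65504 is the lowest finite float16 value, so masked positions still go to
  // ~zero probability after softmax without introducing infinities.
  return torch::where(keep_mask.logical_not(),
                      torch::full({}, -65504.0, torch::dtype(dtype)),
                      torch::zeros({}, torch::dtype(dtype)));
}

// Example:
//   auto keep = torch::tensor({true, true, false, true});
//   auto bias = boolean_to_additive_mask_finite(keep, torch::kHalf);
//   // bias == {0, 0, -65504, 0}, added to attention scores before softmax
```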
// Memory Efficient Attention requires a padded attn mask bias
|
||||
// This function pads the attn_mask bias to be a multiple of 16
|
||||
// Then slices the padded bias to the original size
|
||||
@ -698,15 +716,16 @@ Tensor scaled_dot_product_attention(
|
||||
query_, key, value, attn_mask_, dropout_p, is_causal, scale, enable_gqa);
|
||||
}
|
||||
sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
switch (backend) {
|
||||
case sdp::SDPBackend::cudnn_attention: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask_cudnn(attn_mask_, query_.dtype());
|
||||
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
|
||||
auto out_lse_softmax = at::_scaled_dot_product_cudnn_attention(
|
||||
query_, key, value, attn_mask_, compute_logsumexp, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
query_, key, value, attn_mask, compute_logsumexp, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
return std::get<0>(out_lse_softmax);
|
||||
}
|
||||
case sdp::SDPBackend::flash_attention: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
if(query_.device().type() == DeviceType::CUDA){
|
||||
c10::SymInt og_size = query_.sym_size(-1);
|
||||
Tensor query_padded = pad_last_dim<8, false>(query_);
|
||||
@ -723,6 +742,7 @@ Tensor scaled_dot_product_attention(
|
||||
query_, key, value, dropout_p, is_causal, attn_mask, scale));
|
||||
}
|
||||
case sdp::SDPBackend::efficient_attention: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
|
||||
if (attn_mask.has_value()) {
|
||||
attn_mask.value() = preprocess_mask(attn_mask.value(), query_, key, value);;
|
||||
@ -732,11 +752,13 @@ Tensor scaled_dot_product_attention(
|
||||
return std::get<0>(out_and_lse);
|
||||
}
|
||||
case sdp::SDPBackend::overrideable: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
auto out_lse_softmax = at::_scaled_dot_product_fused_attention_overrideable(
|
||||
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
return std::get<0>(out_lse_softmax);
|
||||
}
|
||||
case sdp::SDPBackend::math:
|
||||
case sdp::SDPBackend::math: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
if ((!GradMode::is_enabled() || (!query_.requires_grad() && !key.requires_grad() && !value.requires_grad()))
|
||||
&& query_.device().type() == DeviceType::MPS && dropout_p == 0.0
|
||||
&& query_.is_contiguous() && key.is_contiguous() && value.is_contiguous()
|
||||
@ -761,6 +783,7 @@ Tensor scaled_dot_product_attention(
|
||||
std::nullopt, /*dropout_mask*/
|
||||
scale,
|
||||
enable_gqa));
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
@ -780,22 +803,26 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
|
||||
value.is_contiguous(),
|
||||
"scaled_dot_product_attention: If inputs are nested tensors they must be contiguous");
|
||||
}
|
||||
auto& ctx = at::globalContext();
|
||||
auto origin_dtype = query_.scalar_type();
|
||||
// Keep query, key, value in high precision for accuracy
|
||||
// NestedTensor reports issues for backward with autograd so disabled: must be
|
||||
// contiguous to get buffer.
|
||||
auto query_acc = (query_.scalar_type() == at::kHalf ||
|
||||
query_.scalar_type() == at::kBFloat16) &&
|
||||
auto query_acc = !ctx.allowFP16BF16ReductionMathSDP() &&
|
||||
(query_.scalar_type() == at::kHalf ||
|
||||
query_.scalar_type() == at::kBFloat16) &&
|
||||
!query_.is_nested()
|
||||
? query_.to(at::kFloat)
|
||||
: query_;
|
||||
auto key_acc =
|
||||
(key.scalar_type() == at::kHalf || key.scalar_type() == at::kBFloat16) &&
|
||||
auto key_acc = !ctx.allowFP16BF16ReductionMathSDP() &&
|
||||
(key.scalar_type() == at::kHalf ||
|
||||
key.scalar_type() == at::kBFloat16) &&
|
||||
!key.is_nested()
|
||||
? key.to(at::kFloat)
|
||||
: key;
|
||||
auto value_acc = (value.scalar_type() == at::kHalf ||
|
||||
value.scalar_type() == at::kBFloat16) &&
|
||||
auto value_acc = !ctx.allowFP16BF16ReductionMathSDP() &&
|
||||
(value.scalar_type() == at::kHalf ||
|
||||
value.scalar_type() == at::kBFloat16) &&
|
||||
!value.is_nested()
|
||||
? value.to(at::kFloat)
|
||||
: value;
|
||||
|
||||
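A hedged Python sketch of what the boolean-to-additive mask conversion above does: positions marked False are pushed toward negative infinity (or, on the cuDNN workaround path, to -65504, the most negative fp16 value) so they receive zero attention weight. The helper name is hypothetical; only the fill values mirror the C++ shown above.

import torch

def to_additive_mask(bool_mask, dtype, neg_fill):
    # True = attend, False = mask out (same convention as convert_boolean_attn_mask)
    return torch.where(bool_mask.logical_not(),
                       torch.tensor(neg_fill, dtype=dtype),
                       torch.tensor(0.0, dtype=dtype))

m = torch.tensor([[True, False], [True, True]])
print(to_additive_mask(m, torch.float16, float("-inf")))  # generic path
print(to_additive_mask(m, torch.float16, -65504.0))       # cuDNN workaround path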
@@ -774,6 +774,18 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
  TORCH_CHECK(
      max_seqlen_batch_k == max_seqlen_batch_v,
      "Key and Value must have the same sequence length");
  auto attn_bias_ = attn_bias;
  if (attn_bias_.has_value()) {
    const auto bias_dim = attn_bias_.value().dim();
    if (bias_dim == 2) {
      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
    } else if (bias_dim == 3) {
      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
    } else {
      attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
      TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
    }
  }

  Tensor attention, log_sumexp;

@@ -818,13 +830,14 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
      query/* Tensor q*/,
      key/* Tensor k*/,
      value/* Tensor v*/,
      attn_bias_ /* std::optional<Tensor> */,
      log_sumexp/*Tensor softmaxstats*/,
      attention/*Tensor o*/,
      cudnn_seed/*Tensor dropoutseed*/,
      cudnn_offset/*Tensor dropoutoffset*/);

  // TODO(eqy): support debug_attn_mask
  return std::make_tuple(attention, log_sumexp, Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, cudnn_seed, cudnn_offset, Tensor());
  return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}

std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
@@ -1102,10 +1115,13 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
    offset_t = at::empty({}, at::dtype(at::kLong).device(device));
  } else {
    auto [seed, offset] = at::cuda::philox::unpack(philox_state);
    seed_t = at::scalar_tensor(
        at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong));
    offset_t = at::scalar_tensor(
        at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong));
#ifdef USE_ROCM
    const auto options = at::dtype(at::kLong).device(at::kCUDA);
#else
    const auto options = at::dtype(at::kLong);
#endif
    seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), options);
    offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), options);
  }
} else {
  // Not using dropout

@@ -1118,7 +1134,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
  auto ret = aotriton::v2::flash::check_gpu(stream);
  if (hipSuccess != ret) {
    TORCH_CHECK(false,
                "[AOTriton] Accelerated SDPA only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx94a:sramecc+:xnack-)")
                "[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
                " (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
  }

  // AOTriton may accept aligned on logsumexp tensor in the future for better

@@ -1147,8 +1164,16 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_

  using aotriton::v2::flash::attn_fwd;
  using sdp::aotriton_adapter::mk_aotensor;
  using sdp::aotriton_adapter::mk_aoscalartensor;
  using sdp::aotriton_adapter::mk_philoxtensor;
  aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, aotriton::DType::kFloat16);
  at::Tensor softmax_fa_t = at::empty({ 0, 0, 0, 0 }, query.options());
  const bool use_philox_state = in_capture_stream;
  auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
  auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t);
  auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0;
  auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
  auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
  hipError_t err; // TODO: Error handling
  err = attn_fwd(mk_aotensor(q_t, "q"),
                 mk_aotensor(k_t, "k"),

@@ -1158,8 +1183,11 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
                 mk_aotensor<2>(softmax_lse, "M"),
                 mk_aotensor(output_t, "Out"),
                 dropout_p,
                 use_dropout ? *seed_t.data_ptr<int64_t>() : 0,
                 use_dropout ? *offset_t.data_ptr<int64_t>() : 0,
                 seed,
                 offset1,
                 offset2,
                 seed_output,
                 offset_output,
                 mk_aotensor(softmax_fa_t, "encoded_softmax"),
                 is_causal,
                 stream);

@@ -195,6 +195,27 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
  const int64_t num_heads = query.size(1);
  const int64_t head_dim_qk = query.size(3);
  const int64_t head_dim_v = value.size(3);
  const int64_t max_seqlen_batch_q = query.size(2);
  const int64_t max_seqlen_batch_k = key.size(2);

  // This is needed because SaveVariable automatically converts
  // std::optional to undefined tensor
  std::optional<Tensor> attn_bias_;
  if (attn_bias.defined()) {
    attn_bias_ = attn_bias;
  }
  if (attn_bias_.has_value()) {
    const auto bias_dim = attn_bias_.value().dim();
    if (bias_dim == 2) {
      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
    } else if (bias_dim == 3) {
      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
    } else {
      attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
      TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
    }
  }

  const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
  auto dq = at::empty_like(query);
  auto dk = at::empty_like(key);

@@ -211,6 +232,7 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
      query /*const Tensor& q*/,
      key /*const Tensor& k*/,
      value /*const Tensor& v*/,
      attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
      out /*const Tensor& o*/,
      grad_out/*const Tensor& dO*/,
      logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,

@@ -219,7 +241,7 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
      dv/*Tensor& dV*/,
      philox_seed/*Tensor& dropoutseed*/,
      philox_offset/*Tensor& dropoutoffset*/);
  return std::make_tuple(dq, dk, dv);
  return std::make_tuple(std::move(dq), std::move(dk), std::move(dv));
}

std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>

@@ -394,7 +416,8 @@ _efficient_attention_backward(
  auto ret = aotriton::v2::flash::check_gpu(stream);
  if (hipSuccess != ret) {
    TORCH_CHECK(false,
                "[AOTriton] Accelerated SDPA only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx942:sramecc+:xnack-)")
                "[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
                " (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
  }
  const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
  bool is_causal;

@@ -419,6 +442,7 @@ _efficient_attention_backward(
  hipError_t err;
  using aotriton::v2::flash::attn_bwd;
  using sdp::aotriton_adapter::mk_aotensor;
  using sdp::aotriton_adapter::mk_aoscalartensor;
  using sdp::aotriton_adapter::cast_dtype;
  aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype()));
  err = attn_bwd(mk_aotensor(q_t, "q"),

@@ -435,8 +459,9 @@ _efficient_attention_backward(
                 mk_aotensor<2>(softmax_lse, "L"),
                 mk_aotensor<2>(delta, "delta"),
                 float(dropout_p),
                 rng_engine_inputs.seed_.val,
                 rng_engine_inputs.offset_.val,
                 mk_aoscalartensor(philox_seed),
                 mk_aoscalartensor(philox_offset),
                 0,
                 is_causal,
                 stream);
#else
@@ -210,6 +210,7 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
  // Check that the gpu is capable of running flash attention
  using sm80 = SMVersion<8, 0>;
  using sm90 = SMVersion<9, 0>;
  auto dprops = at::cuda::getCurrentDeviceProperties();
#if USE_ROCM
#if USE_AOTRITON
  auto stream = at::cuda::getCurrentCUDAStream().stream();

@@ -221,11 +222,19 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
    }
    return false;
  }
  c10::string_view arch(dprops->gcnArchName);
  if (arch == "gfx1100") {
    static const bool enable_navi3x = c10::utils::check_env("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == true;
    if (!enable_navi3x) {
      TORCH_WARN_ONCE("Flash attention support on Navi31 GPU is still experimental."
                      " Enable it with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1.");
      return false;
    }
  }
#else
  return false;
#endif
#else
  auto dprops = at::cuda::getCurrentDeviceProperties();
  if (!check_sm_version<sm80, sm90>(dprops)) {
    if (debug) {
      TORCH_WARN(

@@ -245,6 +254,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
  // Mem Efficient attention supports hardware in the range [sm_50, sm_90]
  using sm50 = SMVersion<5, 0>;
  using sm90 = SMVersion<9, 0>;
  auto dprops = at::cuda::getCurrentDeviceProperties();
#if USE_ROCM
#if USE_AOTRITON
  auto stream = at::cuda::getCurrentCUDAStream().stream();

@@ -256,11 +266,19 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
    }
    return false;
  }
  c10::string_view arch(dprops->gcnArchName);
  if (arch == "gfx1100") {
    static const bool enable_navi3x = c10::utils::check_env("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == true;
    if (!enable_navi3x) {
      TORCH_WARN_ONCE("Memory Efficient attention on Navi31 GPU is still experimental."
                      " Enable it with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1.");
      return false;
    }
  }
#else
  return false;
#endif
#else
  auto dprops = at::cuda::getCurrentDeviceProperties();
  if (!check_sm_version<sm50, sm90>(dprops)) {
    if (debug) {
      TORCH_WARN(

@@ -561,7 +579,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
      check_cudnn_deterministic,
      // check_is_causal,
      check_dtypes_low_precision,
      check_for_attn_mask_cudnn,
      check_attn_mask_shape,
      check_cudnn_hardware_support
  );
  for (auto& constraint : general_constraints) {

@@ -616,9 +634,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
      }
    }
  }
#if USE_ROCM
  constexpr bool backend_supports_grouped_query_attention = false;
#else
  constexpr bool backend_supports_grouped_query_attention = true;
#endif
  if (has_only_dense_inputs(params)) {
    constexpr auto dense_constraints = array_of<bool (*)(sdp_params const&, bool)>(
        check_batch_size_and_num_heads_dense<true /*supports_grouped_query_attention=*/>,
        check_batch_size_and_num_heads_dense<backend_supports_grouped_query_attention>,
        check_nonzero_sequence_lengths_dense,
        check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim=*/>);
    for (auto& constraint : dense_constraints) {
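A hedged usage sketch of the experimental Navi31 gate added above: on a ROCm build with a gfx1100 GPU, flash and memory-efficient SDPA stay disabled unless the environment variable is set before the first dispatch. Everything except the env-var name (which appears verbatim in the source above) is illustrative.

import os
# Must be set before SDPA backend checks run; gfx1100 (Navi31) support is experimental.
os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

import torch
q = k = v = torch.randn(1, 4, 64, 64, device="cuda", dtype=torch.float16)
out = torch.nn.functional.scaled_dot_product_attention(q, k, v)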
@@ -115,6 +115,18 @@ aotriton::TensorView<Rank> mk_aotensor(const at::Tensor& q, c10::string_view ten
                                       cast_dtype(q.dtype()));
}

inline aotriton::TensorView<0> mk_aoscalartensor(const at::Tensor& q)
{
  return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(q.data_ptr()),
                                 cast_dtype(q.dtype()));
}

inline aotriton::TensorView<0> mk_philoxtensor(const int64_t* ptr)
{
  return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(ptr),
                                 aotriton::DType::kUInt64); // AOTriton excepts unsigned int64
}

} // namespace aotriton_adapter

} // namespace sdp
@@ -72,7 +72,8 @@ void check_gpu_arch(hipStream_t stream) {
  auto ret = aotriton::v2::flash::check_gpu(stream);
  if (hipSuccess != ret) {
    TORCH_CHECK(false,
                "FlashAttention only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx942:sramecc+:xnack-)")
                "[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
                " (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
  }
}

@@ -164,6 +165,8 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
  auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
  at::Tensor seed_t, offset_t;

  at::PhiloxCudaState philox_state;
  bool use_philox_state = false;
  if (p_dropout > 0.0) {
    // number of times random will be generated per thread, to offset philox counter in thc random
    // state

@@ -171,12 +174,14 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
    int64_t counter_offset = batch_size * num_heads * 32;
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    at::PhiloxCudaState philox_state = gen->philox_cuda_state(counter_offset);
    philox_state = gen->philox_cuda_state(counter_offset);
    if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) {
      auto [seed, offset] = at::cuda::philox::unpack(philox_state);
      seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong));
      offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong));
      seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong).device(at::kCUDA));
      offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong).device(at::kCUDA));
    } else {
      // See Note [CUDA Graph-safe RNG states] about the design
      use_philox_state = true;
      seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
      offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
    }

@@ -185,8 +190,8 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
      seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
      offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
    } else {
      seed_t = at::empty({}, at::dtype(at::kLong));
      offset_t = at::empty({}, at::dtype(at::kLong));
      seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
      offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
    }
  }

@@ -219,9 +224,17 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head

  hipError_t err; // TODO: Error handling
  using aotriton::v2::flash::attn_fwd;
  using aotriton::TensorView;
  using sdp::aotriton_adapter::mk_aotensor;
  using sdp::aotriton_adapter::mk_aoscalartensor;
  using sdp::aotriton_adapter::mk_philoxtensor;
  using sdp::aotriton_adapter::cast_dtype;
  aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
  auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
  auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t);
  auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0;
  auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
  auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
  err = attn_fwd(mk_aotensor(q_t, "q"),
                 mk_aotensor(k_t, "k"),
                 mk_aotensor(v_t, "v"),

@@ -230,8 +243,11 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
                 mk_aotensor<2>(M, "M"),
                 mk_aotensor(output_t, "Out"),
                 p_dropout,
                 philox_args.seed_.val,
                 philox_args.offset_.val,
                 seed,
                 offset1,
                 offset2,
                 seed_output,
                 offset_output,
                 mk_aotensor(softmax_fa_t, "encoded_softmax"),
                 is_causal,
                 stream);

@@ -420,6 +436,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
  {
    using aotriton::v2::flash::attn_bwd;
    using sdp::aotriton_adapter::mk_aotensor;
    using sdp::aotriton_adapter::mk_aoscalartensor;
    using sdp::aotriton_adapter::cast_dtype;
    aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
    err = attn_bwd(mk_aotensor(q_t, "q"),

@@ -436,8 +453,9 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
                   mk_aotensor<2>(softmax_lse_cont, "L"),
                   mk_aotensor<2>(delta, "delta"),
                   p_dropout,
                   philox_args.seed_.val,
                   philox_args.offset_.val,
                   mk_aoscalartensor(philox_seed),
                   mk_aoscalartensor(philox_offset),
                   0,
                   is_causal,
                   stream);
  }
@@ -275,17 +275,6 @@ inline bool check_for_attn_mask(sdp_params const& params, bool debug) {
  return true;
}

// TODO(eqy): remove this once support is added
inline bool check_for_attn_mask_cudnn(sdp_params const& params, bool debug) {
  if (params.attn_mask.has_value()) {
    if (debug) {
      TORCH_WARN("cuDNN Attention does not support non-null attn_mask.");
    }
    return false;
  }
  return true;
}

inline bool check_attn_mask_shape(sdp_params const& params, bool debug) {
  auto attn_mask = params.attn_mask;
  if (!attn_mask.has_value()) {
@@ -130,6 +130,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@@ -142,6 +146,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@@ -170,6 +178,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@@ -242,6 +254,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0

@@ -126,6 +126,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0

@@ -130,6 +130,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@@ -142,6 +146,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@@ -170,6 +178,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@@ -242,6 +254,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0

@@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0

@@ -138,6 +138,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,19
hf_DistilBert,pass,0
@@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0
@@ -178,6 +190,10 @@ maml,pass_due_to_skip,0
mnasnet1_0,pass,0
maml_omniglot,pass,0
@@ -258,6 +274,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,pass,0

@@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0

@@ -138,6 +138,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,19
hf_DistilBert,pass,0
@@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0
@@ -182,6 +194,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@@ -258,6 +274,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,pass,0

@@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0

@@ -138,6 +138,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,13
hf_DistilBert,pass,0
@@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0

@@ -114,6 +114,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@@ -126,6 +130,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@@ -158,6 +166,10 @@ mobilenet_v2,pass,0
mnasnet1_0,pass,0
mobilenet_v2_quantized_qat,fail_to_run,0
@@ -226,6 +238,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0

@@ -114,6 +114,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,0
hf_DistilBert,pass,0
@@ -126,6 +130,10 @@ hf_GPT2_large,pass_due_to_skip,0
hf_T5,pass,0
hf_T5_base,pass,0
@@ -154,6 +162,10 @@ maml_omniglot,pass,0
mnasnet1_0,pass,0
mobilenet_v2,pass,0
@@ -226,6 +238,10 @@ resnext50_32x4d,pass,0
shufflenet_v2_x1_0,pass,0
soft_actor_critic,fail_to_run,0

@@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
OPTForCausalLM,pass,0
PLBartForCausalLM,pass,0

@@ -122,6 +122,10 @@ hf_Bert_large,pass,0
hf_BigBird,pass,13
hf_DistilBert,pass,0
@@ -134,10 +138,18 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Longformer,pass,4
hf_Reformer,pass,5
hf_T5,pass,0
hf_T5_base,pass,0
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing(self):
|
||||
def skip_models_for_freezing_cpu(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing_cuda(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
@ -4275,7 +4279,6 @@ def run(runner, args, original_dir=None):
|
||||
runner.skip_models.update(runner.slow_models)
|
||||
|
||||
if args.devices == ["cpu"]:
|
||||
runner.skip_models.update(runner.very_slow_models)
|
||||
runner.skip_models.update(runner.skip_models_for_cpu)
|
||||
elif args.devices == ["cuda"]:
|
||||
runner.skip_models.update(runner.skip_models_for_cuda)
|
||||
@ -4284,7 +4287,10 @@ def run(runner, args, original_dir=None):
|
||||
runner.skip_models.update(runner.skip_multiprocess_models)
|
||||
|
||||
if args.freezing:
|
||||
runner.skip_models.update(runner.skip_models_for_freezing)
|
||||
if args.devices == ["cpu"]:
|
||||
runner.skip_models.update(runner.skip_models_for_freezing_cpu)
|
||||
elif args.devices == ["cuda"]:
|
||||
runner.skip_models.update(runner.skip_models_for_freezing_cuda)
|
||||
|
||||
if args.no_skip:
|
||||
runner.skip_models.clear()
|
||||
|
||||
@ -505,7 +505,7 @@ class HuggingfaceRunner(BenchmarkRunner):
|
||||
return 4e-3, cosine
|
||||
if (
|
||||
current_device == "cpu"
|
||||
and name in self._config["tolerance"]["higher_inference"]
|
||||
and name in self._config["tolerance"]["higher_inference_cpu"]
|
||||
):
|
||||
return 4e-3, cosine
|
||||
return 1e-3, cosine
|
||||
|
||||
@ -11,9 +11,7 @@ skip:
|
||||
- GPTJForQuestionAnswering
|
||||
|
||||
device:
|
||||
cpu:
|
||||
# OOMs
|
||||
- OPTForCausalLM
|
||||
cpu: []
|
||||
|
||||
control_flow:
|
||||
- AllenaiLongformerBase
|
||||
@ -71,6 +69,7 @@ batch_size:
|
||||
TrOCRForCausalLM: 2
|
||||
XGLMForCausalLM: 4
|
||||
XLNetLMHeadModel: 2
|
||||
YituTechConvBert: 2
|
||||
|
||||
|
||||
tolerance:
|
||||
|
||||
@ -387,11 +387,6 @@ def get_skip_tests(suite, device, is_training: bool):
|
||||
skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cpu)
|
||||
elif device == "cuda":
|
||||
skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cuda)
|
||||
else:
|
||||
if hasattr(module, "SKIP"):
|
||||
skip_tests.update(module.SKIP)
|
||||
if is_training and hasattr(module, "SKIP_TRAIN"):
|
||||
skip_tests.update(module.SKIP_TRAIN)
|
||||
|
||||
skip_tests = (f"-x {name}" for name in skip_tests)
|
||||
skip_str = " ".join(skip_tests)
|
||||
@ -438,7 +433,7 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
|
||||
if args.enable_cpu_launcher:
|
||||
launcher_cmd = f"python -m torch.backends.xeon.run_cpu {args.cpu_launcher_args}"
|
||||
cmd = f"{launcher_cmd} benchmarks/dynamo/{suite}.py --{testing} --{dtype} -d{device} --output={output_filename}"
|
||||
cmd = f"{cmd} {base_cmd} {args.extra_args} --no-skip --dashboard"
|
||||
cmd = f"{cmd} {base_cmd} {args.extra_args} --dashboard"
|
||||
skip_tests_str = get_skip_tests(suite, device, args.training)
|
||||
cmd = f"{cmd} {skip_tests_str}"
|
||||
|
||||
|
||||
@ -138,8 +138,12 @@ class TorchBenchmarkRunner(BenchmarkRunner):
|
||||
return self._skip["device"]["cuda"]
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing(self):
|
||||
return self._skip["freezing"]
|
||||
def skip_models_for_freezing_cuda(self):
|
||||
return self._skip["freezing"]["cuda"]
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing_cpu(self):
|
||||
return self._skip["freezing"]["cpu"]
|
||||
|
||||
@property
|
||||
def slow_models(self):
|
||||
|
||||
@ -191,6 +191,7 @@ skip:
|
||||
- hf_Whisper
|
||||
- stable_diffusion_text_encoder
|
||||
- llava
|
||||
- moco
|
||||
|
||||
cuda: []
|
||||
|
||||
@ -232,10 +233,13 @@ skip:
|
||||
|
||||
# for these models, conv-batchnorm fusing causes big numerical churn.
|
||||
# Skip them
|
||||
# mnasnet1_0 and shufflenet_v2_x1_0 can pass on cpu, moco cuda only.
|
||||
freezing:
|
||||
- mnasnet1_0
|
||||
- moco
|
||||
- shufflenet_v2_x1_0
|
||||
cuda:
|
||||
- mnasnet1_0
|
||||
- moco
|
||||
- shufflenet_v2_x1_0
|
||||
cpu: []
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
  } while (0)

#define C10_LIBCUDA_DRIVER_API(_) \
  _(cuDeviceGetAttribute) \
  _(cuMemAddressReserve) \
  _(cuMemRelease) \
  _(cuMemMap) \

@@ -48,9 +48,7 @@ if(NOT BUILD_LIBTORCHLESS)
endif()

# ---[ Dependency of c10_hip
target_link_libraries(c10_hip PUBLIC c10)

target_link_libraries(c10_hip PUBLIC ${PYTORCH_HIP_LIBRARIES})
target_link_libraries(c10_hip PUBLIC ${C10_LIB} hip::amdhip64)

target_include_directories(
    c10_hip PUBLIC

@@ -168,6 +168,8 @@ class DeviceCachingAllocator {
        !block->allocated && block->event_count == 0 &&
        block->stream_uses.empty());

    size_t original_block_size = block->size;
    size_t requested_size = block->requested_size;
    auto& pool = *block->pool;
    const std::array<Block*, 2> merge_candidates = {block->prev, block->next};
    for (Block* merge_candidate : merge_candidates) {

@@ -180,8 +182,8 @@ class DeviceCachingAllocator {

    StatTypes stat_types = get_stat_types_for_pool(pool);
    for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
      stats.active_bytes[stat_type].decrease(block->size);
      stats.requested_bytes[stat_type].decrease(block->requested_size);
      stats.active_bytes[stat_type].decrease(original_block_size);
      stats.requested_bytes[stat_type].decrease(requested_size);
    });
  }

@@ -544,11 +544,6 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
  # Disable I8MM For CI since clang 9 does not support neon i8mm.
  set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")

  # Older MSVC versions don't support AVX512FP. TODO Minimum version support?
  IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
    set(XNNPACK_ENABLE_AVX512FP16 OFF CACHE BOOL "")
  ENDIF()

  # Conditionally disable AVX512AMX, as it requires Clang 11 or later. Note that
  # XNNPACK does conditionally compile this based on GCC version. Once it also does
  # so based on Clang version, this logic can be removed.

@@ -1093,8 +1088,8 @@ if(USE_ROCM)
    hip_include_directories(${Caffe2_HIP_INCLUDE})

    set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS
      ${PYTORCH_HIP_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB})
    list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${hipblaslt_LIBRARIES})
      hip::amdhip64 MIOpen hiprtc::hiprtc) # libroctx will be linked in with MIOpen
    list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS roc::hipblaslt)

    list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS
      roc::hipblas hip::hipfft hip::hiprand roc::hipsparse roc::hipsolver)

@@ -7,7 +7,7 @@ set(CAFFE2_USE_EXCEPTION_PTR 1)
# ---[ Check if we want to turn off deprecated warning due to glog.
if(USE_GLOG)
  cmake_push_check_state(RESET)
  set(CMAKE_REQUIRED_FLAGS "-std=c++14")
  set(CMAKE_REQUIRED_FLAGS "-std=c++17")
  CHECK_CXX_SOURCE_COMPILES(
    "#include <glog/stl_logging.h>
    int main(int argc, char** argv) {

@@ -146,6 +146,7 @@ if(HIP_FOUND)
    set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub)
    set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust)
    set(hipsolver_DIR ${ROCM_PATH}/lib/cmake/hipsolver)
    set(hiprtc_DIR ${ROCM_PATH}/lib/cmake/hiprtc)

    find_package_and_print_version(hip REQUIRED)

@@ -164,6 +165,7 @@ if(HIP_FOUND)
    find_package_and_print_version(hipcub REQUIRED)
    find_package_and_print_version(rocthrust REQUIRED)
    find_package_and_print_version(hipsolver REQUIRED)
    find_package_and_print_version(hiprtc REQUIRED)

    find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib)

@@ -1,7 +1,7 @@
The C++ Frontend
================

The PyTorch C++ frontend is a C++14 library for CPU and GPU
The PyTorch C++ frontend is a C++17 library for CPU and GPU
tensor computation, with automatic differentiation and high level building
blocks for state of the art machine learning applications.

@@ -2,7 +2,7 @@ Tensor Basics
=============

The ATen tensor library backing PyTorch is a simple tensor library that exposes
the Tensor operations in Torch directly in C++14. ATen's API is auto-generated
the Tensor operations in Torch directly in C++17. ATen's API is auto-generated
from the same declarations PyTorch uses so the two APIs will track each other
over time.

@@ -95,6 +95,11 @@ updates the parameters, so the scale factor does not interfere with the learning

.. currentmodule:: torch.cuda.amp

.. autoclass:: GradScaler
    :members:

.. currentmodule:: torch.cpu.amp

.. autoclass:: GradScaler
    :members:

@@ -365,7 +370,7 @@ in which unlisted ops run if they're downstream from autocasted ops.

If an op is unlisted, we assume it's numerically stable in ``bfloat16``.
If you believe an unlisted op is numerically unstable in ``bfloat16``,
please file an issue.
please file an issue. ``float16`` shares the lists of ``bfloat16``.

CPU Ops that can autocast to ``bfloat16``
"""""""""""""""""""""""""""""""""""""""""

@@ -375,19 +380,25 @@ CPU Ops that can autocast to ``bfloat16``
``conv3d``,
``bmm``,
``mm``,
``linalg_vecdot``,
``baddbmm``,
``addmm``,
``addbmm``,
``linear``,
``matmul``,
``_convolution``
``_convolution``,
``conv_tbc``,
``mkldnn_rnn_layer``,
``conv_transpose1d``,
``conv_transpose2d``,
``conv_transpose3d``,
``prelu``,
``scaled_dot_product_attention``,
``_native_multi_head_attention``

CPU Ops that can autocast to ``float32``
""""""""""""""""""""""""""""""""""""""""

``conv_transpose1d``,
``conv_transpose2d``,
``conv_transpose3d``,
``avg_pool3d``,
``binary_cross_entropy``,
``grid_sampler``,

@@ -421,9 +432,22 @@ CPU Ops that can autocast to ``float32``
``replication_pad2d``,
``replication_pad3d``,
``mse_loss``,
``cosine_embedding_loss``,
``nll_loss``,
``nll_loss2d``,
``hinge_embedding_loss``,
``poisson_nll_loss``,
``cross_entropy_loss``,
``l1_loss``,
``huber_loss``,
``margin_ranking_loss``,
``soft_margin_loss``,
``triplet_margin_loss``,
``multi_margin_loss``,
``ctc_loss``,
``kl_div``,
``multilabel_margin_loss``,
``binary_cross_entropy_with_logits``,
``fft_fft``,
``fft_ifft``,
``fft_fft2``,

@@ -438,7 +462,6 @@ CPU Ops that can autocast to ``float32``
``fft_irfftn``,
``fft_hfft``,
``fft_ihfft``,
``linalg_matrix_norm``,
``linalg_cond``,
``linalg_matrix_rank``,
``linalg_solve``,

@@ -451,14 +474,10 @@ CPU Ops that can autocast to ``float32``
``linalg_tensorinv``,
``linalg_tensorsolve``,
``fake_quantize_per_tensor_affine``,
``eig``,
``geqrf``,
``lstsq``,
``_lu_with_info``,
``qr``,
``solve``,
``svd``,
``symeig``,
``triangular_solve``,
``fractional_max_pool2d``,
``fractional_max_pool3d``,

@@ -85,6 +85,10 @@ torch.backends.cuda

.. autofunction:: torch.backends.cuda.enable_math_sdp

.. autofunction:: torch.backends.cuda.fp16_bf16_reduction_math_sdp_allowed

.. autofunction:: torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp

.. autofunction:: torch.backends.cuda.cudnn_sdp_enabled

.. autofunction:: torch.backends.cuda.enable_cudnn_sdp
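A hedged sketch of how the two torch.backends.cuda entries documented above pair with the allowFP16BF16ReductionMathSDP() check in the math-backend SDPA code earlier in this diff; the printed values assume the default configuration.

import torch

# By default the math SDPA backend upcasts fp16/bf16 inputs and accumulates in fp32.
print(torch.backends.cuda.fp16_bf16_reduction_math_sdp_allowed())  # expected: False

# Opt in to reduced-precision reductions (faster, less accurate).
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
print(torch.backends.cuda.fp16_bf16_reduction_math_sdp_allowed())  # expected: True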
@@ -33,39 +33,82 @@ Module-level maintainers
NN APIs (torch.nn)
~~~~~~~~~~~~~~~~~~

- Greg Chanan (`gchanan <https://github.com/gchanan>`__)
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
- Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- Mikayla Gawarecki (`mikaylagawarecki <https://github.com/mikaylagawarecki>`__)
- Alban Desmaison (`albanD <https://github.com/albanD>`__)
- Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- (emeritus) Greg Chanan (`gchanan <https://github.com/gchanan>`__)
- (emeritus) Soumith Chintala (`soumith <https://github.com/soumith>`__)
- (emeritus) Sam Gross (`colesbury <https://github.com/colesbury>`__)
- (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)

Optimizers (torch.optim)
~~~~~~~~~~~~~~~~~~~~~~~~

- Jane Xu (`janeyx99 <https://github.com/janeyx99>`__)
- Alban Desmaison (`albanD <https://github.com/albanD>`__)
- Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- Soumith Chintala (`soumith <https://github.com/soumith>`__)
- (emeritus) Soumith Chintala (`soumith <https://github.com/soumith>`__)
- (emeritus) Ilqar Ramazanli (`iramazanli <https://github.com/iramazanli>`__)
- (emeritus) Vincent Quenneville-Belair (`vincentqb <https://github.com/vincentqb>`__)

Autograd (torch.autograd)
~~~~~~~~~~~~~~~~~~~~~~~~~

- Edward Yang (`ezyang <https://github.com/ezyang>`__)
- Alban Desmaison (`alband <https://github.com/alband>`__)
- Jeffrey Wan (`soulitzer <https://github.com/soulitzer>`__)
- Alban Desmaison (`alband <https://github.com/alband>`__)
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
- (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)

Compilers (JIT / TorchScript / FX / TorchDynamo)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
TorchDynamo
~~~~~~~~~~~

- Animesh Jain (`anijain2305 <https://github.com/anijain2305>`__)
- Jason Ansel (`jansel <https://github.com/jansel>`__)
- Edward Yang (`ezyang <https://github.com/ezyang>`__)

TorchInductor
~~~~~~~~~~~~~

- Elias Ellison (`eellison <https://github.com/eellison>`__)
- Michael Suo (`suo <https://github.com/suo>`__)
- Yanan Cao (`gmagogsfm <https://github.com/gmagogsfm>`__)
- James Reed (`jamesr66a <https://github.com/jamesr66a>`__)
- Horace He (`Chillee <https://github.com/Chillee>`__)
- Shunting Zhang (`shunting314 <https://github.com/shunting314>`__)
- Jason Ansel (`jansel <https://github.com/jansel>`__)
- Jiong Gong (`jgong5 <https://github.com/jgong5>`__)

Cudagraph Tree
~~~~~~~~~~~~~~

- Elias Ellison (`eellison <https://github.com/eellison>`__)

PT2 Dispatcher
~~~~~~~~~~~~~~

- Brian Hirsh (`bdhirsh <https://github.com/bdhirsh>`__)
- Richard Zou (`zou3519 <https://github.com/zou3519>`__)
- Horace He (`Chillee <https://github.com/Chillee>`__)
- Edward Yang (`ezyang <https://github.com/ezyang>`__)

PT2 Export (torch.export)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Avik Chaudhuri (`avikchaudhuri <https://github.com/avikchaudhuri>`__)
- Yanan Cao (`gmagogsfm <https://github.com/gmagogsfm>`__)

AOT Inductor (AOTI) & AOTI Runtime
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Bin Bao (`desertfire <https://github.com/desertfire>`__)
- Angela Yi (`angelayi <https://github.com/angelayi>`__)
- Yang Chen (`chenyang78 <https://github.com/chenyang78>`__)

Compilers (JIT / TorchScript / Package / Deploy)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- (emeritus) Elias Ellison (`eellison <https://github.com/eellison>`__)
- (emeritus) Michael Suo (`suo <https://github.com/suo>`__)
- (emeritus) Yanan Cao (`gmagogsfm <https://github.com/gmagogsfm>`__)
- (emeritus) James Reed (`jamesr66a <https://github.com/jamesr66a>`__)
- (emeritus) Jason Ansel (`jansel <https://github.com/jansel>`__)
- (emeritus) Jiong Gong (`jgong5 <https://github.com/jgong5>`__)
- (emeritus) Zach Devito (`zdevito <https://github.com/zdevito>`__)

@@ -79,56 +122,57 @@ Distributions & RNG

Distributed
~~~~~~~~~~~

- Shen Li (`mrshenli <https://github.com/mrshenli>`__)
- Pritam Damania (`pritamdamania87 <https://github.com/pritamdamania87>`__)
- Yanli Zhao (`zhaojuanmao <https://github.com/zhaojuanmao>`__)
- Rohan Varma (`rohan-varma <https://github.com/rohan-varma>`__)
- Wanchao Liang (`wanchaol <https://github.com/wanchaol>`__)
- Junjie Wang (`fduwjj <https://github.com/fduwjj>`__)
- Will Constable (`wconstab <https://github.com/wconstab>`__)
- Howard Huang (`H-Huang <https://github.com/H-Huang>`__)
- Tristan Rice (`d4l3k <https://github.com/d4l3k>`__)
- Alisson Azzolini (`aazzolini <https://github.com/aazzolini>`__)
- Wanchao Liang (`wanchaol <https://github.com/wanchaol>`__)
- Ke Wen (`kwen2501 <https://github.com/kwen2501>`__)
- James Reed (`jamesr66a <https://github.com/jamesr66a>`__)
- Kiuk Chung (`kiukchung <https://github.com/kiukchung>`__)
- Chien-Chin Huang (`fegin <https://github.com/fegin>`__)
- Tristan Rice (`d4l3k <https://github.com/d4l3k>`__)
- (emeritus) Shen Li (`mrshenli <https://github.com/mrshenli>`__)
- (emeritus) Pritam Damania (`pritamdamania87 <https://github.com/pritamdamania87>`__)
- (emeritus) Yanli Zhao (`zhaojuanmao <https://github.com/zhaojuanmao>`__)
- (emeritus) Rohan Varma (`rohan-varma <https://github.com/rohan-varma>`__)
- (emeritus) Junjie Wang (`fduwjj <https://github.com/fduwjj>`__)
- (emeritus) Alisson Azzolini (`aazzolini <https://github.com/aazzolini>`__)
- (emeritus) James Reed (`jamesr66a <https://github.com/jamesr66a>`__)
- (emeritus) Kiuk Chung (`kiukchung <https://github.com/kiukchung>`__)
- (emeritus) Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
- (emeritus) Mingzhe Li (`mingzhe09088 <https://github.com/mingzhe09088>`__)
- (emeritus) Omkar Salpekar (`osalpekar <https://github.com/osalpekar>`__)

Multiprocessing and DataLoaders
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Multiprocessing
~~~~~~~~~~~~~~~

- Simon Wang (`SsnL <https://github.com/SsnL>`__)
- (emeritus) Simon Wang (`SsnL <https://github.com/SsnL>`__)
- (emeritus) Vitaly Fedyunin (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
- (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)

Linear Algebra (torch.linalg)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Mike Ruberry (`mruberry <https://github.com/mruberry>`__)
- Mario Lezcano (`lezcano <https://github.com/lezcano>`__)
- Ivan Yashchuk (`IvanYashchuk <https://github.com/IvanYashchuk>`__)
- (emeritus) Mike Ruberry (`mruberry <https://github.com/mruberry>`__)
- (emeritus) Ivan Yashchuk (`IvanYashchuk <https://github.com/IvanYashchuk>`__)
- (emeritus) Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)

Sparse (torch.sparse)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Pearu Peterson (`pearu <https://github.com/pearu>`__)
- Nikita Vedeneev (`nikitaved <https://github.com/nikitaved>`__)
- Ivan Yashchuk (`IvanYashchuk <https://github.com/IvanYashchuk>`__)
- Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
- Andrew James (`amjames <https://github.com/amjames>`__)
- (emeritus) Pearu Peterson (`pearu <https://github.com/pearu>`__)
- (emeritus) Nikita Vedeneev (`nikitaved <https://github.com/nikitaved>`__)
- (emeritus) Ivan Yashchuk (`IvanYashchuk <https://github.com/IvanYashchuk>`__)
- (emeritus) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
- (emeritus) Andrew James (`amjames <https://github.com/amjames>`__)

NestedTensor (torch.nested)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Alban Desmaison (`albanD <https://github.com/albanD>`__)
- Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
- Driss Guessous (`drisspg <https://github.com/drisspg>`__)
- Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- Mikayla Gawarecki (`mikaylagawarecki <https://github.com/mikaylagawarecki>`__)
- Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
- Alban Desmaison (`albanD <https://github.com/albanD>`__)
- (emeritus) Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)

MaskedTensor (torch.masked)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -139,15 +183,15 @@ MaskedTensor (torch.masked)
Fast Fourier Transform (torch.fft)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Mike Ruberry (`mruberry <https://github.com/mruberry>`__)
- Peter Bell (`peterbell10 <https://github.com/peterbell10>`__)
- (emeritus) Mike Ruberry (`mruberry <https://github.com/mruberry>`__)
- (emeritus) Peter Bell (`peterbell10 <https://github.com/peterbell10>`__)

CPU Performance (Torch Inductor / MKLDNN)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MKLDNN
~~~~~~

- Xiaobing Zhang (`XiaobingSuper <https://github.com/XiaobingSuper>`__)
- Mingfei Ma (`mingfeima <https://github.com/mingfeima>`__)
- Jiong Gong (`jgong5 <https://github.com/jgong5>`__)
- Xiaobing Zhang (`XiaobingSuper <https://github.com/XiaobingSuper>`__)
- (emeritus) Xiaoqiang Zheng (`zheng-xq <https://github.com/zheng-xq>`__)
- (emeritus) Sam Gross (`colesbury <https://github.com/colesbury>`__)
- (emeritus) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)

@@ -157,31 +201,22 @@ CPU Performance (Torch Inductor / MKLDNN)
- (emeritus) Vitaly Fedyunin (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
- (emeritus) Jianhui Li (`Jianhui-Li <https://github.com/Jianhui-Li>`__)

GPU Performance (Torch Inductor / Triton / CUDA)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CUDA
~~~~

- Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
- Piotr Bialecki (`ptrblck <https://github.com/ptrblck>`__)
- Christian Sarofeen (`csarofeen <https://github.com/csarofeen>`__)
- Andrew Tulloch (`ajtulloch <https://github.com/ajtulloch>`__)
- (emeritus) Andrew Tulloch (`ajtulloch <https://github.com/ajtulloch>`__)
- (emeritus) Xiaoqiang Zheng (`zheng-xq <https://github.com/zheng-xq>`__)

NVFuser
~~~~~~~

- Christian Sarofeen (`csarofeen <https://github.com/csarofeen>`__)
- Alex Jann (`jjsjann123 <https://github.com/jjsjann123>`__)
- Piotr Bialecki (`ptrblck <https://github.com/ptrblck>`__)
- Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)

AMD/ROCm/HIP
~~~~~~~~~~~~

- Peng Sun (`sunway513 <https://github.com/sunway513>`__)
- Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
- Jeff Daily (`jeffdaily <https://github.com/jeffdaily>`__)
- Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
- (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)

Build + CI

@@ -190,11 +225,11 @@ Build + CI
- Nikita Shulga (`malfet <https://github.com/malfet>`__)
- Eli Uriegas (`seemethere <https://github.com/seemethere>`__)
- Alban Desmaison (`alband <https://github.com/alband>`__)
- Mikey Dagitses (`dagitses <https://github.com/dagitses>`__)
- Omkar Salpekar (`osalpekar <https://github.com/osalpekar>`__)
- Zain Rizvi (`ZainRizvi <https://github.com/ZainRizvi>`__)
- Nirav Mehta (`mehtanirav <https://github.com/mehtanirav>`__)
- Andrey Talman (`atalman <https://github.com/atalman>`__)
- Zain Rizvi (`ZainRizvi <https://github.com/ZainRizvi>`__)
- (emeritus) Mikey Dagitses (`dagitses <https://github.com/dagitses>`__)
- (emeritus) Omkar Salpekar (`osalpekar <https://github.com/osalpekar>`__)
- (emeritus) Nirav Mehta (`mehtanirav <https://github.com/mehtanirav>`__)
- (emeritus) Zhuojie Zhou (`zhouzhuojie <https://github.com/zhouzhuojie>`__)
- (emeritus) Edward Yang (`ezyang <https://github.com/ezyang>`__)
- (emeritus) Karl Ostmo (`kostmo <https://github.com/kostmo>`__)

@@ -202,11 +237,8 @@ Build + CI
Performance Tools
~~~~~~~~~~~~~~~~~

- Adnan Aziz (`adnanaziz <https://github.com/adnanaziz>`__)
- CK Luk (`ckluk <https://github.com/ckluk>`__)
- Taylor Robie (`robieta <https://github.com/robieta>`__)
- Xu Zhao (`xuzhao9 <https://github.com/xuzhao9>`__)
- Geeta Chauhan (`chauhang <https://github.com/chauhang>`__)
- (emeritus) Victor Bittorf (`bitfort <https://github.com/bitfort>`__)
- (emeritus) Gisle Dankel (`gdankel <https://github.com/gdankel>`__)
- (emeritus) Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)

@@ -215,7 +247,7 @@ Performance Tools
C++ API
~~~~~~~

- Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- (emeritus) Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
- (emeritus) Will Feng (`yf225 <https://github.com/yf225>`__)

C10 utils and operator dispatch

@@ -223,7 +255,7 @@ C10 utils and operator dispatch

- Brian Hirsh (`bdhirsh <https://github.com/bdhirsh>`__)
- Edward Yang (`ezyang <https://github.com/ezyang>`__)
- Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
- (emeritus) Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
- (emeritus) Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)

ONNX exporter

@@ -241,19 +273,20 @@ ONNX exporter
- (emeritus) Negin Raoof (`neginraoof <https://github.com/neginraoof>`__)
- (emeritus) Spandan Tiwari (`spandantiwari <https://github.com/spandantiwari>`__)

Mobile / Edge
~~~~~~~~~~~~~
- David Reiss (`dreiss <https://github.com/dreiss>`__)
- Raziel Guevara (`raziel <https://github.com/raziel>`__)
- Linbin Yu (`linbinyu <https://github.com/linbinyu>`__)
- Ivan Kobzarev (`IvanKobzarev <https://github.com/IvanKobzarev>`__)
- Tao Xu (`xta0 <https://github.com/xta0>`__)
LiteInterpreter
~~~~~~~~~~~~~~~
- (emeritus) David Reiss (`dreiss <https://github.com/dreiss>`__)
- (emeritus) Raziel Guevara (`raziel <https://github.com/raziel>`__)
- (emeritus) Linbin Yu (`linbinyu <https://github.com/linbinyu>`__)
- (emeritus) Ivan Kobzarev (`IvanKobzarev <https://github.com/IvanKobzarev>`__)
- (emeritus) Tao Xu (`xta0 <https://github.com/xta0>`__)

Model Compression & Optimization
Quantization (torch/ao)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Mark Saroufim (`msaroufim <https://github.com/msaroufim>`__)
- Vasiliy Kuznetsov (`vkuzo <https://github.com/vkuzo>`__)
- Jerry Zhang (`jerryzh168 <https://github.com/jerryzh168>`__)
- Supriya Rao (`supriyar <https://github.com/supriyar>`__)
- (emeritus) Zafar Takhirov (`z-a-f <https://github.com/z-a-f>`__)
- (emeritus) Raghuraman Krishnamoorthi (`raghuramank100 <https://github.com/raghuramank100>`__)

@@ -265,18 +298,18 @@ Windows
- (emeritus) Teng Gao (`gaoteng-git <https://github.com/gaoteng-git>`__)
- (emeritus) Peter Johnson (`peterjc123 <https://github.com/peterjc123>`__)

Apple M1/MPS
~~~~~~~~~~~~
Apple M1/MPS/Metal
~~~~~~~~~~~~~~~~~~~~

- Kulin Seth (`kulinseth <https://github.com/kulinseth>`__)
- Alban Desmaison (`alband <https://github.com/alband>`__)
- Nikita Shulga (`malfet <https://github.com/malfet>`__)
- Kulin Seth (`kulinseth <https://github.com/kulinseth>`__)
- Ramin Azarmehr (`razarmehr <https://github.com/razarmehr>`__)
- (emeritus) Ramin Azarmehr (`razarmehr <https://github.com/razarmehr>`__)

PowerPC
~~~~~~~

- Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)
- (emeritus) Alfredo Mendoza (`avmgithub <https://github.com/avmgithub>`__)

AArch64 CPU
~~~~~~~~~~~~

@@ -306,26 +339,29 @@ XLA
TorchServe
~~~~~~~~~~

- Geeta Chauhan (`chauhang <https://github.com/chauhang>`__)
- Manoj Rao (`mycpuorg <https://github.com/mycpuorg>`__)
- Vamshi Dantu (`vdantu <https://github.com/vdantu>`__)
- Dhanasekar Karuppasamy (`dhanainme <https://github.com/dhanainme>`__)
- Li Ning (`lxning <https://github.com/lxning>`__)
- Ankith Gunapal (`agunapal <https://github.com/agunapal>`__)
- Hamid Shojanazeri (`HamidShojanazeri <https://github.com/HamidShojanazeri>`__)
- (emeritus) Mark Saroufim (`msaroufIm <https://github.com/msaroufIm>`__)
- (emeritus) Manoj Rao (`mycpuorg <https://github.com/mycpuorg>`__)
- (emeritus) Vamshi Dantu (`vdantu <https://github.com/vdantu>`__)
- (emeritus) Dhanasekar Karuppasamy (`dhanainme <https://github.com/dhanainme>`__)

TorchVision
~~~~~~~~~~~

- Francisco Massa (`fmassa <https://github.com/fmassa>`__)
- Vasilis Vryniotis (`datumbox <https://github.com/datumbox>`__)
- Nicolas Hug (`NicolasHug <https://github.com/NicolasHug>`__)
- Yosua Michael Maranatha (`YosuaMichael <https://github.com/YosuaMichael>`__)
- Joao Gomes (`jdsgomes <https://github.com/jdsgomes>`__)
- Philip Meier (`pmeier <https://github.com/pmeier>`__)
- Victor Fomin (`vfdev-5 <https://github.com/vfdev-5>`__)
- (emeritus) Francisco Massa (`fmassa <https://github.com/fmassa>`__)
- (emeritus) Vasilis Vryniotis (`datumbox <https://github.com/datumbox>`__)
- (emeritus) Yosua Michael Maranatha (`YosuaMichael <https://github.com/YosuaMichael>`__)
- (emeritus) Joao Gomes (`jdsgomes <https://github.com/jdsgomes>`__)

TorchText
~~~~~~~~~

- Nayef Ahmed (`Nayef211 <https://github.com/Nayef211>`__)
- (emeritus) Nayef Ahmed (`Nayef211 <https://github.com/Nayef211>`__)
- (emeritus) Parmeet Singh Bhatia (`parmeet <https://github.com/parmeet>`__)
- (emeritus) Guanheng George Zhang (`zhangguanheng66 <https://github.com/zhangguanheng66>`__)
|
||||
- (emeritus) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
|
||||
@ -334,7 +370,7 @@ TorchAudio
|
||||
~~~~~~~~~~
|
||||
|
||||
- Moto Hira (`mthrok <https://github.com/mthrok>`__)
|
||||
- Jeff Hwang (`hwangjeff <https://github.com/hwangjeff>`__)
|
||||
- (emeritus) Jeff Hwang (`hwangjeff <https://github.com/hwangjeff>`__)
|
||||
- (emeritus) Caroline Chen (`carolineechen <https://github.com/carolineechen>`__)
|
||||
- (emeritus) Xiaohui Zhang (`xiaohui-zhang <https://github.com/xiaohui-zhang>`__)
|
||||
- (emeritus) Zhaoheng Ni (`nateanl <https://github.com/nateanl>`__)
|
||||
@ -344,17 +380,53 @@ TorchAudio
|
||||
TorchRec
|
||||
~~~~~~~~
|
||||
|
||||
- Dmytro Ivchenko (`divchenko <https://github.com/divchenko>`__)
|
||||
- Colin Taylor (`colin2328 <https://github.com/colin2328>`__)
|
||||
- Paul Zhang (`PaulZhang12 <https://github.com/PaulZhang12>`__)
|
||||
- (emeritus) Dmytro Ivchenko (`divchenko <https://github.com/divchenko>`__)
|
||||
|
||||
TorchX
|
||||
~~~~~~
|
||||
|
||||
- Tristan Rice (`d4l3k <https://github.com/d4l3k>`__)
|
||||
- Kiuk Chung (`kiukchung <https://github.com/kiukchung>`__)
|
||||
- (emeritus) Tristan Rice (`d4l3k <https://github.com/d4l3k>`__)
|
||||
- (emeritus) Kiuk Chung (`kiukchung <https://github.com/kiukchung>`__)
|
||||
|
||||
TorchData / TorchArrow
|
||||
TorchData
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Wenlei Xie (`wenleix <https://github.com/wenleix>`__)
|
||||
- Andrew Ho (`andrewkho <https://github.com/andrewkho>`__)
|
||||
- Divyansh Khanna (`divyanshk <https://github.com/divyanshk>`__)
|
||||
|
||||
TorchArrow
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- (emeritus) Wenlei Xie (`wenleix <https://github.com/wenleix>`__)
|
||||
- (emeritus) Vitaly Fedyunin (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
|
||||
|
||||
ExecuTorch (Edge, Mobile)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Mergen Nachin (`mergennachin <https://github.com/mergennachin>`__)
|
||||
- Kimish Patel (`kimishpatel <https://github.com/kimishpatel>`__)
|
||||
- Dave Bort (`dbort <https://github.com/dbort>`__)
|
||||
- Martin Yuan (`iseeyuan <https://github.com/iseeyuan>`__)
|
||||
|
||||
TorchTune
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Kartikay Khandelwal (`kartikayk <https://github.com/kartikayk>`__)
|
||||
- Evan Smothers (`ebsmothers <https://github.com/ebsmothers>`__)
|
||||
- Joe Cummings (`joecummings <https://github.com/joecummings>`__)
|
||||
|
||||
TorchChat
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Jack Khuu (`Jack-Khuu <https://github.com/Jack-Khuu>`__)
|
||||
- Jesse White (`byjlw <https://github.com/byjlw>`__)
|
||||
- (emeritus) Michael Gschwind (`mikekgfb <https://github.com/mikekgfb>`__)
|
||||
|
||||
TorchCodec
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Nicolas Hug (`nicolashug <https://github.com/nicolashug>`__)
|
||||
- Ahmad Sharif (`ahmadsharif1 <https://github.com/ahmadsharif1>`__)
|
||||
- Scott Schneider (`scotts <https://github.com/scotts>`__)
|
||||
|
||||
@ -9,7 +9,7 @@ Ordinarily, "automatic mixed precision training" means training with
|
||||
:class:`torch.autocast` and :class:`torch.amp.GradScaler` together.
|
||||
|
||||
Instances of :class:`torch.autocast` enable autocasting for chosen regions.
|
||||
Autocasting automatically chooses the precision for GPU operations to improve performance
|
||||
Autocasting automatically chooses the precision for operations to improve performance
|
||||
while maintaining accuracy.
|
||||
|
||||
Instances of :class:`torch.amp.GradScaler` help perform the steps of
|
||||
|
||||
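A minimal, self-contained sketch of the combined pattern (the toy model, data, and hyperparameters below are illustrative assumptions, not part of the original text; a CUDA device is assumed):

.. code-block:: python

    import torch

    model = torch.nn.Linear(16, 4).cuda()          # toy model, purely illustrative
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = torch.nn.MSELoss()
    scaler = torch.amp.GradScaler("cuda")

    for _ in range(3):                              # a few toy iterations
        inputs = torch.randn(8, 16, device="cuda")
        targets = torch.randn(8, 4, device="cuda")
        optimizer.zero_grad()
        # run the forward pass in mixed precision
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            loss = loss_fn(model(inputs), targets)
        scaler.scale(loss).backward()  # scale the loss to avoid gradient underflow
        scaler.step(optimizer)         # unscales gradients, then calls optimizer.step()
        scaler.update()                # adjusts the scale factor for the next iteration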
@ -1,101 +1,84 @@
|
||||
PyTorch 2.4: Getting Started on Intel GPU
|
||||
=========================================
|
||||
Getting Started on Intel GPU
|
||||
============================
|
||||
|
||||
The support for Intel GPUs is released alongside PyTorch v2.4.
|
||||
|
||||
This release only supports build from source for Intel GPUs.
|
||||
|
||||
Hardware Prerequisites
|
||||
----------------------
|
||||
Hardware Prerequisite
|
||||
---------------------
|
||||
|
||||
.. list-table::
|
||||
:widths: 50 50
|
||||
:header-rows: 1
|
||||
|
||||
* - Supported Hardware
|
||||
- Intel® Data Center GPU Max Series
|
||||
* - Supported OS
|
||||
* - Validated Hardware
|
||||
- Supported OS
|
||||
* - Intel® Data Center GPU Max Series
|
||||
- Linux
|
||||
* - Intel Client GPU
|
||||
- Windows/Linux
|
||||
|
||||
Intel GPU support (Beta) is available in PyTorch* 2.5 for Intel® Data Center GPU Max Series and Intel® Client GPUs on both Linux and Windows. This brings Intel GPUs and the SYCL* software stack into the official PyTorch stack, providing a consistent user experience and covering more AI application scenarios.
|
||||
|
||||
Software Prerequisite
|
||||
---------------------
|
||||
|
||||
Visit `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_ for more detailed information regarding:
|
||||
|
||||
#. Intel GPU driver installation
|
||||
#. Intel support package installation
|
||||
#. Environment setup
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Binaries
|
||||
^^^^^^^^
|
||||
|
||||
Platform Linux
|
||||
""""""""""""""
|
||||
|
||||
|
||||
PyTorch for Intel GPUs is compatible with Intel® Data Center GPU Max Series and, as of release 2.4, supports Linux only.
|
||||
Now that all the required packages are installed and the environment is activated, use the following commands to install ``pytorch``, ``torchvision``, and ``torchaudio`` on Linux.
|
||||
|
||||
Software Prerequisites
|
||||
----------------------
|
||||
|
||||
As a prerequisite, install the driver and required packages by following the `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_.
|
||||
|
||||
Set up Environment
|
||||
------------------
|
||||
|
||||
Before you begin, you need to set up the environment. This can be done by sourcing the ``setvars.sh`` script provided by the ``intel-for-pytorch-gpu-dev`` and ``intel-pti-dev`` packages.
|
||||
For release wheels
|
||||
|
||||
.. code-block::
|
||||
|
||||
source ${ONEAPI_ROOT}/setvars.sh
|
||||
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
|
||||
|
||||
.. note::
|
||||
``ONEAPI_ROOT`` is the folder where you installed the ``intel-for-pytorch-gpu-dev`` and ``intel-pti-dev`` packages. Typically, it is located at ``/opt/intel/oneapi/`` or ``~/intel/oneapi/``.
|
||||
|
||||
Build from source
|
||||
-----------------
|
||||
|
||||
Now that all the required packages are installed and the environment is activated, use the following commands to install ``pytorch``, ``torchvision``, and ``torchaudio`` by building from source. For more details, refer to the official guides in `PyTorch from source <https://github.com/pytorch/pytorch?tab=readme-ov-file#intel-gpu-support>`_, `Vision from source <https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation>`_ and `Audio from source <https://pytorch.org/audio/main/build.linux.html>`_.
|
||||
For nightly wheels
|
||||
|
||||
.. code-block::
|
||||
|
||||
# Get PyTorch Source Code
|
||||
git clone --recursive https://github.com/pytorch/pytorch
|
||||
cd pytorch
|
||||
git checkout main # or checkout the specific release version >= v2.4
|
||||
git submodule sync
|
||||
git submodule update --init --recursive
|
||||
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu
|
||||
|
||||
# Get required packages for compilation
|
||||
conda install cmake ninja
|
||||
pip install -r requirements.txt
|
||||
Platform Windows
|
||||
""""""""""""""""
|
||||
|
||||
# PyTorch for Intel GPUs only supports the Linux platform for now.
|
||||
# Install the required packages for pytorch compilation.
|
||||
conda install intel::mkl-static intel::mkl-include
|
||||
Now that all the required packages are installed and the environment is activated, use the following commands to install ``pytorch`` on Windows and build ``torchvision`` and ``torchaudio`` from source.
|
||||
|
||||
# (optional) If using torch.compile with inductor/triton, install the matching version of triton
|
||||
# Run from the pytorch directory after cloning
|
||||
# For Intel GPU support, please explicitly `export USE_XPU=1` before running command.
|
||||
USE_XPU=1 make triton
|
||||
For release wheels
|
||||
|
||||
# If you would like to compile PyTorch with new C++ ABI enabled, then first run this command:
|
||||
export _GLIBCXX_USE_CXX11_ABI=1
|
||||
.. code-block::
|
||||
|
||||
# pytorch build from source
|
||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||
python setup.py develop
|
||||
cd ..
|
||||
pip3 install torch --index-url https://download.pytorch.org/whl/xpu
|
||||
|
||||
# (optional) If using torchvision.
|
||||
# Get torchvision Code
|
||||
git clone https://github.com/pytorch/vision.git
|
||||
cd vision
|
||||
git checkout main # or specific version
|
||||
python setup.py develop
|
||||
cd ..
|
||||
For nightly wheels
|
||||
|
||||
# (optional) If using torchaudio.
|
||||
# Get torchaudio Code
|
||||
git clone https://github.com/pytorch/audio.git
|
||||
cd audio
|
||||
pip install -r requirements.txt
|
||||
git checkout main # or specific version
|
||||
git submodule sync
|
||||
git submodule update --init --recursive
|
||||
python setup.py develop
|
||||
cd ..
|
||||
.. code-block::
|
||||
|
||||
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/xpu
|
||||
|
||||
From Source
|
||||
^^^^^^^^^^^
|
||||
|
||||
To build ``torch`` from source, refer to `PyTorch Installation Build from source <https://github.com/pytorch/pytorch?tab=readme-ov-file#from-source>`_.
|
||||
|
||||
To build ``torchvision`` from source, refer to `Torchvision Installation Build from source <https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation>`_.
|
||||
|
||||
To build ``torchaudio`` from source, refer to `Torchaudio Installation Build from source <https://github.com/pytorch/audio/blob/main/CONTRIBUTING.md#building-torchaudio-from-source>`_.
|
||||
|
||||
Check availability for Intel GPU
|
||||
--------------------------------
|
||||
|
||||
.. note::
|
||||
Make sure the environment is properly set up by following `Environment Set up <#set-up-environment>`_ before running the code.
|
||||
|
||||
To check if your Intel GPU is available, you would typically use the following code:
|
||||
|
||||
.. code-block::
|
||||
@ -103,7 +86,11 @@ To check if your Intel GPU is available, you would typically use the following c
|
||||
import torch
|
||||
torch.xpu.is_available() # torch.xpu is the API for Intel GPU support
|
||||
|
||||
If the output is ``False``, ensure that you have an Intel GPU in your system and have correctly followed the `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_. Then, check that the PyTorch compilation finished correctly.
|
||||
If the output is ``False``, double-check the steps below.
|
||||
|
||||
#. Intel GPU driver installation
|
||||
#. Intel support package installation
|
||||
#. Environment setup
|
||||
|
||||
Minimum Code Change
|
||||
-------------------
|
||||
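As a minimal sketch of the change this section refers to (the tensor below is illustrative and assumes the XPU-enabled installation set up above), CUDA-style code typically only needs to target the ``xpu`` device instead:

.. code-block::

    import torch

    # CUDA code would read: tensor = torch.tensor([1.0, 2.0]).to("cuda")
    # Code for Intel GPU:
    tensor = torch.tensor([1.0, 2.0]).to("xpu")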
@ -123,7 +110,6 @@ The following points outline the support and limitations for PyTorch with Intel
|
||||
#. Both training and inference workflows are supported.
|
||||
#. Both eager mode and ``torch.compile`` are supported.
|
||||
#. Data types such as FP32, BF16, FP16, and Automatic Mixed Precision (AMP) are all supported.
|
||||
#. Models that depend on third-party components will not be supported until PyTorch v2.5 or later.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@ -148,10 +134,8 @@ Inference with FP32
|
||||
model.eval()
|
||||
data = torch.rand(1, 3, 224, 224)
|
||||
|
||||
######## code changes #######
|
||||
model = model.to("xpu")
|
||||
data = data.to("xpu")
|
||||
######## code changes #######
|
||||
|
||||
with torch.no_grad():
|
||||
model(data)
|
||||
@ -170,18 +154,14 @@ Inference with AMP
|
||||
model.eval()
|
||||
data = torch.rand(1, 3, 224, 224)
|
||||
|
||||
#################### code changes #################
|
||||
model = model.to("xpu")
|
||||
data = data.to("xpu")
|
||||
#################### code changes #################
|
||||
|
||||
with torch.no_grad():
|
||||
d = torch.rand(1, 3, 224, 224)
|
||||
############################# code changes #####################
|
||||
d = d.to("xpu")
|
||||
# set dtype=torch.bfloat16 for BF16
|
||||
with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=True):
|
||||
############################# code changes #####################
|
||||
model(data)
|
||||
|
||||
print("Execution finished")
|
||||
@ -193,21 +173,32 @@ Inference with ``torch.compile``
|
||||
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
import time
|
||||
|
||||
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
|
||||
model.eval()
|
||||
data = torch.rand(1, 3, 224, 224)
|
||||
ITERS = 10
|
||||
|
||||
######## code changes #######
|
||||
model = model.to("xpu")
|
||||
data = data.to("xpu")
|
||||
######## code changes #######
|
||||
|
||||
model = torch.compile(model)
|
||||
for i in range(ITERS):
|
||||
with torch.no_grad():
|
||||
model(data)
|
||||
for i in range(ITERS):
|
||||
start = time.time()
|
||||
with torch.no_grad():
|
||||
model(data)
|
||||
torch.xpu.synchronize()
|
||||
end = time.time()
|
||||
print(f"Inference time before torch.compile for iteration {i}: {(end-start)*1000} ms")
|
||||
|
||||
model = torch.compile(model)
|
||||
for i in range(ITERS):
|
||||
start = time.time()
|
||||
with torch.no_grad():
|
||||
model(data)
|
||||
torch.xpu.synchronize()
|
||||
end = time.time()
|
||||
print(f"Inference time after torch.compile for iteration {i}: {(end-start)*1000} ms")
|
||||
|
||||
print("Execution finished")
|
||||
|
||||
@ -242,27 +233,27 @@ Train with FP32
|
||||
download=DOWNLOAD,
|
||||
)
|
||||
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
|
||||
train_len = len(train_loader)
|
||||
|
||||
model = torchvision.models.resnet50()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
|
||||
model.train()
|
||||
######################## code changes #######################
|
||||
model = model.to("xpu")
|
||||
criterion = criterion.to("xpu")
|
||||
######################## code changes #######################
|
||||
|
||||
print(f"Initiating training")
|
||||
for batch_idx, (data, target) in enumerate(train_loader):
|
||||
########## code changes ##########
|
||||
data = data.to("xpu")
|
||||
target = target.to("xpu")
|
||||
########## code changes ##########
|
||||
optimizer.zero_grad()
|
||||
output = model(data)
|
||||
loss = criterion(output, target)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
print(batch_idx)
|
||||
if (batch_idx + 1) % 10 == 0:
|
||||
iteration_loss = loss.item()
|
||||
print(f"Iteration [{batch_idx+1}/{train_len}], Loss: {iteration_loss:.4f}")
|
||||
torch.save(
|
||||
{
|
||||
"model_state_dict": model.state_dict(),
|
||||
@ -301,6 +292,7 @@ Train with AMP
|
||||
download=DOWNLOAD,
|
||||
)
|
||||
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
|
||||
train_len = len(train_loader)
|
||||
|
||||
model = torchvision.models.resnet50()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
@ -308,16 +300,13 @@ Train with AMP
|
||||
scaler = torch.amp.GradScaler(enabled=use_amp)
|
||||
|
||||
model.train()
|
||||
######################## code changes #######################
|
||||
model = model.to("xpu")
|
||||
criterion = criterion.to("xpu")
|
||||
######################## code changes #######################
|
||||
|
||||
print(f"Initiating training")
|
||||
for batch_idx, (data, target) in enumerate(train_loader):
|
||||
########## code changes ##########
|
||||
data = data.to("xpu")
|
||||
target = target.to("xpu")
|
||||
########## code changes ##########
|
||||
# set dtype=torch.bfloat16 for BF16
|
||||
with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=use_amp):
|
||||
output = model(data)
|
||||
@ -326,7 +315,9 @@ Train with AMP
|
||||
scaler.step(optimizer)
|
||||
scaler.update()
|
||||
optimizer.zero_grad()
|
||||
print(batch_idx)
|
||||
if (batch_idx + 1) % 10 == 0:
|
||||
iteration_loss = loss.item()
|
||||
print(f"Iteration [{batch_idx+1}/{train_len}], Loss: {iteration_loss:.4f}")
|
||||
|
||||
torch.save(
|
||||
{
|
||||
@ -337,3 +328,61 @@ Train with AMP
|
||||
)
|
||||
|
||||
print("Execution finished")
|
||||
|
||||
Train with ``torch.compile``
|
||||
""""""""""""""""""""""""""""
|
||||
|
||||
.. code-block::
|
||||
|
||||
import torch
|
||||
import torchvision
|
||||
|
||||
LR = 0.001
|
||||
DOWNLOAD = True
|
||||
DATA = "datasets/cifar10/"
|
||||
|
||||
transform = torchvision.transforms.Compose(
|
||||
[
|
||||
torchvision.transforms.Resize((224, 224)),
|
||||
torchvision.transforms.ToTensor(),
|
||||
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
|
||||
]
|
||||
)
|
||||
train_dataset = torchvision.datasets.CIFAR10(
|
||||
root=DATA,
|
||||
train=True,
|
||||
transform=transform,
|
||||
download=DOWNLOAD,
|
||||
)
|
||||
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
|
||||
train_len = len(train_loader)
|
||||
|
||||
model = torchvision.models.resnet50()
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
|
||||
model.train()
|
||||
model = model.to("xpu")
|
||||
criterion = criterion.to("xpu")
|
||||
model = torch.compile(model)
|
||||
|
||||
print(f"Initiating training with torch compile")
|
||||
for batch_idx, (data, target) in enumerate(train_loader):
|
||||
data = data.to("xpu")
|
||||
target = target.to("xpu")
|
||||
optimizer.zero_grad()
|
||||
output = model(data)
|
||||
loss = criterion(output, target)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
if (batch_idx + 1) % 10 == 0:
|
||||
iteration_loss = loss.item()
|
||||
print(f"Iteration [{batch_idx+1}/{train_len}], Loss: {iteration_loss:.4f}")
|
||||
torch.save(
|
||||
{
|
||||
"model_state_dict": model.state_dict(),
|
||||
"optimizer_state_dict": optimizer.state_dict(),
|
||||
},
|
||||
"checkpoint.pth",
|
||||
)
|
||||
|
||||
print("Execution finished")
|
||||
|
||||
@ -110,6 +110,13 @@ reduced-precision reductions are problematic, they can be turned off with
|
||||
|
||||
For more information see :ref:`allow_fp16_reduced_precision_reduction<fp16reducedprecision>` and :ref:`allow_bf16_reduced_precision_reduction<bf16reducedprecision>`
|
||||
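As a brief illustration (a sketch, not part of the original text), these flags can be toggled directly from Python:

.. code-block:: python

    import torch

    # keep full-precision (FP32) accumulation for FP16/BF16 GEMMs
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False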
|
||||
Reduced Precision Reduction for FP16 and BF16 in Scaled Dot Product Attention (SDPA)
|
||||
------------------------------------------------------------------------------------
|
||||
A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the use of low-precision intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. Computations are performed in FP32/TF32, and the final FP32 results are then downcast back to FP16/BF16. This improves the numerical accuracy of the final output for the math backend with FP16/BF16 inputs, but increases memory usage and may cause performance regressions in the math backend as computations shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul.
|
||||
|
||||
For scenarios where reduced-precision reductions are preferred for speed, they can be enabled with the following setting:
|
||||
``torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)``
|
||||
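As a hedged sketch of how this setting can be combined with the math backend (the tensor shapes and the explicit backend selection below are illustrative assumptions, not part of the original text):

.. code-block:: python

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)

    # toy query/key/value tensors in FP16
    q, k, v = (torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
               for _ in range(3))
    with sdpa_kernel(SDPBackend.MATH):  # force the math backend for illustration
        out = F.scaled_dot_product_attention(q, k, v)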
|
||||
.. _fp16_on_mi200:
|
||||
|
||||
Reduced Precision FP16 and BF16 GEMMs and Convolutions on AMD Instinct MI200 devices
|
||||
|
||||
@ -13,7 +13,35 @@ The exported model can be consumed by any of the many
|
||||
`runtimes that support ONNX <https://onnx.ai/supported-tools.html#deployModel>`_, including
|
||||
Microsoft's `ONNX Runtime <https://www.onnxruntime.ai>`_.
|
||||
|
||||
**There are two flavors of ONNX exporter API that you can use, as listed below:**
|
||||
**There are two flavors of ONNX exporter API that you can use, as listed below.**
|
||||
Both can be called through the function :func:`torch.onnx.export`.
|
||||
The next example shows how to export a simple model.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
|
||||
class MyModel(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super(MyModel, self).__init__()
|
||||
self.conv1 = torch.nn.Conv2d(1, 128, 5)
|
||||
|
||||
def forward(self, x):
|
||||
return torch.relu(self.conv1(x))
|
||||
|
||||
input_tensor = torch.rand((1, 1, 128, 128), dtype=torch.float32)
|
||||
|
||||
model = MyModel()
|
||||
|
||||
torch.onnx.export(
|
||||
model, # model to export
|
||||
(input_tensor,), # inputs of the model,
|
||||
"my_model.onnx", # filename of the ONNX model
|
||||
input_names=["input"], # Rename inputs for the ONNX model
|
||||
dynamo=True # True or False to select the exporter to use
|
||||
)
|
||||
|
||||
The next sections introduce the two versions of the exporter.
|
||||
|
||||
TorchDynamo-based ONNX Exporter
|
||||
-------------------------------
|
||||
|
||||
@ -44,6 +44,9 @@ They can be installed through `pip <https://pypi.org/project/pip/>`_:
|
||||
|
||||
pip install --upgrade onnx onnxscript
|
||||
|
||||
`onnxruntime <https://onnxruntime.ai>`_ can then be used to execute the model
|
||||
on a large variety of processors.
|
||||
|
||||
A simple example
|
||||
----------------
|
||||
|
||||
@ -74,9 +77,9 @@ See below a demonstration of exporter API in action with a simple Multilayer Per
|
||||
|
||||
model = MLPModel()
|
||||
tensor_x = torch.rand((97, 8), dtype=torch.float32)
|
||||
onnx_program = torch.onnx.dynamo_export(model, tensor_x)
|
||||
onnx_program = torch.onnx.export(model, (tensor_x,), dynamo=True)
|
||||
|
||||
As the code above shows, all you need is to provide :func:`torch.onnx.dynamo_export` with an instance of the model and its input.
|
||||
As the code above shows, all you need is to provide :func:`torch.onnx.export` with an instance of the model and its input.
|
||||
The exporter will then return an instance of :class:`torch.onnx.ONNXProgram` that contains the exported ONNX graph along with extra information.
|
||||
|
||||
The in-memory model available through ``onnx_program.model_proto`` is an ``onnx.ModelProto`` object in compliance with the `ONNX IR spec <https://github.com/onnx/onnx/blob/main/docs/IR.md>`_.
|
||||
@ -86,6 +89,17 @@ The ONNX model may then be serialized into a `Protobuf file <https://protobuf.de
|
||||
|
||||
onnx_program.save("mlp.onnx")
|
||||
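As a brief, hedged illustration of running the saved model (the CPU provider and random input below are assumptions, not part of the original text):

.. code-block:: python

    import numpy as np
    import onnxruntime

    session = onnxruntime.InferenceSession("mlp.onnx", providers=["CPUExecutionProvider"])
    input_name = session.get_inputs()[0].name  # query the input name rather than assume it
    outputs = session.run(None, {input_name: np.random.rand(97, 8).astype(np.float32)})
    print(outputs[0].shape)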
|
||||
Two functions exist to export the model to ONNX based on the TorchDynamo engine.
|
||||
They slightly differ in the way they produce the :class:`ExportedProgram`.
|
||||
:func:`torch.onnx.dynamo_export` was introduced with PyTorch 2.1 and
|
||||
:func:`torch.onnx.export` was extended with PyTorch 2.5 to easily switch
|
||||
from TorchScript to TorchDynamo. To call the former function,
|
||||
the last line of the previous example can be replaced by the following one.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
onnx_program = torch.onnx.dynamo_export(model, tensor_x)
|
||||
|
||||
Inspecting the ONNX model using GUI
|
||||
-----------------------------------
|
||||
|
||||
@ -109,9 +123,14 @@ By expanding it, the function body is shown.
|
||||
|
||||
The function body is a sequence of ONNX operators or other functions.
|
||||
|
||||
Diagnosing issues with SARIF
|
||||
----------------------------
|
||||
When the conversion fails
|
||||
-------------------------
|
||||
|
||||
Function :func:`torch.onnx.export` should be called a second time with the
|
||||
parameter ``report=True``. A markdown report is generated to help the user
|
||||
to resolve the issue.
|
||||
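For instance, re-running the export from the example above with reporting enabled might look like the following sketch (based on the parameter named here; the exact report contents depend on the failure):

.. code-block:: python

    torch.onnx.export(
        model,
        (tensor_x,),
        dynamo=True,
        report=True,  # the export may still fail, but a markdown report is written to help diagnose it
    )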
|
||||
Function :func:`torch.onnx.dynamo_export` generates a report using 'SARIF' format.
|
||||
ONNX diagnostics goes beyond regular logs through the adoption of
|
||||
`Static Analysis Results Interchange Format (aka SARIF) <https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html>`__
|
||||
to help users debug and improve their model using a GUI, such as
|
||||
|
||||
@ -25,7 +25,7 @@ Pod::Spec.new do |s|
|
||||
s.user_target_xcconfig = {
|
||||
'HEADER_SEARCH_PATHS' => '$(inherited) "$(PODS_ROOT)/LibTorch-Lite-Nightly/install/include/"',
|
||||
'OTHER_LDFLAGS' => '-force_load "$(PODS_ROOT)/LibTorch-Lite-Nightly/install/lib/libtorch.a" -force_load "$(PODS_ROOT)/LibTorch-Lite-Nightly/install/lib/libtorch_cpu.a"',
|
||||
'CLANG_CXX_LANGUAGE_STANDARD' => 'c++14',
|
||||
'CLANG_CXX_LANGUAGE_STANDARD' => 'c++17',
|
||||
'CLANG_CXX_LIBRARY' => 'libc++'
|
||||
}
|
||||
s.pod_target_xcconfig = {
|
||||
|
||||
@ -25,7 +25,7 @@ Pod::Spec.new do |s|
|
||||
s.user_target_xcconfig = {
|
||||
'HEADER_SEARCH_PATHS' => '$(inherited) "$(PODS_ROOT)/LibTorch/install/include/"',
|
||||
'OTHER_LDFLAGS' => '-force_load "$(PODS_ROOT)/LibTorch/install/lib/libtorch.a" -force_load "$(PODS_ROOT)/LibTorch/install/lib/libtorch_cpu.a"',
|
||||
'CLANG_CXX_LANGUAGE_STANDARD' => 'c++14',
|
||||
'CLANG_CXX_LANGUAGE_STANDARD' => 'c++17',
|
||||
'CLANG_CXX_LIBRARY' => 'libc++'
|
||||
}
|
||||
s.pod_target_xcconfig = {
|
||||
|
||||
@ -6,7 +6,10 @@ numpy
|
||||
psutil
|
||||
pyyaml
|
||||
requests
|
||||
setuptools
|
||||
# Setuptools>=74.0.0 stopped supporting direct use of private functions (_msvccompiler)
|
||||
# and consolidated all compiler logic into distutils, which the PyTorch build relies on, so an older
|
||||
# version is required until the PyTorch build is refactored to work with the latest setuptools.
|
||||
setuptools<=72.1.0
|
||||
types-dataclasses
|
||||
typing-extensions>=4.8.0
|
||||
sympy==1.12.1 ; python_version == "3.8"
|
||||
@ -17,7 +20,5 @@ jinja2
|
||||
fsspec
|
||||
lintrunner
|
||||
ninja
|
||||
# setuptools was removed from default python install
|
||||
setuptools ; python_version >= "3.12"
|
||||
packaging
|
||||
optree>=0.12.0 ; python_version <= "3.12"
|
||||
|
||||
@ -17,7 +17,9 @@ from torch.distributed._tensor import DTensor, init_device_mesh, Replicate, Shar
|
||||
from torch.distributed.checkpoint.state_dict import (
|
||||
get_model_state_dict,
|
||||
get_optimizer_state_dict,
|
||||
set_model_state_dict,
|
||||
set_optimizer_state_dict,
|
||||
StateDictOptions,
|
||||
)
|
||||
from torch.distributed.device_mesh import DeviceMesh
|
||||
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
|
||||
@ -335,6 +337,57 @@ class TestFullyShard2DTraining(FSDPTest):
|
||||
self.assertEqual(loss_no_cp2, loss_cp2)
|
||||
|
||||
|
||||
class TestFullyShard2DStateDict(DTensorTestBase):
|
||||
@property
|
||||
def backend(self):
|
||||
# need to specify gloo backend for testing cpu offload
|
||||
return "cpu:gloo,cuda:nccl"
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_fully_shard_tp_2d_set_full_state_dict(self):
|
||||
dummy_model = SimpleModel().cuda()
|
||||
mesh_2d = init_device_mesh(
|
||||
"cuda",
|
||||
(2, self.world_size // 2),
|
||||
mesh_dim_names=("dp", "tp"),
|
||||
)
|
||||
tp_mesh = mesh_2d["tp"]
|
||||
dp_mesh = mesh_2d["dp"]
|
||||
parallelize_plan = {
|
||||
"net1": ColwiseParallel(),
|
||||
"net2": RowwiseParallel(),
|
||||
"net3": ColwiseParallel(),
|
||||
}
|
||||
model = parallelize_module(dummy_model, tp_mesh, parallelize_plan)
|
||||
fully_shard(model, mesh=dp_mesh)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=0.01)
|
||||
model(model.get_input()).sum().backward()
|
||||
optim.step()
|
||||
# ref_msd, ref_osd are both the default sharded state dict
|
||||
ref_msd = copy.deepcopy(get_model_state_dict(model))
|
||||
ref_osd = copy.deepcopy(get_optimizer_state_dict(model, optimizers=optim))
|
||||
|
||||
options = StateDictOptions(
|
||||
full_state_dict=True, cpu_offload=True, broadcast_from_rank0=True
|
||||
)
|
||||
full_msd = get_model_state_dict(model, options=options)
|
||||
full_osd = get_optimizer_state_dict(model, optimizers=optim, options=options)
|
||||
# load full_msd and full_osd into model and optim.
|
||||
# this loads the slice of full tensor into each rank's local DTensor.
|
||||
set_model_state_dict(model, full_msd, options=options)
|
||||
set_optimizer_state_dict(
|
||||
model, optimizers=optim, optim_state_dict=full_osd, options=options
|
||||
)
|
||||
|
||||
# check after setting full state dict, the model and optim default sharded state dict
|
||||
# are the same as the initial default sharded state dict.
|
||||
new_msd = get_model_state_dict(model)
|
||||
new_osd = get_optimizer_state_dict(model, optimizers=optim)
|
||||
self.assertEqual(ref_msd, new_msd)
|
||||
self.assertEqual(ref_osd, new_osd)
|
||||
|
||||
|
||||
class Test2dFSDP1ParallelIntegration(DTensorTestBase):
|
||||
def init_model(self, device_type, model_parallel_size=2):
|
||||
torch.manual_seed(0)
|
||||
@ -544,6 +597,11 @@ class TestNew2dParallelTraining(DTensorTestBase):
|
||||
# TODO: update all state dict unit tests to use distributed.checkpoint.state_dict,
|
||||
# and consolidate all the state_dict test in test.distributed.checkpoint.
|
||||
class TestNew2dParallelStateDict(DTensorTestBase):
|
||||
@property
|
||||
def backend(self):
|
||||
# need to specify gloo backend for testing cpu offload
|
||||
return "cpu:gloo,cuda:nccl"
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_fsdp_2d_extension(self):
|
||||
|
||||
@ -29,9 +29,9 @@ from torch.nn.parallel.distributed import DistributedDataParallel as DDP
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcessTestCase,
|
||||
skip_if_lt_x_gpu,
|
||||
skip_if_rocm,
|
||||
skip_if_rocm_multiprocess,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.common_utils import run_tests, skipIfRocm
|
||||
from torch.testing._internal.distributed.fake_pg import FakeStore
|
||||
from torch.utils._triton import has_triton
|
||||
from torch.utils.checkpoint import checkpoint
|
||||
@ -217,21 +217,21 @@ class ReplicateTest(MultiProcessInductorTestCase):
|
||||
self._test_compile(use_gpu=False, no_sync=True)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@torch._inductor.config.patch(reorder_for_locality=False)
|
||||
def test_compile_gpu(self):
|
||||
self._test_compile(use_gpu=True, no_sync=False, checkpoint=False)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@torch._inductor.config.patch(reorder_for_locality=False)
|
||||
def test_compile_gpu_ac(self):
|
||||
self._test_compile(use_gpu=True, no_sync=False, checkpoint=True)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_compile_bf16(self):
|
||||
def setup(model, compiled_replicate_model, compiled_ddp_model) -> None:
|
||||
@ -245,7 +245,7 @@ class ReplicateTest(MultiProcessInductorTestCase):
|
||||
self._test_compile(use_gpu=True, no_sync=False, setup_func=setup)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_compile_fp16(self):
|
||||
def setup(model, compiled_replicate_model, compiled_ddp_model) -> None:
|
||||
@ -262,7 +262,7 @@ class ReplicateTest(MultiProcessInductorTestCase):
|
||||
)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_compile_backward_only(self):
|
||||
self._test_compile(use_gpu=True, no_sync=False, no_compile_forward=True)
|
||||
@ -386,7 +386,7 @@ class DDP_TP_Test(InductorTestCase):
|
||||
dist.destroy_process_group()
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skipIfRocm
|
||||
def test_ddp_tp(self):
|
||||
ref_model = Net()
|
||||
compiled_replicate_model = deepcopy(ref_model)
|
||||
|
||||
@ -14,7 +14,7 @@ from torch.testing._internal.common_distributed import (
|
||||
requires_gloo,
|
||||
requires_nccl,
|
||||
skip_if_lt_x_gpu,
|
||||
skip_if_rocm,
|
||||
skip_if_rocm_multiprocess,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
NO_MULTIPROCESSING_SPAWN,
|
||||
@ -112,7 +112,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
|
||||
BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16"
|
||||
)
|
||||
@skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_all_to_all_fp16(self):
|
||||
store = dist.FileStore(self.file_name, self.world_size)
|
||||
dist.init_process_group(
|
||||
@ -137,7 +137,7 @@ if BACKEND == "gloo" or BACKEND == "nccl":
|
||||
BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16"
|
||||
)
|
||||
@skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_all_to_all_bfp16(self):
|
||||
store = dist.FileStore(self.file_name, self.world_size)
|
||||
dist.init_process_group(
|
||||
|
||||
@ -86,18 +86,19 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
model, optim, copy_optim, dist_model, dist_optim = init_model_optim()
|
||||
|
||||
# Train 10 steps.
|
||||
_dist_optim = [dist_optim] if not isinstance(dist_optim, list) else dist_optim
|
||||
for i in range(10):
|
||||
optim.zero_grad()
|
||||
for d_optim in _dist_optim:
|
||||
d_optim.zero_grad()
|
||||
|
||||
batch = torch.rand(8, 100, device="cuda")
|
||||
model(batch).sum().backward()
|
||||
optim.step()
|
||||
dist_model(batch).sum().backward()
|
||||
if not isinstance(dist_optim, list):
|
||||
dist_optim.step()
|
||||
dist_optim.zero_grad()
|
||||
else:
|
||||
for _dist_optim in dist_optim:
|
||||
_dist_optim.zero_grad()
|
||||
optim.zero_grad()
|
||||
|
||||
optim.step()
|
||||
for d_optim in _dist_optim:
|
||||
d_optim.step()
|
||||
|
||||
# Get the state_dict, and compare the result
|
||||
msd = model.state_dict()
|
||||
@ -176,8 +177,8 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
device_mesh = init_device_mesh("cuda", (self.world_size,))
|
||||
|
||||
orig_model = CompositeParamModel(device=torch.device("cuda"))
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-3, foreach=True)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-3, foreach=True)
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-4, foreach=True)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-4, foreach=True)
|
||||
if wrapping:
|
||||
strategy = set(wrapping)
|
||||
else:
|
||||
@ -204,7 +205,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
|
||||
if compile_model:
|
||||
dist_model = torch.compile(dist_model)
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-3, foreach=True)
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-4, foreach=True)
|
||||
return orig_model, orig_optim, copy_optim, dist_model, dist_optim
|
||||
|
||||
self._test_save_load(init_model_optim)
|
||||
@ -218,7 +219,11 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
"use_composable": [True, False],
|
||||
"use_dtensor": [True, False],
|
||||
"wrapping": [(), (nn.Linear, UnitModule)],
|
||||
"optimizer_class": [torch.optim.Adam, torch.optim.AdamW],
|
||||
"optimizer_class": [
|
||||
torch.optim.Adam,
|
||||
torch.optim.AdamW,
|
||||
torch.optim.SGD,
|
||||
],
|
||||
},
|
||||
self._test_fsdp,
|
||||
)
|
||||
@ -248,10 +253,10 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
def init_model_optim():
|
||||
orig_model = CompositeParamModel(device=torch.device("cuda"))
|
||||
orig_optim = optimizer_class(
|
||||
orig_model.parameters(), lr=1e-3, foreach=foreach
|
||||
orig_model.parameters(), lr=1e-4, foreach=foreach
|
||||
)
|
||||
copy_optim = optimizer_class(
|
||||
orig_model.parameters(), lr=1e-3, foreach=foreach
|
||||
orig_model.parameters(), lr=1e-4, foreach=foreach
|
||||
)
|
||||
|
||||
dist_model = FSDP2(
|
||||
@ -262,7 +267,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
if compile_model:
|
||||
dist_model = torch.compile(dist_model)
|
||||
dist_optim = optimizer_class(
|
||||
dist_model.parameters(), lr=1e-3, foreach=foreach
|
||||
dist_model.parameters(), lr=1e-4, foreach=foreach
|
||||
)
|
||||
|
||||
return orig_model, orig_optim, copy_optim, dist_model, dist_optim
|
||||
@ -284,13 +289,13 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
def _test_ddp(self, use_composable: bool, optimizer_class: Type[Optimizer]) -> None:
|
||||
def init_model_optim():
|
||||
orig_model = CompositeParamModel(device=torch.device("cuda"))
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-3)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-3)
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-4)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-4)
|
||||
if use_composable:
|
||||
dist_model = replicate(copy.deepcopy(orig_model))
|
||||
else:
|
||||
dist_model = DDP(copy.deepcopy(orig_model))
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-3)
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-4)
|
||||
return orig_model, orig_optim, copy_optim, dist_model, dist_optim
|
||||
|
||||
self._test_save_load(init_model_optim)
|
||||
@ -301,7 +306,11 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
self.run_subtests(
|
||||
{
|
||||
"use_composable": [True, False],
|
||||
"optimizer_class": [torch.optim.Adam, torch.optim.AdamW],
|
||||
"optimizer_class": [
|
||||
torch.optim.Adam,
|
||||
torch.optim.AdamW,
|
||||
torch.optim.SGD,
|
||||
],
|
||||
},
|
||||
self._test_ddp,
|
||||
)
|
||||
@ -320,8 +329,8 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
orig_model.u1.parameters(), orig_model.u2.parameters()
|
||||
):
|
||||
param.requires_grad = False
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-3)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-3)
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-4)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-4)
|
||||
dist_model = copy.deepcopy(orig_model)
|
||||
if use_composable:
|
||||
replicate(dist_model.l)
|
||||
@ -336,13 +345,13 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
)
|
||||
if optim_in_backward:
|
||||
_apply_optimizer_in_backward(
|
||||
optimizer_class, dist_model.parameters(), {"lr": 1e-3}
|
||||
optimizer_class, dist_model.parameters(), {"lr": 1e-4}
|
||||
)
|
||||
dist_optim = [
|
||||
p._in_backward_optimizers[0] for p in dist_model.parameters()
|
||||
]
|
||||
else:
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-3)
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-4)
|
||||
return orig_model, orig_optim, copy_optim, dist_model, dist_optim
|
||||
|
||||
self._test_save_load(init_model_optim, test_frozen)
|
||||
@ -395,10 +404,10 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
def _test_single_gpu(self, optimizer_class: Type[Optimizer]) -> None:
|
||||
def init_model_optim():
|
||||
orig_model = CompositeParamModel(device=torch.device("cuda"))
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-3)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-3)
|
||||
orig_optim = optimizer_class(orig_model.parameters(), lr=1e-4)
|
||||
copy_optim = optimizer_class(orig_model.parameters(), lr=1e-4)
|
||||
model_copy = copy.deepcopy(orig_model)
|
||||
optim_copy = optimizer_class(model_copy.parameters(), lr=1e-3)
|
||||
optim_copy = optimizer_class(model_copy.parameters(), lr=1e-4)
|
||||
return orig_model, orig_optim, copy_optim, model_copy, optim_copy
|
||||
|
||||
self._test_save_load(init_model_optim)
|
||||
@ -445,7 +454,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
device_mesh=device_mesh,
|
||||
)
|
||||
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-3)
|
||||
dist_optim = optimizer_class(dist_model.parameters(), lr=1e-4)
|
||||
|
||||
mst, ost = get_state_dict(
|
||||
dist_model,
|
||||
@ -887,10 +896,10 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
|
||||
def init_model_optim():
|
||||
device_mesh = init_device_mesh("cuda", (self.world_size,))
|
||||
orig_model = TiedEmbeddingModel(10000, 300).to(torch.device("cuda"))
|
||||
orig_optim = torch.optim.AdamW(orig_model.parameters(), lr=1e-3)
|
||||
copy_optim = torch.optim.AdamW(orig_model.parameters(), lr=1e-3)
|
||||
orig_optim = torch.optim.AdamW(orig_model.parameters(), lr=1e-4)
|
||||
copy_optim = torch.optim.AdamW(orig_model.parameters(), lr=1e-4)
|
||||
dist_model = FSDP(copy.deepcopy(orig_model), device_mesh=device_mesh)
|
||||
dist_optim = torch.optim.AdamW(dist_model.parameters(), lr=1e-3)
|
||||
dist_optim = torch.optim.AdamW(dist_model.parameters(), lr=1e-4)
|
||||
return orig_model, orig_optim, copy_optim, dist_model, dist_optim
|
||||
|
||||
self._test_save_load(init_model_optim)
|
||||
@ -958,7 +967,7 @@ class TestNoComm(MultiProcessTestCase):
|
||||
@skip_if_lt_x_gpu(1)
|
||||
def test_no_dist(self) -> None:
|
||||
model = CompositeParamModel(device=torch.device("cuda"))
|
||||
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
|
||||
optim = torch.optim.AdamW(model.parameters(), lr=1e-4)
|
||||
|
||||
self.assertFalse(dist.is_initialized())
|
||||
msd = get_model_state_dict(
|
||||
|
||||
@ -9,10 +9,16 @@ from torch.distributed._state_dict_utils import (
|
||||
_check_state_dict_similarity,
|
||||
_copy_state_dict,
|
||||
_create_cpu_state_dict,
|
||||
_distribute_tensors,
|
||||
_gather_state_dict,
|
||||
_offload_state_dict_to_cpu,
|
||||
)
|
||||
from torch.distributed._tensor import DTensor, Shard
|
||||
from torch.distributed._tensor import (
|
||||
distribute_tensor,
|
||||
DTensor,
|
||||
init_device_mesh,
|
||||
Shard,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
DTensorTestBase,
|
||||
@ -170,6 +176,37 @@ class TestStateDictUtils(DTensorTestBase):
|
||||
)
|
||||
_verify(cpu_state_dict)
|
||||
|
||||
@with_comms
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_state_dict_util_distribute_tensors(self):
|
||||
even_tensor = torch.randn(self.world_size, 2)
|
||||
uneven_tensor = torch.randn(1, 2)
|
||||
|
||||
mesh = init_device_mesh("cuda", mesh_shape=(self.world_size,))
|
||||
even_dtensor = distribute_tensor(
|
||||
torch.randn(self.world_size, 2), mesh, [Shard(0)]
|
||||
)
|
||||
uneven_dtensor = distribute_tensor(torch.randn(1, 2), mesh, [Shard(0)])
|
||||
|
||||
# the dtensor and tensor are different before _distribute_tensors is called.
|
||||
local_state_dict = {
|
||||
"even": [even_dtensor, even_tensor],
|
||||
"uneven": [uneven_dtensor, uneven_tensor],
|
||||
}
|
||||
ref_local_state_dict = copy.deepcopy(local_state_dict)
|
||||
keys = ["even", "uneven"]
|
||||
|
||||
_distribute_tensors(local_state_dict, keys, self.device_type)
|
||||
for local_v, ref_v in zip(
|
||||
local_state_dict.values(), ref_local_state_dict.values()
|
||||
):
|
||||
self.assertEqual(local_v.size(), ref_v[0].size())
|
||||
self.assertEqual(local_v.stride(), ref_v[0].stride())
|
||||
self.assertNotEqual(
|
||||
local_v_full_tensor := local_v.full_tensor(), ref_v[0].full_tensor()
|
||||
)
|
||||
self.assertEqual(local_v_full_tensor, ref_v[1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -403,7 +403,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
|
||||
)
|
||||
|
||||
@common_distributed.skip_if_no_gpu
|
||||
@common_distributed.skip_if_rocm
|
||||
@common_distributed.skip_if_rocm_multiprocess
|
||||
def test_step(self):
|
||||
"""Check that ZeroRedundancyOptimizer properly exposes the ``step()``
|
||||
interface."""
|
||||
@ -443,7 +443,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
|
||||
self.assertEqual(m.bias, m_zero.bias)
|
||||
|
||||
@common_distributed.skip_if_no_gpu
|
||||
@common_distributed.skip_if_rocm
|
||||
@common_distributed.skip_if_rocm_multiprocess
|
||||
def test_step_with_closure(self):
|
||||
"""Check that ZeroRedundancyOptimizer properly exposes the
|
||||
``step(closure)`` interface."""
|
||||
@ -663,7 +663,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
|
||||
torch.testing.assert_close(layer1.bias, layer3.bias)
|
||||
|
||||
@common_distributed.skip_if_no_gpu
|
||||
@common_distributed.skip_if_rocm
|
||||
@common_distributed.skip_if_rocm_multiprocess
|
||||
def test_collect_shards(self):
|
||||
"""Check the state consolidation mechanism and the state dict exposed
|
||||
by ZeroRedundancyOptimizer."""
|
||||
@ -1383,7 +1383,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
|
||||
@common_distributed.skip_if_win32()
|
||||
@common_distributed.requires_nccl()
|
||||
@common_distributed.skip_if_no_gpu
|
||||
@common_distributed.skip_if_rocm
|
||||
@common_distributed.skip_if_rocm_multiprocess
|
||||
@parametrize(
|
||||
"use_gpu",
|
||||
[True],
|
||||
|
||||
@ -48,7 +48,7 @@ from torch.testing._internal.common_distributed import (
|
||||
requires_nccl,
|
||||
requires_nccl_version,
|
||||
skip_if_lt_x_gpu,
|
||||
skip_if_rocm,
|
||||
skip_if_rocm_multiprocess,
|
||||
TEST_SKIPS,
|
||||
with_dist_debug_levels,
|
||||
with_nccl_blocking_wait,
|
||||
@ -347,8 +347,18 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
|
||||
not (TEST_MULTIGPU and CUDA_12_AND_ABOVE),
|
||||
"NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA",
|
||||
)
|
||||
@parametrize("type", [torch.float16, torch.float32, torch.float64, torch.bfloat16])
|
||||
@skip_if_rocm
|
||||
@parametrize(
|
||||
"type",
|
||||
[
|
||||
torch.float16,
|
||||
torch.float32,
|
||||
torch.float64,
|
||||
torch.bfloat16,
|
||||
torch.float8_e4m3fn,
|
||||
torch.float8_e5m2,
|
||||
],
|
||||
)
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_nan_assert(self, type):
|
||||
# Expecting a device-side error when NaN is detected
|
||||
os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
|
||||
@ -1690,7 +1700,7 @@ class DistributedDataParallelTest(
|
||||
|
||||
@requires_nccl()
|
||||
@skip_if_lt_x_gpu(4)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_grad_layout_2devicemodule(self):
|
||||
int_devices = gpus_for_rank(self.world_size)[self.rank][:2]
|
||||
dev0 = torch.device("cuda:" + str(int_devices[0]))
|
||||
@ -2462,7 +2472,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(3)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_but_pass_in_sandcastle("Test does not pass when run locally")
|
||||
def test_nccl_errors_nonblocking(self):
|
||||
# Note: we unset and restore TORCH_NCCL_ASYNC_ERROR_HANDLING for this test
|
||||
@ -2524,7 +2534,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(3)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_nccl_errors_blocking_clean_exit(self):
|
||||
self._test_nccl_errors_blocking(lambda: sys.exit(0))
|
||||
|
||||
@ -2532,7 +2542,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(3)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_nccl_errors_blocking_nonzero_exit(self):
|
||||
self._test_nccl_errors_blocking(lambda: sys.exit(1))
|
||||
|
||||
@ -2540,7 +2550,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(3)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
@skip_but_pass_in_sandcastle(
|
||||
"Frequently times out see https://github.com/pytorch/pytorch/issues/58920"
|
||||
)
|
||||
@ -2551,7 +2561,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(3)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_nccl_errors_blocking_sigkill(self):
|
||||
self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL))
|
||||
|
||||
@ -2559,7 +2569,7 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(3)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_nccl_errors_blocking_sigterm(self):
|
||||
self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM))
|
||||
|
||||
@ -2775,7 +2785,7 @@ class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
|
||||
|
||||
@requires_nccl()
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_intra_node_comm_all_reduce(self):
|
||||
from torch._C._distributed_c10d import _get_intra_node_comm_usage_counter
|
||||
from torch.testing._internal.common_cuda import SM80OrLater
|
||||
@ -4310,7 +4320,7 @@ class NcclErrorDumpTest(NCCLTraceTestBase):
|
||||
@requires_nccl()
|
||||
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@skip_if_rocm
|
||||
@skip_if_rocm_multiprocess
|
||||
def test_nccl_errors_dump(self):
|
||||
os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
|
||||
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "1000"
|
||||
|
||||
@ -50,7 +50,7 @@ def requires_cuda_p2p_access():
|
||||
def requires_multicast_support():
|
||||
has_multicast_support = (
|
||||
torch.cuda.is_available()
|
||||
and _SymmetricMemory.has_multicast_support(DeviceType.CUDA)
|
||||
and _SymmetricMemory.has_multicast_support(DeviceType.CUDA, 0)
|
||||
)
|
||||
return skip_but_pass_in_sandcastle_if(
|
||||
not has_multicast_support,
|
||||
|
||||
@ -439,29 +439,6 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase):
        self.assertEqual(result, Foo.apply(x))
        self.assertEqual(cnt.frame_count, 1)

    def test_fwd_no_grad(self):
        # autograd.Function.forward should be traced and called under no_grad mode.
        # torch.exp with out=... arguments don't support automatic differentiation,
        # so can't be traced/called under grad mode (throwing RuntimeError),
        # therefore this unit test ensures fwd is under no_grad mode.
        class Foo(torch.autograd.Function):
            @staticmethod
            def forward(ctx, inputs):
                torch.exp(inputs, out=inputs)
                return inputs

            @staticmethod
            def backward(ctx, grad_output):
                return None

        @torch.compile(backend="eager", fullgraph=True)
        def f(x):
            return Foo.apply(x)

        x1 = torch.randn(2, 3, requires_grad=True)
        x2 = x1.clone()
        self.assertEqual(f(x1), Foo.apply(x2))

    def test_amp_custom_fwd_bwd(self):
        torch._dynamo.utils.counters.clear()
        cnt = torch._dynamo.testing.CompileCounter()

@ -570,13 +547,9 @@ class GraphModule(torch.nn.Module):

    class fwd_body_0(torch.nn.Module):
        def forward(self, ctx, x: "f32[]", z: "f32[]", l_weird_b: "f32[]", l_weird_c: "f32[]"):
            _set_grad_enabled = torch._C._set_grad_enabled(False); _set_grad_enabled = None

            mul: "f32[]" = l_weird_b * l_weird_c
            clone: "f32[]" = x.clone(); x = None
            mul_1: "f32[]" = mul * clone; mul = clone = None

            _set_grad_enabled_1 = torch._C._set_grad_enabled(True); _set_grad_enabled_1 = None
            return (mul_1, [l_weird_b, l_weird_c])

    class bwd_body_0(torch.nn.Module):

@ -1140,13 +1113,9 @@ class GraphModule(torch.nn.Module):

    class fwd_body_0(torch.nn.Module):
        def forward(self, ctx, x: "f32[]", y: "f32[]"):
            _set_grad_enabled = torch._C._set_grad_enabled(False); _set_grad_enabled = None

            out1: "f32[]" = x.sin(); x = None

            out2: "f32[]" = y * 2; y = None

            _set_grad_enabled_1 = torch._C._set_grad_enabled(True); _set_grad_enabled_1 = None
            return ((out1, out2), [])

    class bwd_body_0(torch.nn.Module):
@ -24,6 +24,7 @@ from torch.testing._internal.common_utils import (
    IS_WINDOWS,
    parametrize,
    run_tests,
    skipIfRocm,
    skipIfTorchDynamo,
    TEST_WITH_TORCHDYNAMO,
    TestCase,

@ -1216,7 +1217,7 @@ def forward(self, pred_1, x_1):
        unittest.skip,
        lambda params: (
            params["combine_mode"] == "pointwise"
            and params["device"] == torch.device("cpu")
            and (params["device"] == torch.device("cpu") or torch.version.hip)
        ),
    )
    def test_pointwise_associative_scan_simple(self, reverse, combine_mode, device):

@ -1262,7 +1263,7 @@ def forward(self, pred_1, x_1):
        unittest.skip,
        lambda params: (
            params["combine_mode"] == "pointwise"
            and params["device"] == torch.device("cpu")
            and (params["device"] == torch.device("cpu") or torch.version.hip)
        ),
    )
    def test_pointwise_associative_scan_dim(self, reverse, combine_mode, device):

@ -1304,7 +1305,7 @@ def forward(self, pred_1, x_1):
        unittest.skip,
        lambda params: (
            params["combine_mode"] == "pointwise"
            and params["device"] == torch.device("cpu")
            and (params["device"] == torch.device("cpu") or torch.version.hip)
        ),
    )
    def test_pointwise_associative_scan_compile(

@ -1353,6 +1354,7 @@ def forward(self, pred_1, x_1):
        )
        self.assertEqual(cumsum1, cumsum_exp)

    @skipIfRocm(msg="Unsupported on ROCM yet")
    @unittest.skipIf(not SM70OrLater, "triton")
    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA.")
    @parametrize("reverse", [False, True])

@ -1364,7 +1366,7 @@ def forward(self, pred_1, x_1):
        unittest.skip,
        lambda params: (
            params["combine_mode"] == "pointwise"
            and params["device"] == torch.device("cpu")
            and (params["device"] == torch.device("cpu") or torch.version.hip)
        ),
    )
    def test_pointwise_associative_scan_binary_operator(

@ -1405,6 +1407,7 @@ def forward(self, pred_1, x_1):
        )
        self.assertEqual([r.device.type for r in result2], [device.type] * len(result2))

    @skipIfRocm(msg="Unsupported on ROCM yet")
    @unittest.skipIf(not SM70OrLater, "triton")
    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA.")
    @parametrize("reverse", [False, True])

@ -1416,7 +1419,7 @@ def forward(self, pred_1, x_1):
        unittest.skip,
        lambda params: (
            params["combine_mode"] == "pointwise"
            and params["device"] == torch.device("cpu")
            and (params["device"] == torch.device("cpu") or torch.version.hip)
        ),
    )
    def test_pointwise_associative_scan_tuple(self, reverse, combine_mode, device):

@ -1433,6 +1436,7 @@ def forward(self, pred_1, x_1):
        expected_result = _fake_associative_scan(fct, inp, 0, reverse=reverse)
        self.assertEqual(result1, expected_result)

    @skipIfRocm(msg="Unsupported on ROCM yet")
    @unittest.skipIf(not SM70OrLater, "triton")
    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA.")
    @parametrize("reverse", [False, True])

@ -1444,7 +1448,7 @@ def forward(self, pred_1, x_1):
        unittest.skip,
        lambda params: (
            params["combine_mode"] == "pointwise"
            and params["device"] == torch.device("cpu")
            and (params["device"] == torch.device("cpu") or torch.version.hip)
        ),
    )
    def test_pointwise_associative_scan_complex_pytree(
@ -3422,7 +3422,6 @@ class CPUReproTests(TestCase):
            dtype if dtype else torch.float32,
        )

    @config.patch("cpp.enable_tiling_heuristics", False)
    def test_group_norm_vec(self):
        class M(torch.nn.Module):
            def __init__(self) -> None:

@ -469,6 +469,77 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm):
        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 2)
        self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 2)

    @inductor_config.patch({"freezing": True})
    @patches
    @torch.no_grad
    @unittest.skipIf(not TEST_MKL, "Test requires MKL")
    @parametrize("batch_size", (8,))
    @parametrize("in_features", (3,))
    @parametrize("linear_in_features", (384,))
    @parametrize("out_features", (196,))
    @parametrize("bias", (True,))
    @dtypes(torch.float)
    def test_linear_with_input_of_flexible_layout(
        self, batch_size, in_features, linear_in_features, out_features, bias, dtype
    ):
        # Reproducer from the resmlp_12_224 model in timm
        flatten_BS = int(batch_size * linear_in_features)

        class M(torch.nn.Module):
            def __init__(self, bias):
                super().__init__()
                self.conv = torch.nn.Conv2d(
                    in_features,
                    linear_in_features,
                    kernel_size=16,
                    padding=0,
                    stride=16,
                    dilation=1,
                    groups=1,
                )
                self._frozen_param151 = torch.randn(1, 1, linear_in_features)
                self._frozen_param3 = torch.randn(1, 1, linear_in_features)
                self._frozen_param2 = torch.randn(linear_in_features)

                self.linear = torch.nn.Linear(out_features, out_features, bias)

            def forward(self, arg150_1):
                _convolution_pointwise_default = self.conv(arg150_1)
                view_73 = torch.ops.aten.reshape.default(
                    _convolution_pointwise_default,
                    [batch_size, linear_in_features, out_features],
                )
                _convolution_pointwise_default = None
                permute_62 = torch.ops.aten.permute.default(view_73, [0, 2, 1])
                view_73 = None
                mul_111 = torch.ops.aten.mul.Tensor(self._frozen_param151, permute_62)
                add_73 = torch.ops.aten.add.Tensor(self._frozen_param3, mul_111)
                permute_63 = torch.ops.aten.permute.default(add_73, [0, 2, 1])
                add_73 = None
                view_74 = torch.ops.aten.reshape.default(
                    permute_63, [flatten_BS, out_features]
                )
                permute_63 = None
                _mkl_linear_36 = self.linear(view_74)
                view_75 = torch.ops.aten.reshape.default(
                    _mkl_linear_36, [batch_size, linear_in_features, out_features]
                )
                _mkl_linear_36 = None
                permute_65 = torch.ops.aten.permute.default(view_75, [0, 2, 1])
                view_75 = None
                mul_112 = torch.ops.aten.mul.Tensor(self._frozen_param2, permute_65)
                _frozen_param2 = permute_65 = None
                add_74 = torch.ops.aten.add.Tensor(permute_62, mul_112)
                permute_62 = mul_112 = None
                return add_74

        v = torch.randn(batch_size, in_features, 224, 224).to(dtype=dtype)
        mod = M(bias=bias).to(dtype=dtype).eval()
        with verify(dtype) as (atol, rtol):
            self.common(mod, (v,), atol=atol, rtol=rtol)
        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
        self.assertEqual(counters["inductor"]["cpp_epilogue_fusion_counter"], 1)

    @inductor_config.patch({"freezing": True})
    @patches
    @torch.no_grad
@ -29,6 +29,7 @@ from torch.nn.attention.flex_attention import (
from torch.testing import FileCheck
from torch.testing._internal import common_utils
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
from torch.testing._internal.common_utils import skipIfRocm, TEST_WITH_ROCM
from torch.utils._triton import has_triton

@ -301,6 +302,8 @@ class TestFlexAttention(InductorTestCase):
        V_D: int = D,
        block_mask: Optional[BlockMask] = None,
    ):
        if TEST_WITH_ROCM and Q_H != KV_H:
            self.skipTest("enable_gqa=True is unsupported on ROCM, for now")
        q = torch.randn(
            (Q_B, Q_H, Q_S, Q_D), dtype=dtype, device="cuda", requires_grad=True
        )

@ -1264,6 +1267,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):

        self.run_test_with_call(attention)

    @skipIfRocm
    @supported_platform
    def test_GQA_causal_mask(self):
        def mask_mod(b, h, q, kv):

@ -1565,6 +1569,52 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):
        out = func(query, key, value, block_mask=block_mask)
        out.sum().backward()

    @supported_platform
    @common_utils.parametrize("mode", ["eager", "inductor"])
    @common_utils.parametrize(
        "permute_order",
        [
            (0, 1, 2, 3),  # Default order
            (1, 0, 2, 3),  # Reverse order
            (0, 2, 1, 3),  # Mixed order
            (2, 0, 1, 3),  # Another mixed order
        ],
    )
    @common_utils.parametrize("shape", [(2, 1, 128, 16), (4, 2, 64, 16)])
    def test_flex_attention_stride_ordering(self, mode, permute_order, shape):
        from torch._inductor.ir import get_stride_order

        # Setup
        make_tensor = functools.partial(
            torch.randn,
            shape,
            device="cuda",
            dtype=torch.float32,
            requires_grad=True,
        )

        # Create and permute tensors
        query, key, value = make_tensor(), make_tensor(), make_tensor()
        query = query.permute(permute_order)
        key = key.permute(permute_order)
        value = value.permute(permute_order)

        if mode == "inductor":
            func = torch.compile(flex_attention, backend=mode, fullgraph=True)
        else:
            func = flex_attention

        out = func(query, key, value)

        out_stride_order = get_stride_order(out.stride())
        query_stride_order = get_stride_order(query.stride())

        self.assertEqual(
            out_stride_order,
            query_stride_order,
            f"Stride order mismatch: out {out_stride_order}, query {query_stride_order}",
        )

    @supported_platform
    @common_utils.parametrize("compile", [True, False])
    def test_fully_masked_out_rows_0_check(self, compile: bool):

@ -21,6 +21,7 @@ from torch.nn.attention.flex_attention import (
from torch.testing import FileCheck
from torch.testing._internal import common_utils
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
from torch.testing._internal.common_utils import skipIfRocm, TEST_WITH_ROCM
from torch.utils._triton import has_triton

@ -270,6 +271,8 @@ class TestFlexDecoding(InductorTestCase):
            score_mod is not None or block_mask is not None
        ), "Must provide score_mod or block_mask"
        assert Q_H % KV_H == 0
        if TEST_WITH_ROCM and Q_H != KV_H:
            self.skipTest("enable_gqa=True is unsupported on ROCM, for now")
        q = torch.randn(
            (Q_B, Q_H, Q_S, Q_D),
            dtype=dtype,

@ -784,6 +787,7 @@ def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1):

        self.run_test(bias_mod)

    @skipIfRocm
    @supported_platform
    def test_fully_masked_out_rows_0_check_gqa(self):
        # Ensure fully masked out rows won't cause NaNs.
@ -17,7 +17,7 @@ from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.test_operators import realize
from torch._inductor.utils import sympy_index_symbol
from torch._inductor.virtualized import ops, V
from torch.testing._internal.common_cuda import SM90OrLater
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
from torch.testing._internal.inductor_utils import HAS_CUDA
from torch.utils._pytree import tree_map
from torch.utils._sympy.functions import ModularIndexing

@ -371,7 +371,7 @@ class LoopOrderingTest(TestCase):
        self.do_acc_test(f, x)
        self.assertEqual(1, metrics.generated_kernel_count)

    @unittest.skipIf(not SM90OrLater, "FP8 requires H100+")
    @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "FP8 requires H100+ and MI300+")
    def test_fp8_cast_and_t(self):
        """
        This test repros the not able to fuses issue in

@ -4038,6 +4038,10 @@ class CommonTemplate:
            # Greatest relative difference: 0.06512477175897748 at index (0, 4, 11, 9) (up to 0.001 allowed)
            atol=6e-5,
            rtol=0.001,
            # Make sure we compute also with fp16 in the reference. Otherwise,
            # the reference will compute with fp32 and cast back to fp16, which
            # causes numeric differences beyond tolerance.
            reference_in_float=False if torch.version.hip else True,
        )

    def test_convolution2(self):

@ -4068,6 +4072,10 @@ class CommonTemplate:
            (torch.randn([2, 5, 16, 16]),),
            atol=6e-5,
            rtol=0.001,
            # Make sure we compute also with fp16 in the reference. Otherwise,
            # the reference will compute with fp32 and cast back to fp16, which
            # causes numeric differences beyond tolerance.
            reference_in_float=False if torch.version.hip else True,
        )

    @skip_if_gpu_halide
@ -17,6 +17,7 @@ from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize as parametrize_test,
    run_tests,
    skipIfRocm,
    TEST_NUMPY,
    TEST_WITH_CROSSREF,
)

@ -745,6 +746,7 @@ class TestMultiheadAttentionNN(NNTestCase):


class TestMultiheadAttentionNNDeviceType(NNTestCase):
    @skipIfRocm(msg="To investigate: yields NaN")
    def test_multihead_self_attn_two_masks_fast_path(self, device):
        """
        Multihead self-attention should give the same result on the fast path (BetterTransformer) as on the slow path

@ -196,6 +196,15 @@ class TestExportAPIDynamo(common_utils.TestCase):
            TestRefineDynamicShapeModel(), inps, dynamic_shapes=dynamic_shapes
        )

    def test_zero_output_aten_node(self):
        class Model(torch.nn.Module):
            def forward(self, x):
                torch.ops.aten._assert_async.msg(torch.tensor(True), "assertion failed")
                return x + x

        input = torch.randn(2)
        self.assert_export(Model(), (input))


if __name__ == "__main__":
    common_utils.run_tests()

test/onnx/exporter/test_core.py
@ -0,0 +1,75 @@
# Owner(s): ["module: onnx"]
"""Unit tests for the _core module."""

from __future__ import annotations

import numpy as np

import torch
from torch.onnx._internal.exporter import _core
from torch.testing._internal import common_utils


@common_utils.instantiate_parametrized_tests
class TorchTensorTest(common_utils.TestCase):
    @common_utils.parametrize(
        "dtype, np_dtype",
        [
            (torch.bfloat16, np.uint16),
            (torch.bool, np.bool_),
            (torch.complex128, np.complex128),
            (torch.complex64, np.complex64),
            (torch.float16, np.float16),
            (torch.float32, np.float32),
            (torch.float64, np.float64),
            (torch.float8_e4m3fn, np.uint8),
            (torch.float8_e4m3fnuz, np.uint8),
            (torch.float8_e5m2, np.uint8),
            (torch.float8_e5m2fnuz, np.uint8),
            (torch.int16, np.int16),
            (torch.int32, np.int32),
            (torch.int64, np.int64),
            (torch.int8, np.int8),
            (torch.uint16, np.uint16),
            (torch.uint32, np.uint32),
            (torch.uint64, np.uint64),
            (torch.uint8, np.uint8),
        ],
    )
    def test_numpy_returns_correct_dtype(self, dtype: torch.dtype, np_dtype):
        tensor = _core.TorchTensor(torch.tensor([1], dtype=dtype))
        self.assertEqual(tensor.numpy().dtype, np_dtype)
        self.assertEqual(tensor.__array__().dtype, np_dtype)
        self.assertEqual(np.array(tensor).dtype, np_dtype)

    @common_utils.parametrize(
        "dtype",
        [
            (torch.bfloat16),
            (torch.bool),
            (torch.complex128),
            (torch.complex64),
            (torch.float16),
            (torch.float32),
            (torch.float64),
            (torch.float8_e4m3fn),
            (torch.float8_e4m3fnuz),
            (torch.float8_e5m2),
            (torch.float8_e5m2fnuz),
            (torch.int16),
            (torch.int32),
            (torch.int64),
            (torch.int8),
            (torch.uint16),
            (torch.uint32),
            (torch.uint64),
            (torch.uint8),
        ],
    )
    def test_tobytes(self, dtype: torch.dtype):
        tensor = _core.TorchTensor(torch.tensor([1], dtype=dtype))
        self.assertEqual(tensor.tobytes(), tensor.numpy().tobytes())


if __name__ == "__main__":
    common_utils.run_tests()

test/onnx/exporter/test_tensors.py
@ -0,0 +1,22 @@
# Owner(s): ["module: onnx"]
"""Unit tests for the _tensors module."""

from __future__ import annotations

import onnxscript

from torch.onnx._internal.exporter import _tensors
from torch.testing._internal import common_utils


class SymbolicTensorTest(common_utils.TestCase):
    def test_it_is_hashable(self):
        tensor = _tensors.SymbolicTensor(
            opset=onnxscript.values.Opset(domain="test", version=1)
        )
        self.assertEqual(hash(tensor), hash(tensor))
        self.assertIn(tensor, {tensor})


if __name__ == "__main__":
    common_utils.run_tests()
Some files were not shown because too many files have changed in this diff.